In [4]:
import json
import os
import subprocess
from datetime import datetime, timedelta

import pandas as pd
import preprocessor as p

In [2]:
def get_location_dataframe(tweet_df, column_name):
    """Expand a column of location dicts into flat, prefixed columns.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Frame that contains ``column_name``. Each cell of that column is
        expected to be a dict that may carry the keys ``country_code``,
        ``state``, ``county`` and ``city``.
    column_name : str
        Name of the dict-valued column; it is also used as the prefix of the
        generated column names.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns ``<column_name>_country_code``,
        ``<column_name>_state``, ``<column_name>_county`` and
        ``<column_name>_city``, indexed like ``tweet_df``. A missing key, or a
        cell that is not a dict, yields ``None``.
    """
    location_fields = ('country_code', 'state', 'county', 'city')

    # Read from the frame that was passed in, not from a notebook-level global,
    # so the function works for any frame and on a fresh kernel.
    source_column = tweet_df[column_name]

    location_df = pd.DataFrame(index=tweet_df.index)
    for field in location_fields:
        location_df[column_name + '_' + field] = source_column.apply(
            # `key=field` binds the current field to this lambda.
            lambda location, key=field: location.get(key) if isinstance(location, dict) else None
        )

    return location_df
    

In [6]:
# One directory per stage of the tweet-processing pipeline.
extracted_tweets_files_dir = 'extracted_tweets_files'
filtered_tweets_files_dir = 'filtered_tweets_files'
hydrated_tweets_files_dir = 'hydrated_tweets_files'
# NOTE(review): the merged_* names are used by the cell that exports
# merged_tweets_df but were never defined; the values below follow the naming
# pattern of the other stages -- confirm against the existing output folder.
merged_tweets_files_dir = 'merged_tweets_files'
cleaned_tweets_files_dir = 'cleaned_tweets_files'

# File names are 'en_geo_<date>.<extension>'.
tweet_file_name_template = 'en_geo_{}.{}'
extracted_tweets_file_type = 'json'
filtered_tweets_file_type = 'txt'
merged_tweets_file_type = 'csv'
cleaned_tweets_file_type = 'csv'

# The day being processed; defined once so every path stays in sync.
tweets_date = '2020-02-01'

extracted_tweet_file_name = tweet_file_name_template.format(tweets_date, extracted_tweets_file_type)
extracted_tweets_file = os.path.join(extracted_tweets_files_dir, extracted_tweet_file_name)

filtered_tweet_file_name = tweet_file_name_template.format(tweets_date, filtered_tweets_file_type)
filtered_tweets_file = os.path.join(filtered_tweets_files_dir, filtered_tweet_file_name)

# The hydrated file reuses the filtered file type, so it is written as '.txt'.
hydrated_tweet_file_name = tweet_file_name_template.format(tweets_date, filtered_tweets_file_type)
hydrated_tweets_file = os.path.join(hydrated_tweets_files_dir, hydrated_tweet_file_name)

merged_tweet_file_name = tweet_file_name_template.format(tweets_date, merged_tweets_file_type)
merged_tweets_file = os.path.join(merged_tweets_files_dir, merged_tweet_file_name)

cleaned_tweet_file_name = tweet_file_name_template.format(tweets_date, cleaned_tweets_file_type)
cleaned_tweets_file = os.path.join(cleaned_tweets_files_dir, cleaned_tweet_file_name)

# Echo the same four paths, in the same order, as the original cell.
print(extracted_tweets_file)
print(filtered_tweets_file)
print(hydrated_tweets_file)
print(cleaned_tweets_file)

extracted_tweets_files\en_geo_2020-02-01.json
filtered_tweets_files\en_geo_2020-02-01.txt
hydrated_tweets_files\en_geo_2020-02-01.txt
cleaned_tweets_files\en_geo_2020-02-01.csv


In [21]:
# Load the extracted tweets (one JSON record per line) into a DataFrame.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open(extracted_tweets_file, encoding='utf-8') as f:
    tweets_df = pd.read_json(f, lines=True)

In [22]:
# Flatten each dict-valued location column into its own frame of
# '<column>_<field>' columns.
user_location_df, geo_df, place_df = (
    get_location_dataframe(tweets_df, location_column)
    for location_column in ('user_location', 'geo', 'place')
)

# Place the flattened columns side by side with the original tweet columns.
refined_tweets_df = pd.concat(
    [tweets_df, user_location_df, geo_df, place_df],
    axis=1,
)

In [23]:
# Each 'tweet_locations' cell is a list of location dicts; for every field,
# collect that field's value from each dict into a parallel list
# (None where a dict has no such key).
for location_field in ('country_code', 'state', 'county', 'city'):
    refined_tweets_df['tweet_locations_' + location_field] = refined_tweets_df['tweet_locations'].apply(
        lambda locations, key=location_field: [location.get(key) for location in locations]
    )

In [24]:
# 1 when the user's own location country code is 'in', otherwise 0.
refined_tweets_df['is_user_india_based'] = (
    refined_tweets_df['user_location_country_code']
    .apply(lambda country_code: int(country_code == 'in'))
)

# 1 when any location attached to the tweet has country code 'in', otherwise 0.
refined_tweets_df['is_tweet_locations_inc_india'] = (
    refined_tweets_df['tweet_locations_country_code']
    .apply(lambda country_codes: int('in' in country_codes))
)

In [25]:
# Mark the original dict/list-valued location columns as raw by giving them a
# '_raw' suffix.
raw_column_names = {
    column: column + '_raw'
    for column in ('user_location', 'geo', 'place', 'tweet_locations')
}
refined_tweets_df.rename(columns=raw_column_names, inplace=True)

In [26]:
# Keep only the tweets whose locations include India.
india_tweets_df = refined_tweets_df.loc[
    refined_tweets_df['is_tweet_locations_inc_india'].eq(1)
]

In [27]:
# Preview the first rows of the tweets whose locations include India.
india_tweets_df.head()

Unnamed: 0,created_at,geo_raw,geo_source,place_raw,tweet_id,tweet_locations_raw,user_id,user_location_raw,user_location_country_code,user_location_state,...,place_country_code,place_state,place_county,place_city,tweet_locations_country_code,tweet_locations_state,tweet_locations_county,tweet_locations_city,is_user_india_based,is_tweet_locations_inc_india
14,2020-02-01 06:14:49,{},tweet_text,{},1223489863552421888,"[{'country_code': 'fr', 'state': 'Provence-Alp...",955970244,{},,,...,,,,,"[fr, gb, us, in, ru, gb]","[Provence-Alpes-Côte d'Azur, England, Kentucky...","[Gap, Gloucestershire, Franklin County, None, ...","[Gap, None, None, None, None, None]",0,1
31,2020-02-01 22:04:52,{},tweet_text,{},1223728952692396032,"[{'country_code': 'in'}, {'country_code': 'cn'...",39795026,{},,,...,,,,,"[in, cn, in]","[None, Hubei, Delhi]","[None, Jiang'an District, None]","[None, Wuhan, Delhi]",0,1
36,2020-02-01 22:06:26,{},user_location,{},1223729344432001024,"[{'country_code': 'in'}, {'country_code': 'cn'...",3014309837,{'country_code': 'fi'},fi,,...,,,,,"[in, cn, in]","[None, Hubei, Delhi]","[None, Jiang'an District, None]","[None, Wuhan, Delhi]",0,1
41,2020-02-01 22:07:15,{},user_location,{},1223729550053584896,"[{'country_code': 'in'}, {'country_code': 'cn'...",211273540,"{'country_code': 'fr', 'state': 'Ile-de-France...",fr,Ile-de-France,...,,,,,"[in, cn, in]","[None, Hubei, Delhi]","[None, Jiang'an District, None]","[None, Wuhan, Delhi]",0,1
65,2020-02-01 06:18:41,{},tweet_text,{},1223490838589632512,"[{'country_code': 'ca', 'state': 'Newfoundland...",760700423200247808,{},,,...,,,,,"[ca, in, om, dk, ee, ye, ir, iq, ro, hr, fr, p...","[Newfoundland and Labrador, None, Ad Dakhiliya...","[None, None, None, Favrskov Municipality, Saar...","[None, None, None, None, None, None, None, Non...",0,1


In [28]:
def create_dir(dir_path):
    """Ensure that the directory ``dir_path`` exists.

    Missing parent directories are created as well, and calling this for a
    directory that already exists is a no-op.

    Parameters
    ----------
    dir_path : str
        Path of the directory to create.

    Raises
    ------
    FileExistsError
        If ``dir_path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() before calling os.mkdir().
    os.makedirs(dir_path, exist_ok=True)

In [29]:
# Write the ids of the India-related tweets, one per line, with no header
# and no index column.
create_dir(filtered_tweets_files_dir)
india_tweets_df[['tweet_id']].to_csv(filtered_tweets_file, header=False, index=False)

In [30]:
create_dir(hydrated_tweets_files_dir)

# Run `twarc hydrate` on the filtered tweet ids and capture its stdout in the
# hydrated file. The arguments are passed as a list (no shell), so paths that
# contain spaces or shell metacharacters are handled correctly. check=True
# raises subprocess.CalledProcessError when twarc exits with a non-zero
# status, instead of leaving a partial file for the next cell to read.
with open(hydrated_tweets_file, 'wb') as hydrated_out:
    hydrate_result = subprocess.run(
        ['twarc', 'hydrate', filtered_tweets_file],
        stdout=hydrated_out,
        check=True,
    )

# Display the exit status (0 on success), as the original cell did.
hydrate_result.returncode

0

In [29]:
# Load the hydrated tweets (one JSON record per line) into a DataFrame.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open(hydrated_tweets_file, encoding='utf-8') as f:
    hydrated_tweets_df = pd.read_json(f, lines=True)

In [30]:
# Preview the first rows of the hydrated tweets.
hydrated_tweets_df.head()

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,favorite_count,favorited,full_text,geo,...,retweet_count,retweeted,retweeted_status,scopes,source,truncated,user,withheld_copyright,withheld_in_countries,withheld_scope
0,,,2020-02-01 04:49:03,"[0, 74]","{'hashtags': [], 'symbols': [], 'user_mentions...",,0,False,"RT @AzChike: I respect rob, nigga love his hoo...",,...,14149,False,{'created_at': 'Fri Jan 31 21:09:11 +0000 2020...,,"<a href=""http://twitter.com/download/iphone"" r...",False,"{'id': 3016455522, 'id_str': '3016455522', 'na...",,,
1,,,2020-02-01 04:55:33,"[0, 73]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1223088161053691904, 'id_str...",0,False,"RT @Ls_Taniaa: Dioss mioo, 1000 puntos en seri...",,...,11050,False,{'created_at': 'Fri Jan 31 03:38:41 +0000 2020...,,"<a href=""http://twitter.com/download/iphone"" r...",False,"{'id': 2737765863, 'id_str': '2737765863', 'na...",,,
2,,,2020-02-01 04:56:05,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...",,0,False,RT @ANI: Air India Spokesperson: Another Air I...,,...,57,False,{'created_at': 'Sat Feb 01 04:16:15 +0000 2020...,,"<a href=""http://twitter.com/download/android"" ...",False,"{'id': 65591610, 'id_str': '65591610', 'name':...",,,
3,,,2020-02-01 04:58:16,"[0, 147]","{'hashtags': [{'text': 'coronavirus', 'indices...","{'media': [{'id': 1223470542914588674, 'id_str...",1,False,#coronavirus: Air India flight lands in New De...,,...,0,False,,,"<a href=""http://twitter.com/download/android"" ...",False,"{'id': 1068117508501065728, 'id_str': '1068117...",,,
4,,,2020-02-01 04:59:04,"[0, 281]","{'hashtags': [], 'symbols': [], 'user_mentions...",,3,False,The potential for 2nd &amp; 3rd order effects ...,,...,0,False,,,"<a href=""http://twitter.com/download/iphone"" r...",False,"{'id': 374870617, 'id_str': '374870617', 'name...",,,


In [31]:
# Summarise the hydrated frame: column dtypes and non-null counts.
hydrated_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26642 entries, 0 to 26641
Data columns (total 35 columns):
contributors                 0 non-null float64
coordinates                  4 non-null object
created_at                   26642 non-null datetime64[ns]
display_text_range           26642 non-null object
entities                     26642 non-null object
extended_entities            1493 non-null object
favorite_count               26642 non-null int64
favorited                    26642 non-null bool
full_text                    26642 non-null object
geo                          4 non-null object
id                           26642 non-null int64
id_str                       26642 non-null int64
in_reply_to_screen_name      1166 non-null object
in_reply_to_status_id        1089 non-null float64
in_reply_to_status_id_str    1089 non-null float64
in_reply_to_user_id          1166 non-null float64
in_reply_to_user_id_str      1166 non-null float64
is_quote_status              26642

In [38]:
# The hydrated frame calls the tweet id 'id'; rename it so both frames share
# the 'tweet_id' key.
hydrated_by_tweet_id_df = hydrated_tweets_df.rename(columns={'id': 'tweet_id'})

# Left join: keep every India-related tweet, attaching hydrated fields where
# a matching tweet_id exists.
merged_tweets_df = pd.merge(
    india_tweets_df,
    hydrated_by_tweet_id_df,
    how='left',
    on='tweet_id',
)

In [39]:
# Summarise the merged frame: column dtypes and non-null counts.
merged_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53909 entries, 0 to 53908
Data columns (total 60 columns):
created_at_x                    53909 non-null datetime64[ns]
geo_raw                         53909 non-null object
geo_source                      53909 non-null object
place_raw                       53909 non-null object
tweet_id                        53909 non-null int64
tweet_locations_raw             53909 non-null object
user_id                         53909 non-null int64
user_location_raw               53909 non-null object
user_location_country_code      29155 non-null object
user_location_state             15084 non-null object
user_location_county            8766 non-null object
user_location_city              6948 non-null object
geo_country_code                8 non-null object
geo_state                       8 non-null object
geo_county                      4 non-null object
geo_city                        7 non-null object
place_country_code              181 non

In [9]:
# Empty per-date summary table. The column list was missing (a syntax error);
# it is restored from the header shown by `summary_df.head()`.
summary_df = pd.DataFrame(columns=[
    'date',
    'total_tweets',
    'tweets_india',
    'tweets_outside_india',
    'user_india',
    'user_outside_india',
    'tweet_with_full_text',
    'tweet_without_full_text',
])

In [41]:
# Persist the merged tweets as CSV without the index column.
# Requires `merged_tweets_files_dir` and `merged_tweets_file` to be defined
# before this cell runs (expected alongside the other path settings).
create_dir(merged_tweets_files_dir)
merged_tweets_df.to_csv(path_or_buf=merged_tweets_file, index=False)

In [10]:
# Show the summary table (only its column header while it has no rows).
summary_df.head()

Unnamed: 0,date,total_tweets,tweets_india,tweets_outside_india,user_india,user_outside_india,tweet_with_full_text,tweet_without_full_text


In [7]:
# Load the cleaned tweets CSV for the configured date.
cleaned_tweets_df = pd.read_csv(cleaned_tweets_file)

In [10]:
# Inspect the full text of the row labelled 0 as a sample.
cleaned_tweets_df.loc[0,'full_text']

'normala sister talkin of viability gap funding , ppp for creating more hospitals , medical backbone but government of india 🇮🇳 asking &amp; sister orga with scarce resources to prepare to battle #coronavirus #budget2020 10s of orgs set up for emergencies 🤣😭'

In [20]:
# Restrict the preprocessor to the RESERVED, MENTION and URL options.
# This is a module-wide setting, so it also affects later calls on `p`.
p.set_options(p.OPT.RESERVED, p.OPT.MENTION, p.OPT.URL)

# Lower-case the sample tweet, clean it, then tokenize the cleaned text.
sample_text_lowercased = cleaned_tweets_df.loc[0, 'full_text'].lower()
sample_text_cleaned = p.clean(sample_text_lowercased)
p.tokenize(sample_text_cleaned)

'normala sister talkin of viability gap funding , ppp for creating more hospitals , medical backbone but government of india 🇮🇳 asking &amp; sister orga with scarce resources to prepare to battle #coronavirus #budget2020 10s of orgs set up for emergencies 🤣😭'

In [10]:
# Run the preprocessor's clean() over every tweet's full text.
cleaned_tweets_df['full_text'].apply(p.clean)

0        Normala Sister talkin of viability gap funding...
1        : Air Indias carrying evacuees from Wuhan has ...
2        : Indian already airlifted. This is india. Ism...
3        : One takeaway from the coronavirus? Dont puni...
4        : An Air India flight carrying Indian national...
5        : Coronavirus: Air India plane evacuates India...
6        : Dear all, We have created a dedicated URL fo...
7        : Relieved to see AI return safely to with of ...
8        : India expresses gratitude to China for aidin...
9        : I Was Literally Waiting For This: Swami Chak...
10                                                       :
11       : the flu: *results in hospitalizations and de...
12       : Everytime you abuse for its deficient servic...
13       : the flu: *results in hospitalizations and de...
14       : the flu: *results in hospitalizations and de...
15       : Air India special flight to evacuate Indian ...
16       : the flu: *results in hospitalizations and de.

In [25]:
# Project-local cleaner class, imported from the `clean_tweet` module.
from clean_tweet import CleanTweet
# NOTE(review): this instance is created but not used in the visible cells --
# presumably intended to be applied to 'full_text'; confirm.
clean_tweet = CleanTweet()

# Display the full-text column.
cleaned_tweets_df.loc[:, 'full_text']

0        Normala Sister talkin of viability gap funding...
1        : Air India’s 747 carrying evacuees from Wuhan...
2        : Indian already airlifted. This is india. Ism...
3        : One takeaway from the coronavirus? Don’t pun...
4        : An Air India flight carrying 324 Indian nati...
5        : Coronavirus: Air India plane evacuates 324 I...
6        : Dear all, We have created a dedicated URL fo...
7        : Relieved to see AI 1349 return safely to #De...
8        : India expresses gratitude to China for aidin...
9        : I Was Literally Waiting For This: #Indian Sw...
10       : #nsfw #xxx #freeporn #hdporn #livesex #porn ...
11       : the flu: *results in 500,000 hospitalization...
12       : Everytime you abuse for its deficient servic...
13       : the flu: *results in 500,000 hospitalization...
14       : the flu: *results in 500,000 hospitalization...
15       : Air India special flight to evacuate Indian ...
16       : the flu: *results in 500,000 hospitalization.