In [59]:
import os
import pandas as pd
import dask.dataframe as dd
import random
import json
import emoji
random.seed(493)


In [98]:
def process_raw_data(path, name):
    if not os.path.exists(f"./data/{name}.pkl"):
        import dask
        import json
        import dask.dataframe as dd
        import dask.bag as db
        from dask.distributed import Client, progress
        client = Client(n_workers=4, threads_per_worker=1)
        print(client, client.dashboard_link)
        b = db.read_text(f'{path}/x**').map(json.loads)

        flatten = lambda rec: {
                    'full_text': rec['full_text'],
                    'created_at': rec['created_at'],
                    'id': rec['id'],
                    'lon': rec['coordinates']['coordinates'][0],
                    'lat': rec['coordinates']['coordinates'][1]}

        tweets_df = b.filter(lambda record: record['coordinates'] is not None)
        tweets_df = tweets_df.map(flatten).to_dataframe().compute()
        print(f"saving ./data/{name}.pkl")
        tweets_df.drop_duplicates(subset=['id']).to_pickle(f"./data/{name}.pkl")
    else:
        tweets_df = pd.read_pickle(f"./data/{name}.pkl").drop_duplicates(subset=['id'])
    return tweets_df


all_tweets_df = process_raw_data("./data/big_data/sandy_data/", 'big')
all_tweets_df.head()

Unnamed: 0,full_text,created_at,id,lon,lat
0,So there's going to be a hurricane tonight and...,Thu Oct 25 19:18:02 +0000 2012,261547183722082300,-80.062265,26.844194
1,They named the hurricane that will be coming u...,Thu Oct 25 19:18:42 +0000 2012,261547349921370100,-74.54917,40.618274
2,@Pototo_28 LMFAO!!!! I'm quite jealous that a ...,Thu Oct 25 19:18:46 +0000 2012,261547369261301760,-80.327574,25.870439
3,So we're suppose to get a hurricane Monday...,Thu Oct 25 19:19:06 +0000 2012,261547450463051780,-72.901798,40.912246
4,I'm pumped for this hurricane,Thu Oct 25 19:19:36 +0000 2012,261547576652881920,-74.100116,40.855067


In [99]:
def preprocess(text):
    import preprocessor as p
    import html
    from multiprocessing.dummy import Pool as ThreadPool    
    p.set_options(p.OPT.ESCAPE_CHAR, p.OPT.URL)
    pool = ThreadPool()
    results = pool.map(lambda d : p.clean(html.unescape(d)), text)
    pool.close()
    pool.join()
    return results

all_tweets_df['full_text'] = preprocess(list(all_tweets_df['full_text'].values))
all_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79681 entries, 0 to 551
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   full_text   79681 non-null  object 
 1   created_at  79681 non-null  object 
 2   id          79681 non-null  int64  
 3   lon         79681 non-null  float64
 4   lat         79681 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 3.6+ MB


In [100]:
all_tweets_df['full_text'].sample(5).values

array(['@irinatag Yup', '@iSlapThirsties did the hurricane hit yet ?',
       'Jangan sampe "@monicaanggi_: mau ada badai sandy kali bang ._. RT @Poconggg: Ini kenapa Bandung malah jadi kayak mau badai gini..',
       'We can see a bright side in the darkness that was #Sandy. There will be many new opportunities for jobs for many American sectors.',
       '@LITeaParty @toddschnitt @seanhannity 4 the record, Pres. Bush had NOTHING 2 do with Hurricane Sandy! #obamadeception'],
      dtype=object)

In [101]:
def get_inexact_location(latlong):
    import reverse_geocoder as rg
    results = rg.search(latlong)
    return pd.DataFrame(results)
locs = get_inexact_location(list(zip(all_tweets_df['lat'], all_tweets_df['lon'])))
locs

Unnamed: 0,lat,lon,name,admin1,admin2,cc
0,26.81756,-80.08199,North Palm Beach,Florida,Palm Beach County,US
1,40.60121,-74.55905,Martinsville,New Jersey,Somerset County,US
2,25.8651,-80.3245,Hialeah Gardens,Florida,Miami-Dade County,US
3,40.89399,-72.89594,Ridge,New York,Suffolk County,US
4,40.85316,-74.11375,Wallington,New Jersey,Bergen County,US
...,...,...,...,...,...,...
79676,40.194,-74.04875,Shark River Hills,New Jersey,Monmouth County,US
79677,10.48801,-66.87919,Caracas,Capital,Municipio Libertador,VE
79678,36.23708,-79.97948,Stokesdale,North Carolina,Guilford County,US
79679,6.13328,102.2386,Kota Bharu,Kelantan,,MY


In [105]:
all_tweets_df = all_tweets_df.merge(locs, left_index=True, right_index=True).reset_index(drop=True)
# all_tweets_df.to_pickle("./data/cleaned.pkl")

In [106]:
all_tweets_df = pd.read_pickle('./data/cleaned.pkl')


In [107]:
emoji_df = pd.read_pickle("./data/bigmoji.pkl")

In [108]:
codes = json.load(open("./torchMoji/data/emoji_codes.json"))
emoji2sentdf = pd.read_html("http://kt.ijs.si/data/Emoji_sentiment_ranking/", )[0]

In [109]:
num2emoji = {int(num): emoji.emojize(ji, use_aliases=True) for num, ji in codes.items()}
emoji2sent = pd.Series(emoji2sentdf['Sentiment score[-1...+1]'].values,index=emoji2sentdf['Char'].values).to_dict()



In [110]:
emoji_cols = [f'Emoji_{i}' for i in range(1,5)]

In [111]:
for i in range(1, 6):
    emoji_df[f'Emoji_{i}'] = emoji_df[f'Emoji_{i}'].map(num2emoji)
    emoji_df[f'Sent_{i}'] = emoji_df[f'Emoji_{i}'].map(emoji2sent)


In [113]:
# all_tweets_df

In [114]:
all_tweets_df.merge(emoji_df, left_on='full_text', right_on='Text', how='left').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93890 entries, 0 to 93889
Data columns (total 34 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   full_text   93890 non-null  object 
 1   created_at  93890 non-null  object 
 2   id          93890 non-null  int64  
 3   lon_x       93890 non-null  float64
 4   lat_x       93890 non-null  float64
 5   lat_y       93890 non-null  object 
 6   lon_y       93890 non-null  object 
 7   name_x      93890 non-null  object 
 8   admin1_x    93890 non-null  object 
 9   admin2_x    93890 non-null  object 
 10  cc_x        93890 non-null  object 
 11  lat         93890 non-null  object 
 12  lon         93890 non-null  object 
 13  name_y      93890 non-null  object 
 14  admin1_y    93890 non-null  object 
 15  admin2_y    93890 non-null  object 
 16  cc_y        93890 non-null  object 
 17  Text        62655 non-null  object 
 18  Top5%       62655 non-null  float64
 19  Emoji_1     62655 non-nul

In [95]:
len(set(all_tweets_df['id']))

79681

1       263084309148233730
1       263101027190972400
1       263117376432791550
2       263133532820500480
3       262706188590907400
               ...        
3099    262678633498415100
3100    262678633490038800
3101    262678635721400320
3102    262678510420758530
3103    262678510265585660
Name: id, Length: 13187, dtype: int64

In [81]:
all_tweets_df.drop_duplicates(subset=['id'])

Unnamed: 0,full_text,created_at,id,lon,lat
0,So there's going to be a hurricane tonight and...,Thu Oct 25 19:18:02 +0000 2012,261547183722082300,-80.062265,26.844194
1,They named the hurricane that will be coming u...,Thu Oct 25 19:18:42 +0000 2012,261547349921370100,-74.549170,40.618274
2,@Pototo_28 LMFAO!!!! I'm quite jealous that a ...,Thu Oct 25 19:18:46 +0000 2012,261547369261301760,-80.327574,25.870439
3,So we're suppose to get a hurricane Monday...,Thu Oct 25 19:19:06 +0000 2012,261547450463051780,-72.901798,40.912246
4,I'm pumped for this hurricane,Thu Oct 25 19:19:36 +0000 2012,261547576652881920,-74.100116,40.855067
...,...,...,...,...,...
547,I think this hurricane was the best thing that...,Mon Nov 05 08:32:20 +0000 2012,265370953847025660,-74.093305,40.239594
548,Via @teleSURtv: Cifra de muertos por huracán S...,Mon Nov 05 08:33:10 +0000 2012,265371165042831360,-66.879190,10.488010
549,Before & after pics of #hurricanesandy #sandy ...,Mon Nov 05 08:37:09 +0000 2012,265372164579024900,-79.936245,36.305958
550,Nama taufan pun Sandy.Ingat bila one day dah a...,Mon Nov 05 08:38:50 +0000 2012,265372591559163900,102.251083,6.115850
