In [1]:
# Standard library
import json
import os
import random
import re
from datetime import datetime

# Third-party
import dask
import dask.bag as db
import dask.dataframe as dd
import emoji
import pandas as pd
from dask.distributed import Client, progress

# Seed the stdlib RNG once, after all imports.
# NOTE: this does not seed NumPy/pandas sampling.
random.seed(493)

# Dask client with 4 workers x 2 threads; the bare `client` expression below
# displays it as the cell output.
client = Client(n_workers=4, threads_per_worker=2)
client

In [None]:
# Dataset key: selects ../data/big_data/<dset>_data/ as input and
# ../data/pickles/<dset>/ as the output folder in the cells below.
dset = "all"

In [21]:
def process_raw_data(path, dset, name):
    """Load geotagged tweets from raw JSON-lines files, cached as a pickle.

    On a cache miss every file matching ``{path}/x**`` is read through Dask,
    each line is parsed as JSON, records without coordinates are dropped and
    the rest are flattened to the columns ``full_text``, ``created_at``,
    ``id``, ``lon`` and ``lat``. The frame is de-duplicated on ``id``, written
    to ``../data/pickles/{dset}/{name}.pkl`` and returned. On a cache hit the
    pickle is read back instead.

    Parameters
    ----------
    path : str
        Directory holding the raw ``x*`` files.
    dset : str
        Dataset key; names the sub-folder of ``../data/pickles``.
    name : str
        Base name (without extension) of the cache file.

    Returns
    -------
    pandas.DataFrame
        One row per unique tweet ``id`` on both the cache-miss and the
        cache-hit path. The index is NOT reset, so callers that need
        positional alignment should reset it themselves.
    """
    cache_path = f"../data/pickles/{dset}/{name}.pkl"

    if os.path.exists(cache_path):
        # Only unpickle files this notebook wrote itself: unpickling
        # untrusted data can execute arbitrary code.
        return pd.read_pickle(cache_path).drop_duplicates(subset=['id'])

    print(client, client.dashboard_link)
    records = db.read_text(f'{path}/x**').map(json.loads)

    def flatten(rec):
        # The code maps element 0 of the coordinate pair to lon, element 1 to lat.
        point = rec['coordinates']['coordinates']
        return {
            'full_text': rec['full_text'],
            'created_at': rec['created_at'],
            'id': rec['id'],
            'lon': point[0],
            'lat': point[1],
        }

    tweets_df = (
        records
        # .get() so a record lacking the key is skipped rather than raising.
        .filter(lambda rec: rec.get('coordinates') is not None)
        .map(flatten)
        .to_dataframe()
        .compute()
        # De-duplicate BEFORE returning so the first run and later cached
        # runs hand back the same rows.
        .drop_duplicates(subset=['id'])
    )

    # to_pickle does not create missing parent directories.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    print(f"saving {cache_path}")
    tweets_df.to_pickle(cache_path, protocol=4)
    return tweets_df


# Parse (or load from cache) the raw tweet dump for this dataset; cached as "big".
all_tweets_df = process_raw_data(
    path=f"../data/big_data/{dset}_data/",
    dset=dset,
    name='big',
)
all_tweets_df.head()

Unnamed: 0,full_text,created_at,id,lon,lat
0,Everyone up north stay safe from hurricane sandy,Mon Oct 29 22:50:17 +0000 2012,263050150904201200,-97.256829,32.840356
1,Hurricane sandy relief rt? 8 days no power/hea...,Wed Nov 07 00:50:37 +0000 2012,265979533935644670,-73.477156,40.661248
2,Whoah thats huge. RT @HuffPostGreen: Complete ...,Fri Nov 02 02:56:48 +0000 2012,264199351025410050,-87.621128,41.8908
3,Caught a #wedding ❤️across the street from an ...,Sat Oct 06 19:31:43 +0000 2018,1048657069182804000,-77.896944,34.036667
4,“@TIME: Satellite photo of Tropical Storm Arth...,Thu Jul 03 04:34:03 +0000 2014,484555647254421500,-77.470397,37.553711


In [22]:
def preprocess(text):
    """Decode HTML entities (e.g. ``&amp;`` -> ``&``) in each text.

    Nothing else is cleaned: URLs, mentions and hashtags pass through
    unchanged.

    Parameters
    ----------
    text : iterable of str
        Raw tweet texts.

    Returns
    -------
    list of str
        Unescaped texts, in input order (empty list for empty input).
    """
    # Local import: `html` is not imported at the top of the notebook.
    import html

    # A plain comprehension replaces the thread pool: the per-item work is
    # tiny, and there is no pool to leak if an item raises.
    return [html.unescape(doc) for doc in text]

# Drop tweets with empty text. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not write into a
# slice of the original (SettingWithCopyWarning).
has_text = all_tweets_df['full_text'].str.len() != 0
all_tweets_df = all_tweets_df.loc[has_text].copy()

# Decode HTML entities in every remaining text.
all_tweets_df['full_text'] = preprocess(all_tweets_df['full_text'].tolist())
all_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37670 entries, 0 to 1314
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   full_text   37670 non-null  object 
 1   created_at  37670 non-null  object 
 2   id          37670 non-null  int64  
 3   lon         37670 non-null  float64
 4   lat         37670 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.7+ MB


In [23]:
# Peek at a few cleaned texts. pandas sampling is not seeded by the
# random.seed() call at the top of the notebook, so pin random_state to
# make this output repeatable on re-run.
all_tweets_df['full_text'].sample(5, random_state=493).values

array(['In Every Store You Can Donate To Hurricane Harvey Victims #helpharveyvictims @ Family Dollar https://t.co/pqNpfRAr62',
       'Hallowen (@ Frankenstorm Apocalypse - Hurricane Sandy w/ 159 others) http://t.co/AKzTypw',
       '“@amyhoyt13: Everything #Isaac at http://t.co/Ii9SdLhb” Amy main anchor in #PanamaCity, FL, where the storm is expected to make landfall.',
       'If you fill your tub with water as a prep for Hurricane Sandy, please make sure to remember Backstroke flags, so you can get 2 swim practice',
       "#PuertoRico is facing another Hurricane this week. #Maria is coming. I'm at #Walmart and there… https://t.co/YldnAmjwQ2"],
      dtype=object)

In [24]:
def get_inexact_location(latlong):
    """Reverse-geocode coordinate pairs and return the matches as a DataFrame.

    `latlong` is handed unchanged to ``reverse_geocoder.search``; whatever
    that call returns is wrapped in a ``pandas.DataFrame`` with a default
    index.
    """
    import reverse_geocoder

    matches = reverse_geocoder.search(latlong)
    location_frame = pd.DataFrame(matches)
    return location_frame
# Build one (lat, lon) tuple per tweet, in row order, and reverse-geocode them.
lat_lon_pairs = list(zip(all_tweets_df['lat'], all_tweets_df['lon']))
locs = get_inexact_location(lat_lon_pairs)
locs

Unnamed: 0,lat,lon,name,admin1,admin2,cc
0,32.85791,-97.25474,Watauga,Texas,Tarrant County,US
1,40.66593,-73.48818,Seaford,New York,Nassau County,US
2,41.85003,-87.65005,Chicago,Illinois,Cook County,US
3,34.03517,-77.8936,Carolina Beach,North Carolina,New Hanover County,US
4,37.55376,-77.46026,Richmond,Virginia,City of Richmond,US
...,...,...,...,...,...,...
37665,26.58368,-80.10032,Seminole Manor,Florida,Palm Beach County,US
37666,37.63049,-122.41108,San Bruno,California,San Mateo County,US
37667,33.98928,-83.4096,Country Club Estates,Georgia,Clarke County,US
37668,21.30694,-157.85833,Honolulu,Hawaii,Honolulu County,US


In [25]:
# Attach each tweet's reverse-geocoded location. `locs` was built row-by-row
# from all_tweets_df, so the join is positional: locs carries a default
# 0..n-1 index, while the tweets' index is not 0..n-1. Reset the tweets'
# index BEFORE the index-on-index merge; otherwise each tweet is paired
# with the location whose row number happens to equal its index label.
assert len(locs) == len(all_tweets_df), "locs must have one row per tweet"
all_tweets_df = all_tweets_df.reset_index(drop=True).merge(locs, left_index=True, right_index=True)

# Both frames have 'lat'/'lon' columns, so pandas suffixes them _x (tweet) / _y (geocoder).
unique_tweets_df = all_tweets_df.drop_duplicates('id')
print(unique_tweets_df.shape, all_tweets_df.shape)
unique_tweets_df.to_pickle(f"../data/pickles/{dset}/cleaned.pkl", protocol=4)

(37670, 11) (37670, 11)


In [7]:
# Upload this dataset's cleaned.pkl and big.pkl to ~/LING_506_MAC/ on the remote host.
# `$dset` is expanded by IPython from the notebook variable `dset`.
# NOTE(review): user and host are hardcoded; running this needs SSH access to that account.
!scp -r ../data/pickles/$dset/cleaned.pkl  atharva2@linux.ews.illinois.edu:~/LING_506_MAC/
!scp -r ../data/pickles/$dset/big.pkl  atharva2@linux.ews.illinois.edu:~/LING_506_MAC/


stdin: is not a tty
cleaned.pkl                                   100% 7312KB   7.3MB/s   00:00    
stdin: is not a tty
big.pkl                                       100% 6160KB   3.3MB/s   00:01    


In [8]:
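# Manual step: run big_proc.ipynb on the remote host before continuing.
# Presumably it turns the uploaded big.pkl / cleaned.pkl into the
# bigmoji.pkl / cleanmoji.pkl files downloaded in the next cell (TODO confirm).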

In [26]:
# Download bigmoji.pkl and cleanmoji.pkl from ~/LING_506_MAC/ on the remote host
# into this dataset's pickle folder (`$dset` is expanded by IPython).
# NOTE(review): user and host are hardcoded; the files must already exist remotely.
! scp -r atharva2@linux.ews.illinois.edu:~/LING_506_MAC/bigmoji.pkl ../data/pickles/$dset/
! scp -r atharva2@linux.ews.illinois.edu:~/LING_506_MAC/cleanmoji.pkl ../data/pickles/$dset/    

stdin: is not a tty
bigmoji.pkl                                   100% 7253KB   6.4MB/s   00:01    
stdin: is not a tty
cleanmoji.pkl                                 100%   15MB   8.2MB/s   00:01    


In [8]:
# Completion marker: prints a fixed message, no other effect.
print("Done!")

Done!


## Process Hurricane Path

In [9]:
# Read the hurricane track CSV, parse TIME_ (YYYYmmddHHMMSS) into datetimes,
# and cache the result next to this dataset's other pickles.
fn = '../data/track.csv'
track = pd.read_csv(fn).assign(
    TIME_=lambda frame: pd.to_datetime(frame['TIME_'], format='%Y%m%d%H%M%S')
)
track.to_pickle(f"../data/pickles/{dset}/track.pkl", protocol=4)

In [10]:
# Fetch the Emoji Sentiment Ranking page and keep the first HTML table on it.
emoji_tables = pd.read_html("http://kt.ijs.si/data/Emoji_sentiment_ranking")
emoji2sentdf = emoji_tables[0]

# The file name is spelled "emoji2send" in the original; kept unchanged here
# in case other notebooks read it.
emoji2sentdf.to_pickle("./../data/pickles/emoji2send.pkl", protocol=4)