In [1]:
import os
import pandas as pd
import random
import re
import json
import emoji
from datetime import datetime
random.seed(493)
import dask
import json
import dask.dataframe as dd
import dask.bag as db
from dask.distributed import Client, progress
client = Client(n_workers=4, threads_per_worker=2)

client

0,1
Client  Scheduler: tcp://127.0.0.1:61595  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 17.18 GB


In [2]:
def process_raw_data(path, name):
    if not os.path.exists(f"../data/{name}.pkl"):
        print(client, client.dashboard_link)
        b = db.read_text(f'{path}/x**').map(json.loads)
        flatten = lambda rec: {
                    'full_text': rec['full_text'],
                    'created_at': rec['created_at'],
                    'id': rec['id'],
                    'lon': rec['coordinates']['coordinates'][0],
                    'lat': rec['coordinates']['coordinates'][1]}

        tweets_df = b.filter(lambda record: record['coordinates'] is not None)
        tweets_df = tweets_df.map(flatten).to_dataframe().compute()
        print(f"saving ../data/{name}.pkl")
        tweets_df.drop_duplicates(subset=['id']).to_pickle(f"../data/{name}.pkl")
    else:
        tweets_df = pd.read_pickle(f"../data/{name}.pkl").drop_duplicates(subset=['id'])
    return tweets_df


all_tweets_df = process_raw_data("../data/big_data/hurricane_data/", 'abig')
all_tweets_df.head()

<Client: 'tcp://127.0.0.1:61595' processes=4 threads=8, memory=17.18 GB> http://127.0.0.1:8787/status
saving ../data/abig.pkl


Unnamed: 0,full_text,created_at,id,lon,lat
0,Everyone up north stay safe from hurricane sandy,Mon Oct 29 22:50:17 +0000 2012,263050150904201200,-97.256829,32.840356
1,Hurricane sandy relief rt? 8 days no power/hea...,Wed Nov 07 00:50:37 +0000 2012,265979533935644670,-73.477156,40.661248
2,Whoah thats huge. RT @HuffPostGreen: Complete ...,Fri Nov 02 02:56:48 +0000 2012,264199351025410050,-87.621128,41.8908
3,Caught a #wedding ❤️across the street from an ...,Sat Oct 06 19:31:43 +0000 2018,1048657069182804000,-77.896944,34.036667
4,“@TIME: Satellite photo of Tropical Storm Arth...,Thu Jul 03 04:34:03 +0000 2014,484555647254421500,-77.470397,37.553711


In [4]:
def preprocess(text):
    import preprocessor as p
    import html
    from multiprocessing.dummy import Pool as ThreadPool    
    p.set_options(p.OPT.ESCAPE_CHAR, p.OPT.URL)
    pool = ThreadPool()
    results = pool.map(lambda d : p.clean(html.unescape(d)), text)
    pool.close()
    pool.join()
    return results

all_tweets_df = all_tweets_df[all_tweets_df['full_text'].str.len() != 0]
all_tweets_df['full_text'] = preprocess(list(all_tweets_df['full_text'].values))
all_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37675 entries, 0 to 1314
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   full_text   37675 non-null  object 
 1   created_at  37675 non-null  object 
 2   id          37675 non-null  int64  
 3   lon         37675 non-null  float64
 4   lat         37675 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.7+ MB


In [5]:
all_tweets_df['full_text'].sample(5).values

array(['#Natural_Hazards Sediment in New York Harbor: In the wake of Hurricane Irene’s heavy rains, sediment filled t...',
       'National Hurricane Center drops Tropical Storm Watch for Indian River and St. Lucie Counties in east central Florida. #Arthur #Tropics #flwx',
       'Знаєте,чому Бог створив спочатку чоловіка,а потім жінку?... спочатку робиться ескіз,ну а потім вже шедевр...',
       'RT @mashable Hurricane Irene: NASA Posts Dramatic Full-Earth View [PIC]',
       'Our thoughts go out to those who were effected by Hurricane Harvey. #psi #mentalhealth #therapy…'],
      dtype=object)

In [7]:
def get_inexact_location(latlong):
    import reverse_geocoder as rg
    results = rg.search(latlong)
    return pd.DataFrame(results)
locs = get_inexact_location(list(zip(all_tweets_df['lat'], all_tweets_df['lon'])))
locs

Loading formatted geocoded file...


Unnamed: 0,lat,lon,name,admin1,admin2,cc
0,32.85791,-97.25474,Watauga,Texas,Tarrant County,US
1,40.66593,-73.48818,Seaford,New York,Nassau County,US
2,41.85003,-87.65005,Chicago,Illinois,Cook County,US
3,34.03517,-77.8936,Carolina Beach,North Carolina,New Hanover County,US
4,37.55376,-77.46026,Richmond,Virginia,City of Richmond,US
...,...,...,...,...,...,...
37670,26.58368,-80.10032,Seminole Manor,Florida,Palm Beach County,US
37671,37.63049,-122.41108,San Bruno,California,San Mateo County,US
37672,33.98928,-83.4096,Country Club Estates,Georgia,Clarke County,US
37673,21.30694,-157.85833,Honolulu,Hawaii,Honolulu County,US


In [8]:
all_tweets_df = all_tweets_df.merge(locs, left_index=True, right_index=True).reset_index(drop=True)
all_tweets_df.to_pickle("../data/acleaned.pkl", protocol=4)

In [9]:
# ssh onto linux.ews.illinois.edu
! scp -r ../data/acleaned.pkl atharva2@linux.ews.illinois.edu:~/LING_506_MAC/data/
# big_proc.ipynb

stdin: is not a tty
scp: /home/atharva2/LING_506_MAC/data/: Is a directory


## Process Hurricane Path

In [22]:
# fn  = '../data/track.dat'
# rec = {'lat':[],'lon':[],'wind':[],'press':[],'dt':[],'cat':[]}
# for i,line in enumerate(open(fn)):
#     if i == 0: continue  # Jump over the first line
#     # replace multiple whitespaces with a single whitespace
#     line   = re.sub(r"\s+", ' ', line)
#     pieces = line.split(" ")
#     # retrieve information
#     rec['lat'].append(float(pieces[0]))
#     rec['lon'].append(float(pieces[1]))
#     rec['wind'].append(float(pieces[3]))
#     rec['press'].append(float(pieces[4]))
#     rec['cat'].append((" ".join(pieces[5:])).strip())
#     time   = pieces[2]
#     time   = "2012/" + time
#     rec['dt'].append(datetime.strptime(time,"%Y/%m/%d/%HZ"))
    
    
# track = pd.DataFrame.from_dict(rec)

# track.to_pickle("../data/track.pkl", protocol=4)

In [23]:
track = pd.read_csv("../data/track.csv")

track['TIME_'] = pd.to_datetime(track['TIME_'], format='%Y%m%d%H%M%S')

track.to_pickle("../data/track.pkl", protocol=4)