In [129]:
import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt

# Let pandas render up to 999 rows before truncating displayed frames.
pd.set_option('display.max_rows', 999)


### Importing Scraped Tweets

In [92]:
# Load the scraped tweets for each hurricane.
michael_df = pd.read_csv('./Data/michael.csv')
florence_df = pd.read_csv('./Data/florence.csv')

# Exclude the SouthDakotaDOT account from the Florence tweets.
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the in-place column assignments done later in the notebook do not
# raise SettingWithCopyWarning.
florence_df = florence_df[florence_df['username'] != 'SouthDakotaDOT'].copy()

## Removing Hyperlinks, Image Links, and Certain Special Characters, and Adding Date/Time Columns

In [95]:
def clean_tweets(df):
    """Clean a frame of scraped tweets in place.

    Steps, in order:
      1. remove ``pic.twitter.com`` image links (through the end of that line,
         plus any line breaks that follow);
      2. replace ``http://`` / ``https://`` links with a single space;
      3. lowercase the text;
      4. remove the characters ``. / , ' ; ( )``;
      5. parse ``date`` and split it into a ``date`` column (datetime.date)
         and a new ``time`` column (datetime.time);
      6. drop the ``hashtags``, ``type`` and ``geo`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``text``, ``date``, ``hashtags``, ``type`` and ``geo`` columns.
        Missing tweets (NaN/None in ``text``) are left as missing.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    KeyError
        If any of the columns listed above is absent.
    """
    text = df['text']

    # Every text step uses the vectorised .str accessor, which passes missing
    # values through untouched (calling re.sub on a NaN raises TypeError).
    # regex=True is spelled out because the default changed between pandas
    # versions.
    text = text.str.replace(r'pic\.twitter\.com.*[\r\n]*', '', regex=True)
    # Links are removed before punctuation so the '.' and '/' inside a URL
    # are still there for the pattern to match.
    text = text.str.replace(r'https?://\S*', ' ', regex=True)
    text = text.str.lower()
    # Removing certain special characters in a single pass.
    text = text.str.replace(r"[./,';()]", '', regex=True)
    df['text'] = text

    # Parse the timestamp once, then store its time and date parts separately.
    stamp = pd.to_datetime(df['date'])
    df['time'] = stamp.dt.time
    df['date'] = stamp.dt.date

    # Removing unneeded columns.
    df.drop(columns=['hashtags', 'type', 'geo'], inplace=True)
    return

    

# Clean both tweet frames. clean_tweets modifies the frame it is given and
# returns nothing, so there is no result to assign.
clean_tweets(florence_df)
clean_tweets(michael_df)



## Adding States
---
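Adding the state names will be essential when mapping out the locations of road closures or blockages. For Hurricanes Michael and Harvey, we only observed one state each, Florida and Texas, respectively. Hurricane Florence tweets are labelled as either South Carolina or North Carolina, based on the account that posted them.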

In [97]:
# Michael: label the three Georgia accounts, call everything else Florida,
# then keep only the Florida tweets.
michael_df['state'] = np.where(
    michael_df['username'].isin(['AlbanyGaPD', '511Georgia', 'CityofAlbanyGA']),
    'Georgia',
    'Florida',
)
# .copy() detaches the filtered frame so that adding columns to it later does
# not raise SettingWithCopyWarning.
michael_df = michael_df[michael_df['state'] != 'Georgia'].copy()

# Florence: the listed accounts are labelled South Carolina, all others North
# Carolina. 'SouthDakotaDOT' is kept in the list as in the original, although
# that account is already filtered out when the CSV is loaded.
florence_df['state'] = np.where(
    florence_df['username'].isin(
        ['SCDOTPeeDee', 'SCDOTPiedmont', 'SCDOTMidlands', 'SCDOTLowCountry', 'SouthDakotaDOT']
    ),
    'South Carolina',
    'North Carolina',
)


## Splitting Tweet Text into Words

In [98]:
# Tokenise each tweet on whitespace; split_text holds one list of words per row.
michael_df['split_text'] = michael_df.text.str.split()
florence_df['split_text'] = florence_df.text.str.split()


In [100]:
# Write the cleaned frames to disk without the index column.
# The last cell of the notebook writes to these same two file names again.
florence_df.to_csv(path_or_buf='florence_clean.csv', index=False)
michael_df.to_csv(path_or_buf='michael_clean.csv', index=False)


## Making Lists of Incident Keywords
---
Keyword lists used to tag each tweet with a road status (closed or reopened) and the original cause of the incident.


In [102]:
# Keywords that mark a tweet as reporting a closed or restricted road.
# They are joined into one case-insensitive regex alternation further down,
# so the order of the entries is kept exactly.
closed = [
    'closure', 'closed', 'road work', 'Maintenance',
    'clsd', 'avoid', 'stay off', 'block',
    'blocked', 'shut down', 'inaccessible', 'closing',
    'no vehicles', 'sealed', 'restricted', 'crash',
]


In [103]:
# Keywords that mark a tweet as reporting a road that is open again.
# Order is kept exactly because the list is joined into a regex alternation.
reopen = [
    'finish', 'finished', 'open', 'opened',
    'ended', 'fixed', 'reopened', 'repaired',
    'cleared', 'concluded', 'restored',
]

In [104]:
# Keywords describing why a road is affected.
# 'weather' was listed twice; the duplicate is removed. The remaining entries
# keep their original order because the list is joined into a regex alternation.
cause = [
    'flood', 'flooded', 'crash', 'crashed', 'accident', 'collision',
    'collided', 'damage', 'damaged', 'repair', 'obstacle', 'obstacles',
    'disabled', 'wet', 'visibility', 'hazard', 'tree', 'weather',
    'slick', 'storm', 'maintenance', 'congestion', 'congested', 'event',
]

## Adding Status
---
Removing any tweets that involved the reopening of a road or tweets involving no closures. 

In [106]:
def closed_open_cause(df):
    """Tag each tweet with the first closure, reopening and cause keyword it contains.

    Searches ``df.text`` case-insensitively for the keywords in the
    module-level lists ``closed``, ``reopen`` and ``cause`` and stores the
    matched text in the columns ``closed``, ``open`` and ``cause``
    respectively. A row gets NaN in a column when none of that list's
    keywords occurs in the tweet.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``text`` column.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    # NOTE(review): keywords are matched as substrings, not whole words, so
    # 'tree' also matches inside "street" and 'ended' inside "recommended".
    # This is left unchanged because 'open' currently relies on substring
    # matching to catch "reopen"/"reopening"; confirm before tightening.
    for column, keywords in (('closed', closed), ('open', reopen), ('cause', cause)):
        # Regex alternation takes the first alternative that matches at a
        # position, so with 'flood' listed before 'flooded' the longer word
        # could never be reported. Trying longer keywords first fixes that.
        # Which rows match is unaffected; only the reported keyword changes.
        longest_first = sorted(keywords, key=len, reverse=True)
        pattern = '(?i)({0})'.format('|'.join(longest_first))
        # expand=False returns a Series for the single capture group.
        df[column] = df.text.str.extract(pattern, expand=False)
    return

# Add the closed / open / cause keyword columns to both frames. The function
# assigns the columns on the frame it is given and returns nothing.
closed_open_cause(michael_df)
closed_open_cause(florence_df)

In [82]:
# Keep only tweets that matched a closure keyword (closed is set) and did not
# match any reopening keyword (open is missing).
michael_df = michael_df[michael_df['closed'].notna() & michael_df['open'].isna()]

florence_df = florence_df[florence_df['closed'].notna() & florence_df['open'].isna()]

In [83]:
# Write the filtered frames to disk without the index column. These are the
# same file names used by the earlier export cell, so those files are overwritten.
florence_df.to_csv(path_or_buf='florence_clean.csv', index=False)
michael_df.to_csv(path_or_buf='michael_clean.csv', index=False)
