# This code transforms text data (tweets) into hourly wait times for the Edmonds-Kingston Washington State Ferry run

## Import necessary packages

In [2]:
import pandas as pd
import glob
import numpy as np

## load data

In [3]:
# Collect every CSV export in ./data. glob returns paths in arbitrary,
# OS-dependent order, so sort them to keep the concatenated row order (and
# therefore the integer index assigned below) reproducible between runs.
all_files = sorted(glob.glob("./data/*.csv"))

# Fail early with a clear message; pd.concat would otherwise raise a vague
# "No objects to concatenate" error when the folder is empty or missing.
if not all_files:
    raise FileNotFoundError("No CSV files found matching ./data/*.csv")

# Read each file lazily and stack them into one frame with a fresh 0..n-1 index.
df_from_each_file = (pd.read_csv(f) for f in all_files)
df = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
# Show the column names as loaded from the CSV files.
df.columns

Index(['Tweet permalink', 'Tweet text', 'time'], dtype='object')

In [5]:
# Normalize the column names: lowercase them first, then swap spaces for underscores.
lowercase_columns = df.columns.str.lower()
df.columns = lowercase_columns.str.replace(' ', '_')

## Extract wait times from tweets

### Assumptions:
* wait-related tweets contain the word 'wait'
* edmonds-related tweets contain the full word 'edmonds'
* kingston-related tweets contain the full word 'kingston'
* wait durations are given either as spelled-out numbers (e.g. 'one') or as numerals (e.g. '1')

### Standard tweet Examples:
* Edm/King - Edmonds and Kingston Terminal Status - 2 Hour Wait
* Edm/King - Kingston Terminal Status - Two Hour Wait
* Edm/King - Edmonds Terminal Status - One Hour Wait
* Edm/King - No Extended Wait for Drivers Departing Edmonds
* Edm/King - One Hour Wait Departing Kingston and Edmonds

### Non-Standard tweet Examples:
* Edm/King - Update - Edmonds and Kingston Terminal Status, 2hrs Edm, 1hr King
* Edm/King - No Extended Wait in Kingston - One Hour Wait in Edmonds, Late Vessel
* Edm/King - Kingston 6:25am Departure is Cancelled. One Hr. Wait
* Edm/King - no longer an extended wait departing edmonds or kingston
* Edm/King - edmonds and kingston terminal status - 2 hour wait

In [6]:
# changing text to lowercase and removing the url, 
df['tweet_text'] = df['tweet_text'].str.lower().str.replace('https:.*', '')

# removing route indicator ('edm/king -'), extra whitespace
df['tweet_text'] = df['tweet_text'].str.replace('edm/king -', '').str.strip()

# removing wsp boarding pass indicataor
df['tweet_text'] = df['tweet_text'].str.replace(', no wsp boarding pass required|, wsp boarding pass required', '')

In [7]:
# Print the first ten cleaned tweets, one per line, as a quick sanity check.
first_tweets = df['tweet_text'].head(10)
for tweet in first_tweets:
    print(tweet)

kingston 7:05am and edmonds 7:55am sailings cancelled - 2/4
walla walla back in service
edmonds terminal wait time - one hour
update - no extended wait in edmonds
update - no extended wait departing edmonds
edmonds terminal wait time - one hour
update - no extended wait departing edmonds
kingston terminal wait time - one hour
update - no extended wait departing kingston
edmonds terminal wait time - one hour


In [8]:
# Keep only tweets that mention a wait. na=False treats a missing tweet text as
# "no match"; without it a NaN in the mask makes the boolean indexing raise.
df_wait = df[df['tweet_text'].str.contains('wait', na=False)]

In [9]:
# Flag the wait tweets that mention each terminal by its full name (substring
# match); a tweet that names both terminals ends up in both frames.
mentions_edmonds = df_wait['tweet_text'].str.contains('edmonds')
mentions_kingston = df_wait['tweet_text'].str.contains('kingston')

# Copy the selections so columns can be added to them later without touching df_wait.
ed_df = df_wait.loc[mentions_edmonds].copy()
ki_df = df_wait.loc[mentions_kingston].copy()

In [10]:
# Preview the first rows of the wait-related tweets.
df_wait.head()

Unnamed: 0,tweet_permalink,tweet_text,time
2,https://twitter.com/wsferries/status/962107604...,edmonds terminal wait time - one hour,2018-02-09 23:35 +0000
3,https://twitter.com/wsferries/status/962145354...,update - no extended wait in edmonds,2018-02-10 02:05 +0000
4,https://twitter.com/wsferries/status/962167999...,update - no extended wait departing edmonds,2018-02-10 03:35 +0000
5,https://twitter.com/wsferries/status/962422179...,edmonds terminal wait time - one hour,2018-02-10 20:25 +0000
6,https://twitter.com/wsferries/status/962483833...,update - no extended wait departing edmonds,2018-02-11 00:30 +0000


In [11]:
def get_hours(texts, locName, otherLocName, altNames):
    """Extracts the wait for one location from every text in a collection.

    Each text is passed to get_hour together with the location arguments, and
    the results are collected in the same order as the texts.

    Args:
      texts: an iterable of strings (e.g. a pandas Series) describing the waits
      locName: the name of the location to extract the time for
      otherLocName: the name of the location NOT getting times extracted
      altNames: a dictionary mapping each location name to a list of its short
        names (eg. {'Edmonds': ['Edm', 'Edms', 'E', 'Ed'],
                    'Kingston': ['K', 'Kgstn', 'King']})

    Returns: a list holding the get_hour result for each text
    """
    hours = []
    for text in texts:
        hours.append(get_hour(text, locName, otherLocName, altNames))
    return hours
    

In [12]:
import re

def get_hour(text, locName, otherLocName, altNames):
    """Returns the hour of wait described in the text for the location name

    A text that names only locName is parsed as a whole. A text that names both
    locations is split on '-' and ',' and each section is parsed on its own: a
    number found in a section that mentions locName (by full or alternate name)
    is preferred, and a number found in any other section is the fallback.
    Sections without a number are skipped, so they never erase a number that
    was found in another section.

    Args:
      text: a string describing the wait
      locName: the name of the location to extract the time for
      otherLocName: the name of the location NOT getting times extracted
      altNames: a dictionary of the short names for locations (eg. {'Edmonds': ['Edm', 'Edms', 'E', 'Ed'],
                                                                    'Kingston': ['K', 'Kgstn', 'King']}).
        A location without an entry is treated as having no short names.

    Returns: the wait for locName as returned by get_num (0 meaning no extended
      wait), or None when text is not a string, does not mention locName, or
      contains no recognizable wait
    """
    # missing values (None / NaN) cannot describe a wait
    if not isinstance(text, str):
        return None

    # a text that never names the location says nothing about its wait
    if locName not in text:
        return None

    # solo message: only this location is named, so any number belongs to it
    if otherLocName not in text:
        return get_num(text)

    # dual-location message: look at each '-' / ',' separated section in turn
    names = [locName] + list(altNames.get(locName, []))
    hour = None
    backup_hour = None
    for split_text in re.split('-|,', text):
        num = get_num(split_text)

        # a section without a number must not overwrite an earlier finding
        if num is None:
            continue

        # a number next to the location's name (full or abbreviated) is the answer;
        # a number in any other section is only kept as a backup
        if any(name in split_text for name in names):
            hour = num
        else:
            backup_hour = num

    # if none of the sections have the name and hour together, use the backup_hour
    return backup_hour if hour is None else hour
        

In [13]:
import re

def get_num(text):
    """Returns the wait, in hours, described in the text.

    Hours 1-4 may be given as a numeral or spelled out ('2', 'two'); '60 minute'
    counts as 1 and '90 min' as 1.5. The checks run in that order and the first
    match wins, so the text is assumed to describe a single wait.

    Numerals only count when they stand alone: a digit that touches another
    digit, ':' or '/' (clock times such as '7:15am', dates such as '2/14',
    minute counts such as '30') is ignored, while a unit glued to the digit
    ('2hrs') is fine. Number words and 'no' must be whole words, so 'phone' is
    not read as 'one' and 'now a wait' is not read as 'no ... wait'.

    Args:
      text: a lowercase string describing the wait

    Returns: 1, 2, 3 or 4 for an hour count, 1.5 for 90 minutes, 0 when the
      text says there is no (extended) wait, or None when no wait is found
    """
    # (pattern, hours) pairs, tried in order; the lookarounds reject digits
    # that are part of a longer number, a clock time or a date
    patterns = (
        (r'(?<![\d:/])1(?![\d:/])|\bone\b|60 minute', 1),
        (r'(?<![\d:/])2(?![\d:/])|\btwo\b', 2),
        (r'(?<![\d:/])3(?![\d:/])|\bthree\b', 3),
        (r'(?<![\d:/])4(?![\d:/])|\bfour\b', 4),
        (r'90 min', 1.5),
        (r'\bno\b.*wait', 0),
    )
    for pattern, hours in patterns:
        if re.search(pattern, text):
            return hours
    return None

## Investigating results - uncategorized tweets

In [14]:
# Short names for each terminal, keyed by the terminal's full lowercase name.
altNames = {
    'edmonds': ['edm', 'edms', ' ed'],
    'kingston': ['kgstn', 'king'],
}

In [15]:
# Parse the Edmonds wait out of every Edmonds tweet and store it next to the text.
edmonds_hours = get_hours(ed_df['tweet_text'], 'edmonds', 'kingston', altNames)
ed_df['hours'] = edmonds_hours

In [16]:
# Print the Edmonds tweets for which no wait could be extracted (hours is missing).
unparsed_edmonds = ed_df.loc[ed_df['hours'].isna(), 'tweet_text']
for tweet in unparsed_edmonds:
    print(tweet)

edmonds terminal wait-time sign is out of service


In [17]:
# Parse the Kingston wait out of every Kingston tweet and store it next to the text.
kingston_hours = get_hours(ki_df['tweet_text'], 'kingston', 'edmonds', altNames)
ki_df['hours'] = kingston_hours

In [18]:
# Kingston rows for which no wait could be extracted (hours is missing).
ki_df[ki_df['hours'].isna()]

Unnamed: 0,tweet_permalink,tweet_text,time,hours


In [19]:
# Print the Kingston tweets for which no wait could be extracted (hours is missing).
unparsed_kingston = ki_df.loc[ki_df['hours'].isna(), 'tweet_text']
for tweet in unparsed_kingston:
    print(tweet)

In [20]:
# Spot-check the first 20 Edmonds tweets against their extracted hours.
ed_df.head(20)

Unnamed: 0,tweet_permalink,tweet_text,time,hours
2,https://twitter.com/wsferries/status/962107604...,edmonds terminal wait time - one hour,2018-02-09 23:35 +0000,1.0
3,https://twitter.com/wsferries/status/962145354...,update - no extended wait in edmonds,2018-02-10 02:05 +0000,0.0
4,https://twitter.com/wsferries/status/962167999...,update - no extended wait departing edmonds,2018-02-10 03:35 +0000,0.0
5,https://twitter.com/wsferries/status/962422179...,edmonds terminal wait time - one hour,2018-02-10 20:25 +0000,1.0
6,https://twitter.com/wsferries/status/962483833...,update - no extended wait departing edmonds,2018-02-11 00:30 +0000,0.0
9,https://twitter.com/wsferries/status/964643063...,edmonds terminal wait time - one hour,2018-02-16 23:30 +0000,1.0
10,https://twitter.com/wsferries/status/964724851...,update - no extended wait departing edmonds,2018-02-17 04:55 +0000,0.0
11,https://twitter.com/wsferries/status/964945055...,edmonds terminal wait time - one hour,2018-02-17 19:30 +0000,1.0
15,https://twitter.com/wsferries/status/965059556...,update - no extended wait in edmonds,2018-02-18 03:05 +0000,0.0
21,https://twitter.com/wsferries/status/109147755...,edmonds terminal status - 1 hour wait,2019-02-01 23:25 +0000,1.0
