In [1]:
# Imports

# interactive widget imports
import ipywidgets as wg
from IPython.display import display

# data handling modules
import pandas as pd
import numpy as np
from datetime import datetime

# plotting imports
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
# Plotly credentials are read from the environment so that no API key is stored in
# the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the kernel;
# a missing variable raises KeyError here rather than failing later at upload time.
# NOTE(review): the key that used to be hard-coded on this line should be treated as
# leaked and regenerated.
import os
plotly.tools.set_credentials_file(username=os.environ['PLOTLY_USERNAME'],
                                  api_key=os.environ['PLOTLY_API_KEY'])

# web scraping
from bs4 import BeautifulSoup
import requests

# other helpers and suppress warnings
from helpers import *
from timeline_helpers import *
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

In [2]:
# Load data
# The tweet dump is split across nine CSV files that share one naming pattern, so
# the parts are read in a loop instead of one statement per file.
data_dir = access_folder('data')
tweet_parts = [pd.read_csv(data_dir + 'IRAhandle_tweets_{}.csv'.format(part))
               for part in range(1, 10)]

# Stack the parts row-wise. Every keyword the previous call spelled out was a pandas
# default, except `join_axes`, which was removed in pandas 1.0 (passing it raises
# TypeError there). The plain call builds the same frame on old and new versions.
# Each part keeps its own row labels, so labels repeat across parts.
tweets = pd.concat(tweet_parts)


In [3]:
# Discard every row that has a missing value in any column (modifies `tweets` itself).
tweets.dropna(axis=0, how='any', inplace=True)
# Parse the publish_date strings (month/day/year hour:minute) into timestamps.
tweets['publish_date'] = pd.to_datetime(tweets['publish_date'], format='%m/%d/%Y %H:%M')
# There are very few tweets before 2015, so only later ones are kept.
published_after_2015 = tweets['publish_date'] > '2015'
tweets_processed = tweets[published_after_2015]

In [4]:
# Scrape wikipedia table for american cities and states
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(
    'https://simple.wikipedia.org/wiki/List_of_United_States_cities_by_population',
    timeout=30)
response.raise_for_status()
website = response.text
soup = BeautifulSoup(website,'html5lib')
My_table = soup.find('table',{'class':'wikitable sortable'})
if My_table is None:
    # soup.find returns None when nothing matches; fail with a readable message.
    raise ValueError("No 'wikitable sortable' table found on the scraped page")
links = My_table.find_all('a')

# The `title` attribute of every link in the table (None when a link has no title).
places = [link.get('title') for link in links]

# Drop None values. An explicit identity test replaces filter(None.__ne__, ...),
# which only worked because NotImplemented is truthy; that truthiness has been
# deprecated since Python 3.9.
city_clean = [title for title in places if title is not None]

# The remaining titles are assumed to alternate city, state, city, state, ...
# NOTE(review): this depends on the page layout; confirm it still holds.
cities = [title.split(',')[0] for title in city_clean[0::2]]  # Retrieve only city name, not state
states = city_clean[1::2]  # state names

In [5]:
# Keep only the rows whose `language` column equals 'English'
is_english = tweets_processed['language'] == 'English'
tweets_english = tweets_processed[is_english]

In [6]:
# Count, for every scraped city name and every scraped state name, the English
# tweets whose text contains it. str.contains interprets each name as a regular
# expression by default, so characters such as '.' or '(' in a name act as
# pattern syntax rather than literal text.
tweets_city = [tweets_english.content.str.contains(city_name).sum() for city_name in cities]
tweets_state = [tweets_english.content.str.contains(state_name).sum() for state_name in states]
 

In [7]:
# Pair each scraped name with its mention count: one table for cities, one for states.
city_counts = pd.DataFrame(dict(city=cities, city_counts=tweets_city))
state_counts = pd.DataFrame(dict(state=states, state_counts=tweets_state))

In [8]:
# Place the city table and the state table side by side. Rows are matched on their
# index, so each city row is joined with the state at the same position.
cities_states = pd.concat([city_counts, state_counts], axis='columns')
df_citycounts_state = cities_states[['city', 'city_counts', 'state']]

# Total city mentions per state, then turn the state labels back into a column.
grouped_state = df_citycounts_state.groupby('state')['city_counts'].sum()
ordered_state_city = grouped_state.reset_index()

### Mapping by State Name Occurrence

In [9]:
# Distinct (state, count) rows sorted by state name and relabelled 0..n-1.
ordered_state = (
    state_counts
    .drop_duplicates()
    .sort_values(by='state')
    .reset_index(drop=True)
)

# Two-letter codes, attached by row label: code k goes to the row labelled k.
# The list has 47 entries and contains 'WA' twice.
# NOTE(review): this mapping is purely positional and assumes the sorted scraped
# state names line up with it; verify against the actual contents of ordered_state.
state_codes = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL',
    'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'MD', 'MA',
    'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
    'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
    'UT', 'VA', 'WA', 'WA', 'WI',
]
ordered_state['code'] = pd.Series(state_codes)

# Remove the row labelled 44 and reuse that label for a hand-entered Wyoming row
# with a count of zero.
ordered_state.drop(index=44, inplace=True)
ordered_state.loc[44] = ['Wyoming', 0, 'WY']

In [10]:
# Choropleth of how often each state's name appears in the tweets, coloured on a
# natural-log scale.
# A count of zero has no finite logarithm (the Wyoming row is entered with 0), so
# non-positive counts are masked to NaN before the log. This avoids producing -inf
# and a divide-by-zero RuntimeWarning; masked states simply carry no colour value.
state_mentions = ordered_state['state_counts'].astype(float)
log_state_mentions = np.log(state_mentions.where(state_mentions > 0))

data = [ dict(
        type='choropleth',
        colorscale = 'Bluered',
        autocolorscale = False,
        locations = ordered_state['code'],
        z = log_state_mentions,
        locationmode = 'USA-states',
        text = ordered_state['state'],
        # white outlines between states
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        colorbar = dict(
            title = "Number of Tweets (Log-value)")
        ) ]

layout = dict(
        title = 'States Mentioned in Tweets',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
             )

fig = dict( data=data, layout=layout )
py.iplot( fig, filename='d3-cloropleth-map' )

### Mapping by City Name Occurrence by State

In [None]:
# Two-letter codes for the per-state city totals, attached by row label: code k
# goes to the row labelled k. The list has 47 entries and contains 'WA' twice.
# NOTE(review): the mapping is purely positional; verify it against the actual
# state names in ordered_state_city.
city_state_codes = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL',
    'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'MD', 'MA',
    'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
    'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
    'UT', 'VA', 'WA', 'WA', 'WI',
]
ordered_state_city['code'] = pd.Series(city_state_codes)

# Remove the row labelled 44 and reuse that label for a hand-entered Wyoming row
# with a count of zero.
ordered_state_city.drop(44, inplace=True)
ordered_state_city.loc[44] = ['Wyoming', 0, 'WY']

In [None]:
# Choropleth of city-name mentions summed per state, coloured on a natural-log scale.
# A count of zero has no finite logarithm (the Wyoming row is entered with 0), so
# non-positive counts are masked to NaN before the log. This avoids producing -inf
# and a divide-by-zero RuntimeWarning; masked states simply carry no colour value.
city_mentions = ordered_state_city['city_counts'].astype(float)
log_city_mentions = np.log(city_mentions.where(city_mentions > 0))

data = [ dict(
        type='choropleth',
        colorscale = 'Bluered',
        autocolorscale = False,
        locations = ordered_state_city['code'],
        z = log_city_mentions,
        locationmode = 'USA-states',
        text = ordered_state_city['state'],
        # white outlines between states
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        # z holds log counts, so the legend says so (same wording as the state map).
        colorbar = dict(
            title = "Number of Tweets (Log-value)")
        ) ]

layout = dict(
        title = 'Cities Mentioned in Tweets by State',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
             )

fig = dict( data=data, layout=layout )
# Use a filename of its own: this cell previously reused 'd3-cloropleth-map', the
# name the state-name map is published under, so the two plots shared one target.
py.iplot( fig, filename='d3-cloropleth-map-cities' )

### Timeline analysis of state names in tweets

In [11]:
# State names as an array, ordered from the highest state_counts to the lowest.
states_by_mentions = ordered_state.sort_values(by='state_counts', ascending=False)
states_plot = states_by_mentions['state'].values

In [12]:
# One extract_states(...) result per state, in the same order as states_plot.
# NOTE(review): extract_states comes from the helper modules; the plotting cell
# uses its result as a date-indexed series — confirm against the helper.
plot_states = [extract_states(state_name, tweets_english) for state_name in states_plot]

In [13]:
# One line trace for each of the first ten states in states_plot.
data = []
for state_name, state_series in zip(states_plot[:10], plot_states):
    # Format the index timestamps, then keep the leading 'YYYY-MM' for the x axis.
    stamps = state_series.index.strftime('%Y-%m-%d %H-%M-%S')
    month_labels = [stamp[0:7] for stamp in stamps]
    data.append(
        go.Scatter(
            x=month_labels,
            y=state_series.values,
            mode='lines',
            name=state_name,
            groupnorm='percent',
        )
    )

layout = do_layout('Date', 'Number of Tweets', 'State Mentions over Time')
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='line-mode')

### Query by User Input

In [14]:
# Querying for string defined by user

# Text box pre-filled with 'Query string'. The following cell reads the widget's
# value with get_interact_value(), so edit the box before running that cell.
query = wg.Text(value='Query string')
# Render the widget as this cell's output.
display(query)

Text(value='Query string')

In [16]:
# Plot mentions over time for whatever text is currently in the `query` widget.
search_term = query.get_interact_value()
query_df = extract_states(search_term, tweets_english)

# Format the index timestamps, then keep the leading 'YYYY-MM' for the x axis.
stamps = query_df.index.strftime('%Y-%m-%d %H-%M-%S')
xlabels = [stamp[0:7] for stamp in stamps]

trace = go.Scatter(
    x=xlabels,
    y=query_df.values,
    mode='lines',
    name=search_term,
    groupnorm='percent',
)
data = [trace]
layout = do_layout('Date', 'Number of Tweets',
                   '{} Mentions over Time'.format(search_term))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='query')