In [1]:
# interactive widget imports
import ipywidgets as wg
from IPython.display import display


# data handling modules
import pandas as pd
import numpy as np
from datetime import datetime

# plotting imports
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

plotly.tools.set_credentials_file(username='Flavioh', api_key='GogTSHQAuhgi5p724TsF')
plt.style.use('seaborn')

# web scraping
from bs4 import BeautifulSoup
import requests

# other helpers and suppress warnings
from helpers import *
from timeline_helpers import *
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

In [None]:
# Load data

df = pd.read_csv('../generated/FINAL_DF_WITH_TOPICS.csv', index_col=0)
df['publish_date'] = pd.to_datetime(df['publish_date'])
df = df[(df.publish_date > '2016-09-01') & (df.publish_date < '2016-11-10')]

In [None]:
df.head()

In [None]:
data_filtered = df[['publish_date', 'followers', 'following', 'author', 'like_count', 'topic']]

In [None]:
data_filtered.topic.value_counts()

In [None]:
import matplotlib.pyplot as plt

import numpy as np
%matplotlib inline

def count_topic(topic, df):
    """The function takes in a language as a DEFINE_string
    and goes through a dataframe that has one column named
    topic and another one that has publish_date. It then
    groups the tweets by month and returns the sum of the
    tweets during the time period"""

    filt = df[df.topic == topic].copy()
    filt['topic_num'] = filt.topic.map({topic:1})
    return filt.groupby(pd.Grouper(key='publish_date', freq='1D')).sum()

def trace_generator_topic(topic_df):
    """This function generates the data that will be used
    as input for the iplot function. It prepares the labels
    from the index"""
    data = []
    for topic in topic_df.topic.value_counts().index.values:
        filtered=count_topic(topic, topic_df)
        strd = pd.Series(filtered.index.strftime('%Y-%m-%d %H-%M-%S'))
        xlabels = list(strd.apply(lambda x: x[0:10]))
        trace = go.Scatter(x=xlabels,
                            y=filtered.topic_num.values,
                            fill='tozeroy',
                            mode= 'none',
                            name=topic)
        data.append(trace)

    return data



In [None]:
data = trace_generator_topic(data_filtered)
layout = do_layout('Date', 'Number of Tweets', 'Topic as a Function of Time')
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='x')

# Examination of Top Authors

In [None]:
# Extract top 10 posting authors
top_authors = data_filtered.author.value_counts()[0:10]

# Filter data frame for them
###### testing with screamymonkey

topauthors = data_filtered[data_filtered.author.isin(top_authors.index.values)]
test = topauthors.copy()
# test = data_filtered[data_filtered.author == 'WORLDOFHASHTAGS']


In [None]:
test.head()

In [None]:
top_authors

In [None]:

data1 = trace_generator_topic(test)
layout = do_layout('Date', 'Number of Tweets', 'Topic for Top 10 Authors Over Time')
fig = go.Figure(data=data1, layout=layout)
py.iplot(fig, filename='y')

In [None]:
# Check out follower counts for top 10 hashtaggers

group_followers = test.groupby([pd.Grouper(key='publish_date', freq='1D'), 'author'])['followers'].max().groupby(level=0).sum()

######

test['tweet_counts'] = np.ones(test.shape[0])

group_likes =test.groupby([pd.Grouper(key='publish_date', freq='1D'),
                                 'author'])['like_count'].max().groupby(level=0).sum()

group_following =test.groupby([pd.Grouper(key='publish_date', freq='1D'),
                                 'author'])['following'].max().groupby(level=0).sum()
group_authors_time = pd.DataFrame({'followers': group_followers.values,
                                   'following': group_following.values},
                                  index=group_followers.index.values)

 
# group by month
general_timeline = test.groupby(pd.Grouper(key='publish_date', freq='1D')).sum()
# construct labels
xlabels = list(pd.Series(group_authors_time.index.strftime(
    '%Y-%m-%d %H-%M-%S')).apply(lambda x: x[0:10]))
data = []

for col in group_authors_time:
    filtered = group_authors_time[col].copy()
    trace = go.Scatter(x=xlabels, y=filtered.values, name=col, 
                       fill='tozeroy', mode='lines')
    data.append(trace)

filtered = general_timeline['tweet_counts'].copy()
trace=go.Scatter(x=xlabels, y=filtered.values, marker = {'color' : '#00AA00'}, name='Tweet Counts', fill='tozeroy', mode='lines')
data.append(trace)
trace = go.Scatter(x=xlabels, y=group_likes.values, marker = {'color' : '#FF2200'}, name='Tweet Likes', fill='tozeroy', mode='lines')
data.append(trace)



# Create plots for following, followers, updates, tweet_counts

fig = tools.make_subplots(rows=3, cols=1)

fig.append_trace(data[0], 1, 1)
fig.append_trace(data[1], 1, 1)
fig.append_trace(data[2], 2, 1)
fig.append_trace(data[3], 3, 1)

fig['layout'].update(height=600, width=800, title='General Trends Across Time')
py.iplot(fig, filename='y')


In [None]:
# Calculate correlation values

# Creating dataframe of entries for correlation statistics
corr_df = pd.DataFrame({'Followers': group_followers.values, 'Likes': group_likes.values,
                       'Following': group_following.values})
corr_df.index = xlabels                        

topics = data_filtered.topic.value_counts().index.values
for topic in topics:
    x = count_topic(topic, topauthors)
    corr_df = corr_df.join(x[['topic_num']], how='outer', rsuffix=topic)

corr_df.columns = ['Followers', 'Likes', 'Following', *topics]

corr_df.fillna(0, inplace=True)


In [None]:
print(corr_df.iloc[0:200].Sports.corr(corr_df.iloc[0:200].Followers))
print(corr_df.iloc[0:200]['w'].corr(corr_df.iloc[0:200].Followers))
print(corr_df.iloc[0:200]['Entertainment'].corr(corr_df.iloc[0:200].Followers))

In [None]:
print(corr_df.iloc[0:200].corr().iloc[:,0])
print(corr_df.iloc[-100:].corr().iloc[:,0])

# Swing State Stuff

In [2]:
import requests
from bs4 import BeautifulSoup
import lxml

# Scrape wikipedia table for american cities and states
website = requests.get('https://en.wikipedia.org/wiki/List_of_largest_cities_of_U.S._states_and_territories_by_population').text
soup = BeautifulSoup(website,'html.parser')

My_table = soup.find('table',{'class':'wikitable sortable'})

links = My_table.find_all('a')

places = []
for link in links:
    places.append(link.get('title'))
    
city_clean = list(filter(None.__ne__, places)) # Drop None values
cities = {}


for entry in city_clean:
    split=entry.split(', ')
    if len(split) == 1:
        cities[split[0]] = []
    else:
        if split[1] in cities.keys():
            cities[split[1]].append(split[0]) 
state_n_city = []
for state in cities.keys():
    state_list = []
    state_cities = cities[state]
    for city in state_cities:
        state_list.append(city + '|' + city.lower())
    state_list.append(state + '|' + state.lower())
    state_list = '|'.join(state_list)
    state_n_city.append([state, state_list])
    
state_n_city_dict = {}
for state in state_n_city:
    state_n_city_dict[state[0]] = state[1]

In [69]:
state_n_city_dict

{'Alabama': 'Birmingham|birmingham|Montgomery|montgomery|Huntsville|huntsville|Mobile|mobile|Tuscaloosa|tuscaloosa|Alabama|alabama',
 'Alaska': 'Anchorage|anchorage|Fairbanks|fairbanks|Juneau|juneau|Sitka|sitka|Ketchikan|ketchikan|Alaska|alaska',
 'American Samoa': "Tafuna|tafuna|Nu'uuli|nu'uuli|Pava'ia'i|pava'ia'i|American Samoa|american samoa",
 'Pago Pago': 'Pago Pago|pago pago',
 "'Ili'ili": "'Ili'ili|'ili'ili",
 'Arizona': 'Phoenix|phoenix|Tucson|tucson|Mesa|mesa|Chandler|chandler|Glendale|glendale|Arizona|arizona',
 'Arkansas': 'Little Rock|little rock|Fort Smith|fort smith|Fayetteville|fayetteville|Springdale|springdale|Jonesboro|jonesboro|Arkansas|arkansas',
 'California': 'San Jose|san jose|Fresno|fresno|Sacramento|sacramento|California|california',
 'Los Angeles': 'Los Angeles|los angeles',
 'San Diego': 'San Diego|san diego',
 'San Francisco': 'San Francisco|san francisco',
 'Colorado': 'Colorado Springs|colorado springs|Aurora|aurora|Fort Collins|fort collins|Lakewood|lakew

In [54]:
# Load old data set

election_month = pd.read_csv('../generated/onemonth_dataset.csv', index_col=0)
election_month['publish_date'] = pd.to_datetime(election_month['publish_date'])
election_month = election_month[election_month.language == 'English']


In [55]:
election_month = election_month[['author', 'content', 'publish_date', 'followers', 'following', 'topic']]

In [57]:
import numpy as np
election_month['value'] = np.ones(election_month.shape[0])

In [61]:
swing_states = ['Colorado',
               'Florida',
               'Iowa',
               'Michigan',
               'Nevada',
               'New Hampshire',
               'North Carolina',
               'Ohio',
               'Pennsylvania',
               'Virginia',
               'Wisconsin',
               'Texas']

In [68]:
# for each swing state, calculate the proportions of the different topic categories
for state in swing_states:
    filtered = election_month[election_month.content.str.contains(state_n_city_dict[state])].copy()
    print(state)
    print(filtered.groupby('topic')['value'].sum()/filtered.groupby('topic')['value'].sum().sum() * 100)

Colorado
topic
Anti-Islam            1.574803
Anti-Trump            2.362205
Black Support        21.259843
Crime                 5.511811
Entertainment         4.724409
Foreign Countries     0.787402
Health               11.811024
Sports               13.385827
Trump Support        38.582677
Name: value, dtype: float64
Florida
topic
Anti-Islam            1.831129
Anti-Trump            4.577823
Black Support        13.530010
Crime                 1.322482
Entertainment         8.443540
Foreign Countries     1.525941
Health                4.476094
Patriot               1.322482
Sports               22.278739
Trump Support        40.691760
Name: value, dtype: float64
Iowa
topic
Anti-Islam        1.550388
Anti-Trump        3.875969
Black Support    25.581395
Crime             3.875969
Entertainment     1.550388
Health            2.325581
Patriot           1.550388
Sports           10.852713
Trump Support    48.837209
Name: value, dtype: float64
Michigan
topic
Anti-Islam            0.34246

In [36]:
florida.shape

(983, 15)

In [43]:
x = pd.read_csv('../data/final_merged_tweets.csv', index_col=0)
x['publish_date'] = pd.to_datetime(x['publish_date'])
x = x[(x.publish_date > '2016-10-01') & (x.publish_date < '2016-11-10')]


Columns (6,8,12,23,24) have mixed types. Specify dtype option on import or set low_memory=False.


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [44]:
x.shape

(33285, 27)