In [2]:
import csv
import datetime

import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import thirdparty.tsearch.TwitterScraper as ts

In [7]:
# Search terms per candidate; they are OR-ed together to form the Twitter query.
clinton_search = ['Hillary', 'Clinton', 'HillaryClinton']
bernie_search = ['Bernie', 'Sanders', 'BernieSanders']
rubio_search = ['Marco', 'Rubio', 'marcorubio']
cruz_search = ['Ted', 'Cruz', 'tedcruz']
trump_search = ['Donald', 'Trump', 'DonaldTrump']

# Keyword lists: the search terms followed by candidate-specific slogans/hashtags.
# (Not referenced by the other cells visible in this notebook.)
clinton_keywords = clinton_search + ['"I\'m with her"', '"Stronger Together"', 'imwithher']
bernie_keywords = bernie_search + ['Feel the Bern', 'Bern', 'feelthebern']
rubio_keywords = rubio_search + ['"A New American Century"']
cruz_keywords = cruz_search + ['CruzCrew', '"Together, we will win"']
trump_keywords = trump_search + ['NeverTrump', '"Make America Great Again"', 'maga']

# Party -> list of (candidate name, search terms) pairs.
primaries = {
    'Democrat': [
        ('Clinton', clinton_search),
        ('Sanders', bernie_search),
    ],
    'Republican': [
        ('Rubio', rubio_search),
        ('Cruz', cruz_search),
        ('Trump', trump_search),
    ],
}

In [8]:
# Generate all the time-based and geo-based components from the csv.
# Result: state_dict[state][party] = (latitude, longitude, radius, after, until),
# with every field kept as the raw string read from the file.
state_dict = dict()

# newline='' is the mode the csv module asks for, so that quoted fields
# containing line breaks are parsed correctly.
with open('data/raw_calc_lat_lon.csv', 'r', newline='') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
        # Skip blank lines and the header row (recognised by its 'latitude' label).
        if not row or row[1] == 'latitude':
            continue

        # Column 4 (the date) is read for position only; it is not stored.
        state, latitude, longitude, radius, _date, party, after, until = row[:8]
        state_dict.setdefault(state, dict())[party] = (latitude, longitude, radius, after, until)
            

In [11]:
# Get all the tweets for a person.
# Result: tweets[state][primary][candidate name] = tweets collected by the scraper
# for that candidate's search terms, restricted to the state's geocode and date window.
qs_list = []
tweets = dict()
for state in state_dict.keys():
    tweets[state] = dict()

    for primary in primaries.keys():
        tweets[state][primary] = dict()

        # A state may have no row for this party in the csv; skip it rather than
        # aborting the whole (long-running) download with a KeyError.
        if primary not in state_dict[state]:
            print('%s %s: no geo/date data, skipping.' % (state, primary))
            continue

        # Geocode and date window depend only on (state, primary), so build them
        # once here instead of once per candidate.
        latitude, longitude, radius, since, until = state_dict[state][primary]
        geo = ' geocode:%.6f,%.6f,%dkm' % (float(latitude), float(longitude), float(radius))
        since = datetime.datetime.strptime(since, '%m/%d/%y')
        until = datetime.datetime.strptime(until, '%m/%d/%y')

        for candidate_name, search_terms in primaries[primary]:
            query_string = " OR ".join(search_terms) + geo

            # NOTE(review): the positional 0, 5 and trailing 5 are TwitterSlicer
            # settings whose meaning is not visible in this notebook -- confirm
            # against thirdparty.tsearch.TwitterScraper.
            twitSlice = ts.TwitterSlicer(0, 5, since, until, 5)
            twitSlice.search(query_string)

            tweets[state][primary][candidate_name] = twitSlice.tweets
            print('%s %s %s: Downloaded %d tweets.' % (state, primary, candidate_name, twitSlice.counter))
            

Alabama Democrat Clinton: Downloaded 20 tweets.
Alabama Democrat Sanders: Downloaded 5 tweets.
Alabama Republican Rubio: Downloaded 136 tweets.
Alabama Republican Cruz: Downloaded 52 tweets.
Alabama Republican Trump: Downloaded 176 tweets.
Alaska Democrat Clinton: Downloaded 0 tweets.
Alaska Democrat Sanders: Downloaded 4 tweets.
Alaska Republican Rubio: Downloaded 2 tweets.
Alaska Republican Cruz: Downloaded 9 tweets.
Alaska Republican Trump: Downloaded 0 tweets.
Arizona Democrat Clinton: Downloaded 43 tweets.
Arizona Democrat Sanders: Downloaded 91 tweets.
Arizona Republican Rubio: Downloaded 4 tweets.
Arizona Republican Cruz: Downloaded 35 tweets.
Arizona Republican Trump: Downloaded 161 tweets.
Arkansas Democrat Clinton: Downloaded 16 tweets.
Arkansas Democrat Sanders: Downloaded 2 tweets.
Arkansas Republican Rubio: Downloaded 15 tweets.
Arkansas Republican Cruz: Downloaded 10 tweets.
Arkansas Republican Trump: Downloaded 8 tweets.
California Democrat Clinton: Downloaded 59 tweets.

In [36]:
# Print one line per state: the number of tweets collected across every
# party and candidate, followed by a comma.
for state in tweets.keys():
    total = sum(
        len(candidate_tweets)
        for party_tweets in tweets[state].values()
        for candidate_tweets in party_tweets.values()
    )
    print(str(total) + ',')

389,
15,
334,
51,
739,
329,
86,
8,
0,
453,
339,
31,
47,
73,
302,
206,
71,
91,
22,
8,
298,
39,
9,
8,
30,
90,
10,
2,
5,
38,
122,
95,
96,
322,
0,
310,
140,
8,
62,
42,
123,
2,
164,
736,
65,
9,
54,
13,
22,
97,
2,


In [19]:
# Shared VADER analyzer; its polarity_scores() is used below to score each tweet's text.
# NOTE(review): NLTK needs its 'vader_lexicon' resource available locally for this
# constructor to succeed -- confirm it has been downloaded in this environment.
sid = SentimentIntensityAnalyzer()

In [30]:
# Empty frame that declares only the three identifying columns.
ID_COLUMNS = ['State', 'Primary', 'Candidate']
df = pd.DataFrame(columns=ID_COLUMNS)

In [31]:
# Score every tweet with VADER and build one row per (state, primary, candidate).
# Column 't<k>' holds the compound polarity score of that candidate's k-th tweet;
# candidates with fewer tweets than the widest row are left as NaN in the extra columns.
#
# Rows are collected as plain dicts and the frame is built once at the end:
# DataFrame.set_value was removed in pandas 1.0, and growing a frame one cell
# at a time is quadratic.
sentiment_rows = []
for state in tweets:
    for party in tweets[state]:
        for candidate, candidate_tweets in tweets[state][party].items():
            row = {'State': state, 'Primary': party, 'Candidate': candidate}
            for k, tweet in enumerate(candidate_tweets):
                row['t' + str(k)] = sid.polarity_scores(str(tweet['text']))['compound']
            sentiment_rows.append(row)

# Fix the column order explicitly: the three id columns, then t0..t<max-1>.
# Each row carries 3 id keys, so len(row) - 3 is its number of scored tweets.
max_tweets = max((len(row) - 3 for row in sentiment_rows), default=0)
score_columns = ['t' + str(k) for k in range(max_tweets)]
df = pd.DataFrame(sentiment_rows, columns=['State', 'Primary', 'Candidate'] + score_columns)

In [39]:
# Preview the first ten rows of the score table.
df.head(10)

Unnamed: 0,State,Primary,Candidate,t0,t1,t2,t3,t4,t5,t6,...,t301,t302,t303,t304,t305,t306,t307,t308,t309,t310
0,Alabama,Democrat,Clinton,0.0,0.4995,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,Alabama,Democrat,Sanders,-0.1779,-0.1779,0.0,0.6696,0.0,,,...,,,,,,,,,,
2,Alabama,Republican,Rubio,0.6705,0.5106,0.0,-0.2481,-0.0516,0.7615,0.0,...,,,,,,,,,,
3,Alabama,Republican,Cruz,-0.6705,0.0,0.5719,0.872,0.0,-0.3382,0.4389,...,,,,,,,,,,
4,Alabama,Republican,Trump,-0.2023,0.3976,0.0772,0.4215,-0.5469,0.5916,-0.2212,...,,,,,,,,,,
5,Alaska,Democrat,Clinton,,,,,,,,...,,,,,,,,,,
6,Alaska,Democrat,Sanders,0.8221,0.8221,0.0,0.0,,,,...,,,,,,,,,,
7,Alaska,Republican,Rubio,0.0,0.0,,,,,,...,,,,,,,,,,
8,Alaska,Republican,Cruz,0.4404,0.4404,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
9,Alaska,Republican,Trump,,,,,,,,...,,,,,,,,,,


In [38]:
# Persist the sentiment score table as CSV. With pandas' default settings the
# row index is written as the first column.
# NOTE(review): assumes the data/twitter/ directory already exists -- confirm.
df.to_csv('data/twitter/twitter_sentiment_scores.csv')