In [1]:
import pandas as pd
import twint
import glob
import math
import json
import os
import re
import numpy as np

from tldextract import extract

import matplotlib.pyplot as plt

import collections
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import neighbors, datasets
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor

import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer 

In [2]:
## Define global constants

REPUBLICAN = 1
DEMOCRAT = 0

In [3]:
## Define global instances of SKL/NLTK classes

ps = PorterStemmer()

## 1. Explore Congressional Data

In [4]:
congressmen = pd.read_csv("Congressmen.csv")
congressmen['Party'].value_counts()

# Democratic = 0
# Republican = 1

0    262
1    244
Name: Party, dtype: int64

# Week 1: Congressional Tweets

In [5]:
bash_script = ""

startIndex = 235
for index, row in congressmen.iterrows():
    
    # We may need to resume downloading in the middle
    # So, we're going to allow the process to resume
    # from an arbitrary index in the dataframe
    if index < startIndex:
        continue
        
    # We need to print this information so that if the
    # process hangs, we know what index we disconnected at
    limit = 300
    name = row['Name']
    twitter = row['username']
    
    # We can't efficiently run twint in python, so we 
    # generate a shell script to download the tweets instead
    bash_script += "twint -u " + twitter + " -o tweets_politicians/" + twitter + ".csv --csv --limit " + str(limit) + " && \n"
    
bash_script += "echo Done!"
    
file = open("download_tweets.sh", "w")
file.write(bash_script)
file.close()

#### A. Combine all user extracts

In [6]:
tweet_files = glob.glob("tweets_politicians/*.csv")

In [7]:
try:
    os.remove("tweets.csv")
except OSError:
    pass

In [8]:
output_database = open("tweets.csv","a")

# first file:
for line in open(tweet_files[0]):
    output_database.write(line)
# now the rest:    
for num in range(1,len(tweet_files)):
    
    f = open(tweet_files[num], 'r') 
    for line in f.readlines()[1:]:
        output_database.write(line)
    f.close() # not really needed
    
output_database.close()

#### B. Define Transformation Functions

In [9]:
def tidy_possible_url(string):
    if "pic.twitter.com" in string:
        return ""
    isURL = string.startswith("http://") or string.startswith("https://")
    if isURL:
        tsd, td, tsu = extract(string) # gives you: www, hostname, com
        return td + "." + tsu
    else:
        return string

In [10]:
def transform_tweet(content):
    content = content.lower()    
    content = ' '.join([word for word in content.split() if word not in stopwords.words('english')])    
    content = ' '.join([tidy_possible_url(word) for word in content.split()])
    content = re.sub(r'[^0-9a-zA-Z@ ]+', '', content)
    content = content.replace('#', '')
    content = ' '.join([ps.stem(word) for word in content.split()])
    return content

In [11]:
# "Playground" example to verify that the transform_tweet function is working as expected
print(transform_tweet("I just read pic.twitter.com/saowyr2gvo \n#HELLO$ $'$^%$*(&^)'"))

read hello


In [12]:
def most_interesting_ngrams(n, count):
    
    corpus   = " ".join(tweets['tweet'])
    grams    = ngrams(corpus.split(), n)
    counts   = collections.Counter(grams)
    
    indices = list(set(counts.keys()))
    
    r_usages = dict.fromkeys(indices, 0)
    d_usages = dict.fromkeys(indices, 0)
    
    r_users = dict.fromkeys(indices, 0)
    d_users = dict.fromkeys(indices, 0)
    
    for index, item in congressmen.iterrows():
        
        currentPersonTweets = tweets.loc[tweets['username'] == item['username']]        
        currentPersonCorpus = " ".join(currentPersonTweets['tweet'])
        currentPersonGrams   = ngrams(currentPersonCorpus.split(), n)
        currentPersonGramCounts = collections.Counter(currentPersonGrams)
        for cpg in currentPersonGramCounts.keys():
            if item['Party'] == 0:
                d_usages[cpg] += currentPersonGramCounts[cpg]
                d_users[cpg] += 1
            elif item['Party'] == 1:
                r_usages[cpg] += currentPersonGramCounts[cpg]
                r_users[cpg] += 1

    
    importance = dict.fromkeys(counts.keys(), 0)
    
    for i in indices:
        importance[i] = abs(r_users[i] - d_users[i])**2 * (abs(r_usages[i] - d_usages[i]))**(1.0/1.5)
        
        
    sorted_importance = sorted(importance.items() , reverse=True, key=lambda x: x[1])
    sorted_importance = sorted_importance[0:count]
    
    return [x[0] for x in sorted_importance]

In [13]:
def generate_vocabulary_map(array_of_ngram_arrays):
    vocab = {}    
    index = 0
    for arr in array_of_ngram_arrays:
        for ng in arr:
            vocab[' '.join(ng)] = index
            index += 1
    return vocab

In [14]:
def transform_dataset(df):
    corpus = df['tweet'].values
    vocabulary = generate_vocabulary_map([
        most_interesting_ngrams(1, 500),
        most_interesting_ngrams(2, 1000),
        most_interesting_ngrams(3, 2500),
    ])
    
    print(vocabulary)
            
    vectorizer = CountVectorizer(vocabulary=vocabulary, ngram_range=(1,3))
    
    X = vectorizer.fit_transform(corpus)
    X = X.toarray()
    Y = df['Party'].values
    
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    return X,Y

#### C. Combine with biographical information

In [15]:
tweets = pd.read_csv("tweets.csv")
tweets = tweets.merge(congressmen[['username', 'State', 'Party' ]], on=['username'])

In [16]:
tweets['tweet'] = tweets['tweet'].apply(lambda t: transform_tweet(t))

In [17]:
tweets['tweet']

0       bolster stock ppe southern indiana john riley ...
1       team reach local hospit check in mani work kee...
2       listen in join @chamberbloom discuss caresact ...
3       visit websit housegov find inform everyth appl...
4       hoosier work togeth fight coronaviru numer res...
                              ...                        
6135    radianc technolog inc ceo william c bailey wil...
6136    soldier sailor airmen marin willingli place li...
6137    met @canopychildren ceo dr john d damon hear g...
6138    david josh ashley oper fastgrow small busi jac...
6139    great visit @15thsma listen idea improv qualit...
Name: tweet, Length: 6140, dtype: object

#### D. Remove unnecessary columns

In [18]:
tweets = tweets[['date','tweet', 'username', 'Party', 'hashtags', 'mentions']]

In [19]:
tweets['tweet']

0       bolster stock ppe southern indiana john riley ...
1       team reach local hospit check in mani work kee...
2       listen in join @chamberbloom discuss caresact ...
3       visit websit housegov find inform everyth appl...
4       hoosier work togeth fight coronaviru numer res...
                              ...                        
6135    radianc technolog inc ceo william c bailey wil...
6136    soldier sailor airmen marin willingli place li...
6137    met @canopychildren ceo dr john d damon hear g...
6138    david josh ashley oper fastgrow small busi jac...
6139    great visit @15thsma listen idea improv qualit...
Name: tweet, Length: 6140, dtype: object

#### E. Normalize the number of Republican and Democrats

In [20]:
r_tweets = tweets.loc[tweets['Party'] == REPUBLICAN]
d_tweets = tweets.loc[tweets['Party'] == DEMOCRAT]

In [21]:
r_tweet_count = len(r_tweets)
d_tweet_count = len(d_tweets)

difference = abs(r_tweet_count - d_tweet_count)

print("Started with " + str(r_tweet_count) + " Republican, " + str(d_tweet_count) + " Democrat")

if r_tweet_count > d_tweet_count:
    to_remove = np.random.choice(tweets[tweets['Party']==REPUBLICAN].index,size=difference,replace=False)
    tweets = tweets.drop(to_remove)
elif d_tweet_count > r_tweet_count:
    to_remove = np.random.choice(tweets[tweets['Party']==DEMOCRAT].index,size=difference,replace=False)
    tweets = tweets.drop(to_remove)

Started with 3140 Republican, 3000 Democrat


In [22]:
r_tweets = tweets.loc[tweets['Party'] == REPUBLICAN]
d_tweets = tweets.loc[tweets['Party'] == DEMOCRAT]

r_tweet_count = len(r_tweets)
d_tweet_count = len(d_tweets)

print("Classes normalized" if (r_tweet_count - d_tweet_count) == 0 else "Classes NOT normalized")

Classes normalized


#### F. Split into training and testing

In [23]:
X, Y = transform_dataset(tweets)

{'@potu': 0, 'west': 1, 'discrimin': 2, 'town': 3, 'god': 4, 'censu': 5, 'librari': 6, 'terrorist': 7, 'count': 8, 'pelosi': 9, 'forthepeopl': 10, 'supplement': 11, '@usda': 12, 'partisan': 13, 'condit': 14, 'enrol': 15, 'said': 16, 'comprehens': 17, 'mental': 18, 'democrat': 19, 'unpreced': 20, 'condemn': 21, 'pm': 22, 'facebookcom': 23, 'gun': 24, 'foxnewscom': 25, 'mobil': 26, 'medicaid': 27, 'legal': 28, 'montgomeri': 29, 'manufactur': 30, 'remov': 31, 'telephon': 32, 'confid': 33, 'paid': 34, 'chanc': 35, 'climat': 36, 'usmcanow': 37, 'decid': 38, 'cnbccom': 39, '2020censu': 40, 'regim': 41, 'appropri': 42, 'central': 43, 'capabl': 44, 'facebook': 45, 'ad': 46, '2020censusgov': 47, 'result': 48, 'progrowth': 49, 'key': 50, 'border': 51, 'gender': 52, 'equalpayday': 53, 'size': 54, 'jobsreport': 55, 'aliv': 56, 'recommit': 57, 'soldier': 58, 'unborn': 59, 'phone': 60, 'daca': 61, 'lender': 62, 'instead': 63, 'rancher': 64, 'far': 65, 'kick': 66, 'custom': 67, 'pray': 68, 'connect':

In [24]:
XTrain, XTest, YTrain, YTest = train_test_split(
    X, Y, test_size=0.33
)

#### G. Try many classifier types

In [25]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(
    solver='adam',
    activation='relu',
    alpha=1e-5,
    hidden_layer_sizes=(2000,), 
    max_iter=200
)
nn.fit(XTrain, YTrain)
print(nn.score(XTest, YTest))

0.8146464646464646


In [26]:
# Train
clf = DecisionTreeClassifier().fit(XTrain, YTrain)

YTestPred = clf.predict(XTest)
confusion = confusion_matrix(YTest, YTestPred)

scoreAgainstTraningSet = clf.score(XTrain, YTrain)
scoreAgainstTestingSet = clf.score(XTest,YTest)
print("Model Score (against training): " + str(scoreAgainstTraningSet) );
print("Model Score (against testing): " + str(scoreAgainstTestingSet) );

Model Score (against training): 0.9559701492537314
Model Score (against testing): 0.7065656565656566


In [27]:
#from sklearn.neighbors import KNeighborsClassifier
k = 9   # number of nearest neighbors
knnPred = neighbors.KNeighborsClassifier(n_neighbors = k, weights='distance')
knnPred.fit(XTrain, YTrain);
print(knnPred)


accuracy = knnPred.score(XTest,YTest)
print(accuracy)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='distance')
0.6974747474747475


# Week 2: Citizen Tweets

In [28]:
citizen_tweets = pd.read_csv('citizen_tweets.csv')

In [29]:
print(citizen_tweets)

       Hashtag  Party                                              tweet
0         MAGA      1  Yeah. You Democrats are loving it aren't you? ...
1         MAGA      1  Hopefully the dimwits that voted for one of th...
2         MAGA      1  #QAnon #QArmy #MAGA #QAnon #GreatAwakening #Sp...
3         MAGA      1  That Woman is a disgrace, she adds nothing to ...
4         MAGA      1         Same to you GIRL #MAGA #KAG2020 #fightback
..         ...    ...                                                ...
274  SleepyJoe      1  The Dem president, That gave the Iranians $1.8...
275  SleepyJoe      1  What‚Äôs good comedy, socialist slaughter in t...
276  SleepyJoe      1  You can tell a lot about somebody by who they ...
277  SleepyJoe      1  Actually...President Trump predicted you would...
278  SleepyJoe      1  Son, you sound like a person who could really ...

[279 rows x 3 columns]


In [30]:
def remove_tag(hashtag, content):
    return content.replace("#" + hashtag, "")

In [31]:
hashtags = citizen_tweets['Hashtag'].unique()

for hashtag in hashtags:
    
    mask = (citizen_tweets['Hashtag'] == hashtag)
    temp_tweets = citizen_tweets[mask]
        
    temp_tweets['tweet'] = temp_tweets['tweet'].apply(lambda h: remove_tag(hashtag, h))  
    citizen_tweets.loc[mask, 'tweet'] = temp_tweets['tweet']
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [32]:
citizen_tweets['tweet'] = citizen_tweets['tweet'].apply(lambda t: transform_tweet(t))

In [33]:
citizen_tweets['tweet']

0      yeah democrat love you guess what @potu @reald...
1      hope dimwit vote one socialist communist rathe...
2      qanon qarmi qanon greatawaken spygat stoptheco...
3      woman disgrac add noth discuss covid doesnt ca...
4                                 girl kag2020 fightback
                             ...                        
274    dem presid gave iranian 18 billion secret deal...
275    what good comedi socialist slaughter former us...
276    tell lot somebodi hang with @realdonaldtrump s...
277    actuallypresid trump predict would foolishli t...
278    son sound like person could realli use good da...
Name: tweet, Length: 279, dtype: object

In [34]:
citizenDf = pd.DataFrame({
    'tweet': citizen_tweets['tweet'].values,
    'Party': citizen_tweets['Party'].values,
    'Hashtag': citizen_tweets['Hashtag'].values
})

In [35]:
CX, CY = transform_dataset(citizenDf)

{'@potu': 0, 'west': 1, 'discrimin': 2, 'town': 3, 'god': 4, 'censu': 5, 'librari': 6, 'terrorist': 7, 'count': 8, 'pelosi': 9, 'forthepeopl': 10, 'supplement': 11, '@usda': 12, 'partisan': 13, 'condit': 14, 'enrol': 15, 'said': 16, 'comprehens': 17, 'mental': 18, 'democrat': 19, 'unpreced': 20, 'condemn': 21, 'pm': 22, 'facebookcom': 23, 'gun': 24, 'foxnewscom': 25, 'mobil': 26, 'medicaid': 27, 'legal': 28, 'montgomeri': 29, 'manufactur': 30, 'remov': 31, 'telephon': 32, 'confid': 33, 'paid': 34, 'chanc': 35, 'climat': 36, 'usmcanow': 37, 'decid': 38, 'cnbccom': 39, '2020censu': 40, 'regim': 41, 'appropri': 42, 'central': 43, 'capabl': 44, 'facebook': 45, 'ad': 46, '2020censusgov': 47, 'result': 48, 'progrowth': 49, 'key': 50, 'border': 51, 'gender': 52, 'equalpayday': 53, 'size': 54, 'jobsreport': 55, 'aliv': 56, 'recommit': 57, 'soldier': 58, 'unborn': 59, 'phone': 60, 'daca': 61, 'lender': 62, 'instead': 63, 'rancher': 64, 'far': 65, 'kick': 66, 'custom': 67, 'pray': 68, 'connect':

In [36]:
def accuracy_on_citizen_tweets(name, predictions):
    
    hashtag_success = {}
    hashtag_total = {}
    for hashtag in list(set(citizen_tweets['Hashtag'].values)):
        hashtag_success[hashtag] = 0
        hashtag_total[hashtag] = 0
        
    correctR = 0
    correctD = 0
    correct = 0

    totalR = 0
    totalD = 0
    total = 0

    for i in range(0, len(predictions)):
    
        row = citizenDf.iloc[i]
        hashtag = row['Hashtag']
        
        if predictions[i] == CY[i]:
            hashtag_success[hashtag] += 1
            if CY[i] == REPUBLICAN:
                correctR += 1
            elif CY[i] == DEMOCRAT:
                correctD += 1
            correct += 1
    
        if CY[i] == REPUBLICAN:
            totalR += 1
        elif CY[i] == DEMOCRAT:
            totalD += 1
        
        hashtag_total[hashtag] += 1
        total += 1
    
    print("===============================================")
    
    print("OVERALL: " + str(correct/total))
    print(" * R: " + str(correctR) + "/" + str(totalR) + " ~ " + str(correctR/totalR))
    print(" * D: " + str(correctD) + "/" + str(totalD) + " ~ " + str(correctD/totalD))

    print("===============================================")

    for hashtag in hashtag_success:
        print(" # " + hashtag + ": " + str(hashtag_success[hashtag] / hashtag_total[hashtag]) )
    
    print("===============================================")

In [37]:
accuracy_on_citizen_tweets("Neural Network", nn.predict(CX))

OVERALL: 0.5913978494623656
 * R: 64/125 ~ 0.512
 * D: 101/154 ~ 0.6558441558441559
 # MAGA: 0.75
 # 2A: 0.3333333333333333
 # GunControl: 0.8333333333333334
 # SleepyJoe: 0.5
 # VoteBlue: 0.5833333333333334
 # VoteRed: 0.45161290322580644
 # ProLife: 0.6346153846153846
 # GreenNewDeal: 0.5384615384615384
 # BlueNoMatterWho: 0.7916666666666666
 # Trump2020Landslide: 0.5


In [38]:
accuracy_on_citizen_tweets("Decision Tree", clf.predict(CX))

OVERALL: 0.5304659498207885
 * R: 107/125 ~ 0.856
 * D: 41/154 ~ 0.2662337662337662
 # MAGA: 0.5833333333333334
 # 2A: 0.8333333333333334
 # GunControl: 0.3333333333333333
 # SleepyJoe: 0.8846153846153846
 # VoteBlue: 0.26666666666666666
 # VoteRed: 0.8387096774193549
 # ProLife: 0.5
 # GreenNewDeal: 0.5
 # BlueNoMatterWho: 0.08333333333333333
 # Trump2020Landslide: 0.875


In [39]:
accuracy_on_citizen_tweets("KNN", clf.predict(CX))

OVERALL: 0.5304659498207885
 * R: 107/125 ~ 0.856
 * D: 41/154 ~ 0.2662337662337662
 # MAGA: 0.5833333333333334
 # 2A: 0.8333333333333334
 # GunControl: 0.3333333333333333
 # SleepyJoe: 0.8846153846153846
 # VoteBlue: 0.26666666666666666
 # VoteRed: 0.8387096774193549
 # ProLife: 0.5
 # GreenNewDeal: 0.5
 # BlueNoMatterWho: 0.08333333333333333
 # Trump2020Landslide: 0.875


# 3. EDA

#### A. Get the post popular hashtags by party

In [40]:
def most_common_array_col_values(df, col, n):

    hashtags = []

    for tweet_hashtags in df[col].values:
        tweet_hashtags = tweet_hashtags[1:-1]
        tweet_hashtags = tweet_hashtags.replace("'", "")
        tweet_hashtags = tweet_hashtags.replace(" ", "")
        hashtags = hashtags + tweet_hashtags.split(",")

    counter=collections.Counter(hashtags)
    return counter.most_common(n)

In [41]:
r_most_common = most_common_array_col_values(r_tweets, 'hashtags', 25)
d_most_common = most_common_array_col_values(d_tweets, 'hashtags', 25)

In [42]:
r_most_common = [ r for r in r_most_common if r[0] != '' ]
d_most_common = [ d for d in d_most_common if d[0] != '' ]

In [43]:
common_hashtag_df = pd.DataFrame({
    'Hashtag': [ v[0] for v in r_most_common ] + [ v[0] for v in d_most_common ],
    'Uses': [ v[1] for v in r_most_common ] + [ v[1] for v in d_most_common ],
    'Party': [ 'R' for v in r_most_common ] + [ 'D' for v in d_most_common ]
})

In [44]:
common_hashtag_df.to_csv('tableau/common_hashtags.csv')

#### B. Get the most important n-grams

In [45]:
interesting = []

#interesting += [' '.join(n) for n in most_interesting_ngrams(1, 15)]
interesting += [' '.join(n) for n in most_interesting_ngrams(2, 100)]
#interesting += [' '.join(n) for n in most_interesting_ngrams(3, 10)]

ngramMap = {}

for term in interesting:
    ngramMap[term] = {
        'r': 0,
        'd': 0
    }

for index, row in tweets.iterrows():
    tweet = row['tweet']
    party = row['Party']
    for term in interesting:
        if term in tweet:
            if party == 0:
                ngramMap[term]['d'] += 1
            if party == 1:
                ngramMap[term]['r'] += 1
                
interesting_ngram_df = pd.DataFrame({
    'ngram': [k for k in ngramMap.keys()],
    'republican': [v['r'] for v in ngramMap.values()],
    'democratic': [v['d'] for v in ngramMap.values()]
})
   
print("n-gram map")
print("==============================================")
print(ngramMap)
print()
print("interesting_ngram_df")
print("==============================================")
print(interesting_ngram_df)   
                
interesting_ngram_df.to_csv("tableau/interesting_ngrams.csv")

n-gram map
{'town hall': {'r': 23, 'd': 84}, 'hous democrat': {'r': 31, 'd': 2}, 'im work': {'r': 2, 'd': 30}, 'mental health': {'r': 1, 'd': 27}, 'new job': {'r': 23, 'd': 1}, 'gun violenc': {'r': 0, 'd': 30}, 'speaker pelosi': {'r': 30, 'd': 0}, 'us safe': {'r': 20, 'd': 3}, 'proud stand': {'r': 1, 'd': 14}, 'pass bill': {'r': 5, 'd': 21}, 'sick leav': {'r': 0, 'd': 13}, 'twittercom im': {'r': 0, 'd': 0}, 'right now': {'r': 4, 'd': 15}, 'access health': {'r': 2, 'd': 13}, 'mobil offic': {'r': 4, 'd': 61}, 'care worker': {'r': 5, 'd': 19}, 'bipartisan legisl': {'r': 1, 'd': 16}, 'feder fund': {'r': 4, 'd': 18}, 'health insur': {'r': 0, 'd': 13}, 'thank @realdonaldtrump': {'r': 23, 'd': 0}, 'paid sick': {'r': 0, 'd': 13}, 'today im': {'r': 7, 'd': 17}, 'supplement fund': {'r': 8, 'd': 0}, 'pass bipartisan': {'r': 4, 'd': 24}, 'job creat': {'r': 17, 'd': 0}, 'work peopl': {'r': 3, 'd': 14}, 'health crisi': {'r': 1, 'd': 20}, 'dark money': {'r': 0, 'd': 10}, 'partisan impeach': {'r': 10,

#### Get most mentioned users by party

In [46]:
r_most_common_mentions = most_common_array_col_values(r_tweets, 'mentions', 10)
d_most_common_mentions = most_common_array_col_values(d_tweets, 'mentions', 10)

In [47]:
r_most_common_mentions

[('', 1804),
 ('realdonaldtrump', 228),
 ('potus', 72),
 ('cdcgov', 34),
 ('speakerpelosi', 22),
 ('sbagov', 21),
 ('housedemocrats', 19),
 ('usda', 16),
 ('senatorfischer', 15),
 ('newsandtribune', 12)]

In [48]:
d_most_common_mentions

[('', 2166),
 ('realdonaldtrump', 33),
 ('cdcgov', 26),
 ('oversightdems', 24),
 ('housedemocrats', 23),
 ('sbagov', 16),
 ('appropsdems', 14),
 ('energycommerce', 14),
 ('housejudiciary', 11),
 ('housesmallbiz', 10)]

#### Get the trends of different keywords

In [49]:
def trend_for_keywords(df, keywords):
    timeseries = {}
    
    for index, row in df.iterrows():
        date = row['date']
        tweet = row['tweet']
        
        found = False
        for kw in keywords:
            if kw in tweet:
                found = True
                break
        
        if found:
            if date in timeseries:
                timeseries[date] += 1
            else:
                timeseries[date] = 1
    
    return timeseries

In [50]:
gen_timeseries = trend_for_keywords(tweets, ['coronavirus', 'covid', 'pandemic'] )
rep_timeseries = trend_for_keywords(r_tweets, ['coronavirus', 'covid', 'pandemic'] )
dem_timeseries = trend_for_keywords(d_tweets, ['coronavirus', 'covid', 'pandemic'] )

In [51]:
timeseriesDf = pd.DataFrame({
    'date': [k for k in gen_timeseries.keys()],
    'gen_tweets': [v for v in gen_timeseries.values()],
    'rep_tweets': [(rep_timeseries[k] if k in rep_timeseries else 0) for k in gen_timeseries.keys()],
    'dem_tweets': [(dem_timeseries[k] if k in dem_timeseries else 0) for k in gen_timeseries.keys()]
})

In [52]:
timeseriesDf.to_csv('tableau/covid_tweet_frequency.csv')

In [53]:
timeseriesDf

Unnamed: 0,date,gen_tweets,rep_tweets,dem_tweets
0,2020-03-19,30,15,15
1,2020-03-18,20,9,11
2,2020-03-17,23,11,12
3,2020-03-09,8,4,4
4,2020-02-28,2,1,1
5,2020-04-04,8,2,6
6,2020-04-03,13,8,5
7,2020-04-02,19,8,11
8,2020-03-31,19,7,12
9,2020-03-29,9,4,5
