# Alexander Adams

# PPOL628 Text as Data

# Final Project Notebook

I scraped tweets from several accounts and concatenated them all into a single .csv file, called `tweets.csv`.

In [1]:
# Data is tracked with DVC. The pull is left disabled: the recorded output below
# shows it failing with "no remote specified" until a default remote is set
# (`dvc remote default <remote name>`) or one is passed via `dvc pull -r <remote name>`.
#!dvc pull

Everything is up to date.


ERROR: failed to pull data from the cloud - config file error: no remote specified. Setup default remote with
    dvc remote default <remote name>
or use:
    dvc pull -r <remote name>


In [1]:
from bertopic import BERTopic
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import yaml
# Lift pandas' display truncation so that wide frames, long tweet text and long
# sequences are rendered in full.
for _display_option in ("max_columns", "max_colwidth", "max_seq_items"):
    pd.set_option(f"display.{_display_option}", None)

In [56]:
# Read the scraped tweets. With the default chunked parser pandas infers dtypes
# chunk by chunk and warned that column 9 ended up with mixed types;
# low_memory=False makes it infer each column's dtype from the whole file at once.
tweets = pd.read_csv('data/tweets.csv', low_memory=False)


Columns (9) have mixed types.Specify dtype option on import or set low_memory=False.



In [57]:
# Restrict the corpus to tweets whose `language` tag is 'en'.
is_english = tweets['language'].eq('en')
tweets = tweets.loc[is_english]

In [4]:
# (rows, columns) of the frame after the English-only filter
tweets.shape

(48249, 36)

In [5]:
# Column dtypes as inferred by read_csv
tweets.dtypes

id                   int64
conversation_id      int64
created_at          object
date                object
time                object
timezone             int64
user_id              int64
username            object
name                object
place               object
tweet               object
language            object
mentions            object
urls                object
photos              object
replies_count        int64
retweets_count       int64
likes_count          int64
hashtags            object
cashtags            object
link                object
retweet               bool
quote_url           object
video                int64
thumbnail           object
near               float64
geo                float64
source             float64
user_rt_id         float64
user_rt            float64
retweet_id         float64
reply_to            object
retweet_date       float64
translate          float64
trans_src          float64
trans_dest         float64
dtype: object

In [6]:
# Preview the first five rows (head() defaults to n=5).
tweets.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1508992328089833475,1508992326248460295,2022-03-29 22:19:41 Eastern Daylight Time,2022-03-29,22:19:41,-400,1081991142,mikedunleavygov,Mike Dunleavy,,Female athletes deserve to compete on an equal playing field. I'm calling on lawmakers to pass legislation that protects the fairness in women's sports in Alaska.,en,[],[],[],15,6,27,[],[],https://twitter.com/MikeDunleavyGov/status/1508992328089833475,False,,0,,,,,,,,[],,,,
1,1508992326248460295,1508992326248460295,2022-03-29 22:19:40 Eastern Daylight Time,2022-03-29,22:19:40,-400,1081991142,mikedunleavygov,Mike Dunleavy,,"As a father of three daughters, watching the deterioration of a girl's ability to participate in athletics is alarming. To have biological males competing against biological females nullifies the fair nature of competitive sports.",en,[],[],[],61,40,151,[],[],https://twitter.com/MikeDunleavyGov/status/1508992326248460295,False,,0,,,,,,,,[],,,,
2,1505915435623010308,1505915435623010308,2022-03-21 10:33:13 Eastern Daylight Time,2022-03-21,10:33:13,-400,1081991142,mikedunleavygov,Mike Dunleavy,,About to join @kilmeade on @foxnewsradio to talk about energy and how important it is for Alaska and America. Listen LIVE: https://t.co/tgU1JMMj1C,en,"[{'screen_name': 'kilmeade', 'name': 'brian kilmeade', 'id': '43919633'}, {'screen_name': 'foxnewsradio', 'name': 'fox news radio', 'id': '11611052'}]",['https://radio.foxnews.com/fox-news-talk/brian-kilmeade/'],[],0,0,5,[],[],https://twitter.com/MikeDunleavyGov/status/1505915435623010308,False,,0,,,,,,,,[],,,,
3,1504902701154545665,1504902701154545665,2022-03-18 15:28:58 Eastern Daylight Time,2022-03-18,15:28:58,-400,1081991142,mikedunleavygov,Mike Dunleavy,,The federal government needs to get out of the way and let us produce our resources. https://t.co/gkE1dmiC8O,en,[],['https://www.theepochtimes.com/let-alaska-double-oil-production-gov-dunleavy-urges-biden_4342791.html'],[],0,1,7,[],[],https://twitter.com/MikeDunleavyGov/status/1504902701154545665,False,,0,,,,,,,,[],,,,
4,1504132739058053126,1504132739058053126,2022-03-16 12:29:25 Eastern Daylight Time,2022-03-16,12:29:25,-400,1081991142,mikedunleavygov,Mike Dunleavy,,Not on my watch! https://t.co/7uL7XoOBZX,en,[],['https://www.dailymail.co.uk/news/article-10614915/Russian-state-TV-demands-REPARATIONS-sanctions-public-hangings-Ukraine.html'],[],5,2,10,[],[],https://twitter.com/MikeDunleavyGov/status/1504132739058053126,False,,0,,,,,,,,[],,,,


# Topic Modeling: What Do State-Level Elected Officials Tweet About?

In [19]:
# Load the previously fitted BERTopic model saved at the relative path 'project_BERTopic'.
# NOTE(review): presumably a pickle-based artifact - only load files from a trusted source.
topic_model = BERTopic.load('project_BERTopic')

In [20]:
# Fetch the model's topics once, then report how many there are.
topics_list = topic_model.get_topics()
len(topics_list)

383

In [21]:
# Plot the topics of the loaded model; `topics_list` is the get_topics() result
# from the previous cell, passed as the first positional argument.
topic_model.visualize_topics(topics_list)

In [22]:
# Read the per-document probabilities and cluster labels stored on the fitted
# HDBSCAN model, then pass the labels through BERTopic's `_map_predictions`.
# NOTE(review): `_map_predictions` is a private method (leading underscore);
# presumably it maps raw cluster labels to the model's topic ids - confirm against
# the installed BERTopic version, since private methods can change between releases.
probs = topic_model.hdbscan_model.probabilities_
topics = topic_model._map_predictions(topic_model.hdbscan_model.labels_)

In [23]:
# Collapse the fitted topics with nr_topics=10 and keep the updated per-document
# assignments and probabilities.
new_topics, new_probs = topic_model.reduce_topics(
    tweets['tweet'],
    topics,
    probs,
    nr_topics=10,
)

In [24]:
# Re-plot the topics now that reduce_topics has been applied to the model.
topic_model.visualize_topics()

In [25]:
# Table of topic ids, document counts and generated topic names.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,41563,-1_the_to_and_of
1,0,1874,0_vote_your_election_to
2,1,667,1_covid19_get_vaccine_vaccinated
3,2,663,2_veterans_who_the_we
4,3,551,3_gas_energy_tax_the
5,4,551,4_happy_birthday_day_thanksgiving
6,5,529,5_thank_you_your_for
7,6,525,6_read_more_the_of
8,7,454,7_ukraine_of_the_people
9,8,446,8_via_in_of_the


In [26]:
# Compute topics over time from the tweet text, the reduced assignments
# (`new_topics`) and the `date` column, which the dtypes listing shows is stored
# as object (string) rather than a datetime.
dynamic_topics = topic_model.topics_over_time(tweets['tweet'],
                                              new_topics, 
                                              tweets['date'])

In [27]:
# Plot the topics-over-time frame for topic ids 0 through 9.
# (The topic-info table above lists ids -1 and 0-8.)
topic_model.visualize_topics_over_time(
    dynamic_topics,
    topics=list(range(10)),
    width=950,
)

I expected some of the topics to be cyclical or intermittent, but I am surprised at how clear the spikes are. Topic 0, with the top words "your vote ballot", spikes almost every November and is nonexistent the rest of the year. Topic 2, which is about veterans, exhibits a similar pattern. Topic 7, which is about Ukraine, only appears starting in February 2022, and topic 1, which is about COVID-19, sees its biggest spikes during the winter of 2020-21 and the Omicron wave beginning in late 2021. In general, all of these topics spike in the winter and occur barely, if at all, during the rest of the year.

___________

# Multiclass Classification: What can I predict using tweets?

For the bulk of this project, I chose to run several multiclass classification tasks, in order to identify what, if anything, could be predicted by these tweets. First, I tried to see if I could identify the state an official represents:

Task: Multiclass Classification (State)

Number of Classes: 50 (U.S. States)

Script: `multiclass_state.py`

DVC YAML Stage: `multiclass_state`

In [28]:
import joblib
import numpy as np
from sklearn.metrics import (confusion_matrix, multilabel_confusion_matrix, 
precision_recall_fscore_support, classification_report)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [29]:
#Load the trained multiclass pipeline
# joblib.load unpickles the file, so it should only be pointed at artifacts from a trusted source.
pipe = joblib.load('outputs/mc_state_pipe.pkl')

In [58]:
#Perform necessary data processing
# Metadata about the officials, including up to three Twitter handle columns.
states = pd.read_csv('data/elected_officials.csv')

# Reshape to one row per (official, handle) so every handle can be joined to tweets.
states = states.melt(id_vars = ['State',
                                'StateAbbr',
                                'Name',
                                'Party',
                                'Inauguration',
                                'Title',
                                'office'],
                     value_vars = ['officialTwitter',
                                   'campaignTwitter',
                                   'othertwitter'],
                     var_name = 'account_type',
                     value_name = 'twitter')

# Lower-case the handles so they can be compared with the `username` column.
states['twitter'] = states['twitter'].str.lower()

# A missing account type melts to a NaN handle, and a handle listed under two
# account types would duplicate every matching tweet in the merge. Drop both;
# the first occurrence (officialTwitter rows come first after the melt) is kept.
states = (states.dropna(subset = ['twitter'])
                .drop_duplicates(subset = 'twitter'))

# Make the cell safe to re-run: discard metadata columns left on `tweets` by a
# previous run so the merge cannot emit suffixed duplicates (State_x / State_y).
stale_cols = [col for col in [*states.columns, 'state_label'] if col in tweets.columns]
tweets = tweets.drop(columns = stale_cols)

# Inner join: tweets whose username has no matching handle are dropped.
tweets = tweets.merge(states, left_on = 'username', right_on = 'twitter')

#Create numeric labels based on state names
# Labels are 1-based and follow order of first appearance (same numbering as a
# unique()/reset_index()/+1 lookup table), assigned directly without a second merge.
tweets = tweets.assign(state_label = pd.factorize(tweets['State'])[0] + 1)
# Lookup table of label -> state name.
labels = tweets[['state_label', 'State']].drop_duplicates().reset_index(drop = True)

In [31]:
# Features are the raw tweet text; targets are the numeric state labels.
X, y = tweets["tweet"], tweets['state_label']

In [32]:
# Re-fit the loaded pipeline on every labelled tweet; no held-out split is made
# before fitting.
pipe.fit(X,y)

Pipeline(steps=[('preprocess',
                 TfidfVectorizer(max_df=0.8, min_df=5, ngram_range=(1, 2),
                                 stop_words='english')),
                ('LinearSVC', LinearSVC())])

In [33]:
# Predict on the same X the pipeline was fitted on, so any score computed from
# y_pred is an in-sample (training) score rather than a held-out estimate.
y_pred = pipe.predict(X)

Rather than print out a 50x50 confusion matrix, I'm going to simplify the matrix to just a few columns:

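    -state: the abbreviation for the state
    -correct: the number of correctly classified tweets for that state
    -incorrect: the number of incorrectly classified tweets for that state
    -total_tweets: the total number of tweets for that state (correct + incorrect)
    -precision: correct / the number of tweets predicted as that state
    -recall: correct / total_tweets
    -errors: the labels which were applied incorrectly for each state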

In [34]:
# Rows are true state labels, columns are predicted labels.
cm = confusion_matrix(y_true=y, y_pred=y_pred)

In [35]:
# Per-state summary of the confusion matrix (rows = true label, columns = predicted).
# State abbreviations are attached by position, which relies on the numeric labels
# having been assigned in order of first appearance in `tweets`.
hits = np.diag(cm)
true_totals = cm.sum(axis=1)
pred_totals = cm.sum(axis=0)
state_cm = pd.DataFrame({
    'state': pd.unique(tweets['StateAbbr']),
    'correct': hits,
    'incorrect': true_totals - hits,
    'total_tweets': true_totals,
    'precision': hits / pred_totals,
    'recall': hits / true_totals,
})

In [36]:
# Wrap the confusion matrix in a frame labelled by state abbreviation on both axes.
state_names = pd.unique(tweets['StateAbbr'])
cm = pd.DataFrame(cm, index=state_names, columns=state_names)

In [37]:
# For every true state (row), list the other states that received at least one
# of its tweets, i.e. the off-diagonal cells with a non-zero count.
cols = cm.columns.values
mask = (cm > 0.0).values
np.fill_diagonal(mask, False)  # the diagonal holds correct predictions, not errors
out = [cols[row_mask].tolist() for row_mask in mask]

In [38]:
# Attach each state's list of wrongly predicted labels to the summary table.
state_cm = state_cm.assign(errors=out)

In [39]:
# Display the per-state summary table.
state_cm

Unnamed: 0,state,correct,incorrect,total_tweets,precision,recall,errors
0,AK,225,1,226,0.982533,0.995575,[RI]
1,AL,1050,12,1062,0.990566,0.988701,"[CO, IA, KY, MA, NC, ND, OR, RI, VT]"
2,AR,1150,12,1162,0.987124,0.989673,"[CA, LA, ME, MI, MO, MT, NC, NY, OK, OR, TX]"
3,AZ,876,12,888,0.996587,0.986486,"[CA, CO, MA, MI, OH, OR, RI]"
4,CA,1352,18,1370,0.951443,0.986861,"[FL, IL, MA, ME, MN, MS, NC, NJ, NV, OH, RI, VT, WV]"
5,CO,1517,19,1536,0.926129,0.98763,"[AL, CA, IL, LA, MA, NC, NV, PA, RI, VT, WA]"
6,CT,1379,25,1404,0.997108,0.982194,"[AR, CA, CO, IL, IN, KY, MN, NC, NJ, NM, RI, UT, WA]"
7,DE,849,7,856,0.996479,0.991822,"[FL, LA, MD, ME, NV, NY, TX]"
8,FL,1264,12,1276,0.97156,0.990596,"[AL, CA, CO, LA, RI, VT]"
9,GA,862,10,872,0.990805,0.988532,"[AL, CO, FL, IA, MA, ME, MT, TX, UT]"


I don't see any real trends here in terms of geography. That suggests to me that, while the Linear Support Vector Classifier was effective most of the time (as evidenced by the uniformly high precision and recall scores), incorrect guesses were not informed by geography (i.e. for a tweet by an Ohio official, the classifier was not more likely to select another Midwestern state than a non-Midwestern state). The one interesting pattern that is clear, however, is that Colorado and California appear in many of these error lists. California makes sense, since it is the largest state (and it is possible that officials in larger states tweet more than officials in smaller states because more happens in larger states). But Colorado is a mid-sized state; I am not sure why the classifier would be more likely to predict Colorado as the label than other states.

________

Next, I tried to see if I could identify the office an official holds:

Task: Multiclass Classification (Political Office)

Number of Classes: 5 (Governor, Lieutenant Governor, Attorney General, Secretary of State, Treasurer)

Script: `multiclass_office.py`

DVC YAML Stage: `multiclass_office`

In [40]:
#Load the trained multiclass pipeline
# joblib.load unpickles the file, so it should only be pointed at artifacts from a trusted source.
pipe = joblib.load('outputs/mc_office_pipe.pkl')

In [59]:
# Number the offices 1..k in order of first appearance (the same numbering a
# unique()/reset_index()/+1 lookup table would give). Assigning the codes directly
# replaces the label-table merge, so re-running this cell overwrites office_label
# instead of producing office_label_x / office_label_y.
tweets = tweets.assign(office_label = pd.factorize(tweets['office'])[0] + 1)
# Lookup table of label -> office.
labels = tweets[['office_label', 'office']].drop_duplicates().reset_index(drop = True)

In [42]:
# Features are the raw tweet text; targets are the numeric office labels.
X, y = tweets["tweet"], tweets['office_label']

In [43]:
# Re-fit the loaded pipeline on every labelled tweet; no held-out split is made
# before fitting.
pipe.fit(X,y)

Pipeline(steps=[('preprocess',
                 TfidfVectorizer(max_df=0.8, min_df=5, ngram_range=(1, 2),
                                 stop_words='english')),
                ('LinearSVC', LinearSVC())])

In [44]:
# Predict on the same X the pipeline was fitted on, so any score computed from
# y_pred is an in-sample (training) score rather than a held-out estimate.
y_pred = pipe.predict(X)

In [45]:
# Confusion matrix (rows = true office, columns = predicted), labelled by office
# name in order of first appearance in `tweets`.
office_names = pd.unique(tweets['office'])
cm = pd.DataFrame(confusion_matrix(y, y_pred),
                  index=office_names,
                  columns=office_names)
cm

Unnamed: 0,Governor,LtGov,SecState,StateAG,Treasurer
Governor,14572,176,96,212,57
LtGov,483,7124,82,164,74
SecState,216,91,7481,132,57
StateAG,304,164,93,9859,72
Treasurer,238,103,64,118,6217


In [46]:
# Recompute the raw confusion matrix (the previous cell turned `cm` into a
# labelled frame) and summarise it per office.
cm = confusion_matrix(y, y_pred)
hits = np.diag(cm)
true_totals = cm.sum(axis=1)
pred_totals = cm.sum(axis=0)
office_cm = pd.DataFrame({
    'office': pd.unique(tweets['office']),
    'correct': hits,
    'incorrect': true_totals - hits,
    'total_tweets': true_totals,
    'precision': hits / pred_totals,
    'recall': hits / true_totals,
})

In [47]:
# Display the per-office summary table.
office_cm

Unnamed: 0,office,correct,incorrect,total_tweets,precision,recall
0,Governor,14572,541,15113,0.92152,0.964203
1,LtGov,7124,803,7927,0.930269,0.898701
2,SecState,7481,496,7977,0.957139,0.937821
3,StateAG,9859,633,10492,0.940296,0.939668
4,Treasurer,6217,523,6740,0.959858,0.922404


________

Next, I tried to see if I could identify the political party of an official:

Task: Binary Classification (Political Party)

Number of Classes: 2 (Democrat, Republican)

Script: `twoclass_party.py`

DVC YAML Stage: `twoclass_party`

In [48]:
#Load the trained binary (two-class) party pipeline
# joblib.load unpickles the file, so it should only be pointed at artifacts from a trusted source.
pipe = joblib.load('outputs/bc_party_pipe.pkl')

In [60]:
# Number the parties 1..k in order of first appearance (the same numbering a
# unique()/reset_index()/+1 lookup table would give). Assigning the codes directly
# replaces the label-table merge, so re-running this cell overwrites party_label
# instead of producing party_label_x / party_label_y.
tweets = tweets.assign(party_label = pd.factorize(tweets['Party'])[0] + 1)
# Lookup table of label -> party.
labels = tweets[['party_label', 'Party']].drop_duplicates().reset_index(drop = True)
# Keep every tweet except those whose Party is 'Independent'.
partyclass = tweets.loc[tweets['Party'] != 'Independent']

In [61]:
# Features are the raw tweet text; targets are the numeric party labels
# (Independents already excluded in `partyclass`).
X, y = partyclass["tweet"], partyclass['party_label']

In [62]:
# Re-fit the loaded pipeline on every labelled tweet; no held-out split is made
# before fitting.
pipe.fit(X,y)

Pipeline(steps=[('preprocess',
                 TfidfVectorizer(max_df=0.8, min_df=5, ngram_range=(1, 2),
                                 stop_words='english')),
                ('LinearSVC', LinearSVC())])

In [63]:
# Predict on the same X the pipeline was fitted on, so any score computed from
# y_pred is an in-sample (training) score rather than a held-out estimate.
y_pred = pipe.predict(X)

In [64]:
# Confusion matrix (rows = true party, columns = predicted), labelled by party
# name in order of first appearance in `partyclass`.
party_names = pd.unique(partyclass['Party'])
cm = pd.DataFrame(confusion_matrix(y, y_pred),
                  index=party_names,
                  columns=party_names)
cm

Unnamed: 0,Republican,Democratic
Republican,23148,717
Democratic,784,23544
