In [38]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from pandarallel import pandarallel
from sklearn.model_selection import cross_val_score
import sys
sys.path.insert(1, '/home/aw/projects/congress_twitter/scripts')
import text_processing
import spacy

# Input data

## Twitter handles

In [14]:
# Load the table of congressional Twitter handles and discard the
# "Unnamed: 0" column that the CSV carries along.
twitter_handles = (
    pd.read_csv("../data/congress_twitter_handles.csv")
    .drop(columns="Unnamed: 0")
)
twitter_handles.head()

Unnamed: 0,State,Chamber of Congress,Name,Name Links,Party,Twitter,Twitter Links,Instagram,Facebook Page,Facebook
0,Alabama,Senator,Richard Shelby,https://www.shelby.senate.gov/public/,R,@SenShelby,https://twitter.com/SenShelby?ref_src=twsrc%5E...,@senatorshelby,x,https://www.facebook.com/RichardShelby
1,Alabama,Senator,Doug Jones,https://www.jones.senate.gov/,D,@DougJones,https://twitter.com/DougJones?ref_src=twsrc%5E...,@dougjonesbama,x,https://www.facebook.com/senatordougjones/
2,Alabama 1st District,Representative,"Byrne, Bradley",https://byrne.house.gov/,R,@RepByrne,https://twitter.com/RepByrne,@repbyrne,x,https://www.facebook.com/RepByrne
3,Alabama 2nd District,U.S. Representative,"Roby, Martha",https://roby.house.gov/,R,@RepMarthaRoby,https://twitter.com/RepMarthaRoby,@martharoby,x,https://www.facebook.com/Representative.Martha...
4,Alabama 3rd District,U.S. Representative,"Rogers, Mike",http://mikerogers.house.gov/,R,@RepMikeRogersAL,https://twitter.com/RepMikeRogersAL,@repmikerogersal,x,https://www.facebook.com/pages/Mike-Rogers/640...


## All tweets

In [4]:
# Read the scraped tweets and keep only the four columns used in this notebook.
all_tweets = pd.read_csv("../output/all_tweets.csv").loc[
    :, ["username", "date", "tweet_content", "party"]
]
all_tweets.head()

Unnamed: 0,username,date,tweet_content,party
0,DougJones,2020-11-08 13:17:22+00:00,I’ve often heard Joe recite this poem and I am...,D
1,DougJones,2020-11-08 02:08:39+00:00,"Everyone, this is such a powerful moment. I ca...",D
2,DougJones,2020-11-07 22:23:31+00:00,"Congratulations to my friend of over 40 years,...",D
3,DougJones,2020-11-06 22:28:42+00:00,"Well folks, Tuesday didn’t turn out the way we...",D
4,DougJones,2020-11-03 16:21:59+00:00,It’s Re-Election Day folks! Thank you being t...,D


# Naive Bayes BOW

In [17]:
# Predictor: the raw tweet text.
features = all_tweets.loc[:, "tweet_content"]
# Target: the "D" indicator column of the one-hot encoded party labels.
party_indicators = pd.get_dummies(all_tweets.loc[:, "party"])
response = party_indicators.loc[:, "D"]

In [20]:
# Hold out 30% of the tweets; the fixed seed keeps the split repeatable.
text_train, text_test, y_train, y_test = train_test_split(
    features,
    response,
    test_size=0.3,
    random_state=42,
)

# Build the token-count vocabulary from the training text only, then apply
# that same vocabulary to the held-out text.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

In [21]:
# Score a multinomial Naive Bayes model with 5-fold cross-validation,
# using only the training portion of the data.
clf = MultinomialNB()
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)

In [23]:
# Display the per-fold scores returned by the 5-fold cross-validation.
scores

array([0.82947062, 0.83021004, 0.82900848, 0.8274141 , 0.82787226])

In [24]:
# Fit the classifier on the whole training split before it is scored on the held-out set.
clf.fit(X_train, y_train)

MultinomialNB()

In [25]:
# Evaluate the fitted classifier on the held-out tweets and report the result.
accuracy = clf.score(X_test, y_test)
report = "The accuracy of the classifier on the test set is %.3f" % accuracy
print(report)

The accuracy of the classifier on the test set is 0.833


# Naive Bayes BOW lemmas

In [55]:
# Load the large English spaCy pipeline and take its default stop-word set.
# NOTE(review): neither `nlp` nor `stopwords` is referenced by any other cell
# visible in this notebook — confirm whether they are still needed here.
nlp = spacy.load("en_core_web_lg")
stopwords = nlp.Defaults.stop_words

In [None]:
# Start the pandarallel workers so that `parallel_apply` is available on pandas objects.
pandarallel.initialize()

# Run the project's preprocessing function over every tweet and store the
# result alongside the original text as a new "tweet_lemmas" column.
tweet_text = all_tweets["tweet_content"]
all_tweets["tweet_lemmas"] = tweet_text.parallel_apply(text_processing.preprocess)

In [56]:
# Predictor: the preprocessed ("tweet_lemmas") version of each tweet.
features = all_tweets.loc[:, "tweet_lemmas"]
# Target: the "D" indicator column of the one-hot encoded party labels.
party_indicators = pd.get_dummies(all_tweets.loc[:, "party"])
response = party_indicators.loc[:, "D"]

In [63]:
# Hold out 30% of the tweets; the fixed seed keeps the split repeatable.
text_train, text_test, y_train, y_test = train_test_split(
    features,
    response,
    test_size=0.3,
    random_state=42,
)

# Build the token-count vocabulary from the training text only, then apply
# that same vocabulary to the held-out text.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

In [64]:
# Score a multinomial Naive Bayes model with 5-fold cross-validation,
# using only the training portion of the data.
clf = MultinomialNB()
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)

In [65]:
# Display the per-fold scores returned by the 5-fold cross-validation.
scores

array([0.79991681, 0.80017099, 0.7992005 , 0.79938535, 0.79785562])

In [44]:
# Fit the classifier on the whole training split before it is scored on the held-out set.
clf.fit(X_train, y_train)

MultinomialNB()

In [45]:
# Evaluate the fitted classifier on the held-out tweets and report the result.
accuracy = clf.score(X_test, y_test)
report = "The accuracy of the classifier on the test set is %.3f" % accuracy
print(report)

The accuracy of the classifier on the test set is 0.803


# Random Forest BOW lemmas (max 400 features)

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [52]:
# Hold out 30% of the tweets; the fixed seed keeps the split repeatable.
text_train, text_test, y_train, y_test = train_test_split(
    features,
    response,
    test_size=0.3,
    random_state=42,
)

# Cap the vocabulary at 400 terms; it is learned from the training text only
# and then applied unchanged to the held-out text.
vectorizer = CountVectorizer(max_features=400)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

In [54]:
# Cross-validate a 50-tree random forest (5 folds) on the training matrix.
# random_state pins the forest's internal randomness so the fold scores are
# repeatable after a kernel restart; 42 matches the seed already passed to
# train_test_split in this notebook.
clf = RandomForestClassifier(n_estimators=50, n_jobs=10, random_state=42)
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores

array([0.72139936, 0.72396423, 0.71934284, 0.72310927, 0.72259451])