In [1]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from collections import defaultdict
import itertools
import numpy as np

with open('stopwords.txt') as infile:
    stop_words = infile.readlines()
    stop_words.extend(['rt', '&amp;'])

In [2]:
#returns a dictionary sorted  descending by its values
def sort_dict(d):
    return {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}

#creates a dictionary of dictionaries that iterates through texts, counting the amount of words in each tweet per 'handle' or user
#returns a dictionary of dicitonaries
def create_dict(df, handles):
    
    big_dict = {}
    #currently retweets are treated as tweets from the senator
    for handle in handles: 
        handle_dict = defaultdict(int)
        for tweet in df[df['handle'] == handle]['tokens']:
            for word in tweet:
                handle_dict[word] += 1
        big_dict[handle] = sort_dict(handle_dict)
    return big_dict

In [3]:
#returns only the top 'n' amount of entries in a sorted dictionary
def dict_slice(d, n):
    d_sort = sort_dict(d)
    return dict(itertools.islice(d_sort.items(), n))

#processes a dictionary of dictionaries into one dictionary with total counts of all words
#returns a dictionary
def get_all_words(d):
    t_dict = defaultdict(int)
    for handle in d.keys():
        for word in d[handle].keys():
            t_dict[word] += d[handle][word]
    return sort_dict(t_dict)

def bag(series, words):
    mat = np.zeros((len(df), len(words)+1))
    
    wordtoID = {}
    for i, word in enumerate(words):
        wordtoID[word.strip().lower()] = i
    
    for i, tweet in enumerate(series):
        for word in tweet:
            if word.strip().lower() in words:
                j = wordtoID[word.strip().lower()]
                mat[i][j] += 1
        if df['party'].iloc[i] == 'd':
            mat[i][-1] = 1
            
    return mat

In [4]:
#converts a string into tokens, removes specific types of words based on part of speech, then stems for each token
#returns a list of stems
ps = PorterStemmer()
def get_tokens(str):
    tweet = ''
    for word in str.split():
        if not word.startswith('http') and word.lower() not in stop_words:
            tweet += word + ' '
    tokens = word_tokenize(tweet)
    pos = pos_tag(tokens)
    include_tags = ['VBN', 'VBD', 'JJ', 'JJS', 'JJR', 'CD', 'NN', 'NNS', 'NNP', 'NNPS']
    filtered_tokens = [tok for tok, tag in pos if tag in include_tags]
    return [ps.stem(tok).lower() for tok in filtered_tokens if len(tok) > 1]

In [5]:
df = pd.read_csv('raw_sen.csv', index_col=[0]).reset_index(drop=True)

In [6]:
df['tokens'] = df['tweet'].apply(get_tokens)

In [7]:
sen_tokens_dict = create_dict(df, df['handle'].unique())

In [8]:
all_words_d = get_all_words(sen_tokens_dict)

In [9]:
all_words_d_5000 = dict_slice(all_words_d, 5000)

In [10]:
bag_of_words = bag(df['tokens'], all_words_d_5000.keys())

In [11]:
bag_of_words.shape

(135246, 5001)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(bag_of_words[:,:-1], bag_of_words[:,-1], random_state=9)

In [13]:
##gaussian Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [15]:
print(f'Testing score: {gnb.score(X_test, y_test)}')
print(f'Training score: {gnb.score(X_train, y_train)}')

Testing score: 0.7814681178279901
Training score: 0.7933730307391998


In [16]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
print(f'Testing score: {lr.score(X_test, y_test)}')
print(f'Training score: {lr.score(X_train, y_train)}')

Testing score: 0.8206849639181356
Training score: 0.8451209653567837


In [18]:
lr1 = LogisticRegression(penalty='l1')
lr1.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
print(f'Testing score: {lr1.score(X_test, y_test)}')
print(f'Training score: {lr1.score(X_train, y_train)}')

Testing score: 0.8219862770613984
Training score: 0.8448744996746653


In [20]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_test, y_test)



LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

In [22]:
print(f'Testing score: {lda.score(X_test, y_test)}')
print(f'Training score: {lda.score(X_train, y_train)}')

Testing score: 0.8642789542174376
Training score: 0.7908787980361615
