In [232]:
import pandas as pd 
import numpy as np 
import nltk 
import string 
import re 

# Stopwords
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# Stem Words
from nltk.stem.porter import PorterStemmer 
from nltk.tokenize import word_tokenize 

# Lemmatize Words
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 

import math
import time

# Import Data

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [233]:
tweets = pd.read_csv('ExtractedTweets.csv')
handles = pd.read_csv('TwitterHandles.csv')

In [234]:
tweets.shape

(86460, 3)

In [235]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...


# Pre-analyze Data Statistics 

In [236]:
"Percent Democrat Tweets: %.1f%%" % (tweets[tweets['Party'] == 'Democrat'].shape[0] / tweets.shape[0] * 100)

'Percent Democrat Tweets: 48.7%'

# Data Pre-Processing

In [237]:
# https://machinelearningmastery.com/clean-text-machine-learning-python/
# lowercase
def text_lowercase(text): 
    return text.lower() 

# separate hashtags
def extract_hash_tags(s):
    return ' #'.join(list(set(part[1:] for part in s.split() if part.startswith('#'))))

# separate tags
def extract_tags(s):
    return ' @'.join(re.findall(r"@(\w+)", s))

def removeFrom(string, sub):
    for tag in sub.split():
        #print('\'' + tag + '\'')
        string = string.replace(tag, '')
    return string

# remove whitespace from text 
def remove_whitespace(text): 
    return  " ".join(text.split()) 

# remove punctuation 
def remove_punctuation(text): 
    translator = str.maketrans('', '', string.punctuation) 
    return text.translate(translator) 

# Remove numbers 
def remove_numbers(text): 
    result = re.sub(r'\d+', '', text) 
    return result

# remove stopwords function 
def remove_stopwords(text): 
    stop_words = set(stopwords.words("english")) 
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if word not in stop_words] 
    return ' '.join(filtered_text)

# stem words in the list of tokenised words 
stemmer = PorterStemmer() 
def stem_words(text): 
    word_tokens = word_tokenize(text) 
    stems = [stemmer.stem(word) for word in word_tokens] 
    return ' '.join(stems)

# lemmatize string 
lemmatizer = WordNetLemmatizer() 
def lemmatize_word(text): 
    word_tokens = word_tokenize(text) 
    # provide context i.e. part-of-speech 
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens] 
    return ' '.join(lemmas)

In [238]:
tweets['Hashtags'] = tweets.apply(lambda x: extract_hash_tags(x['Tweet']),axis=1) # extract hash tags

In [239]:
tweets['Tags'] = tweets.apply(lambda x: extract_tags(x['Tweet']),axis=1) # extract tags

In [240]:
# removes hashtags and tags from English
tweets['English'] = tweets.apply(lambda x: removeFrom(x['Tweet'], '#' + x['Hashtags'] + ' @' + x['Tags']),axis=1) 

In [241]:
# lowercase 
# remove punctuation 
tweets['Tags'] = tweets.apply(lambda x: remove_punctuation(text_lowercase(x['Tags'])),axis=1)

In [242]:
# lowercase 
# remove punctuation 
tweets['Hashtags'] = tweets.apply(lambda x: remove_punctuation(text_lowercase(x['Hashtags'])),axis=1)

In [243]:
# lowercase 
# remove punctuation 
# remove numbers

tweets['English'] = tweets.apply(lambda x: remove_numbers(remove_punctuation(text_lowercase(x['English']))),axis=1)
tweets['English'] = tweets.apply(lambda x: remove_stopwords(text_lowercase(x['English'])),axis=1)

In [244]:
tweets['Affiliation'] = tweets.apply(lambda x: 1 if x['Party'] == 'Democrat' else 0,axis=1)

In [245]:
# Collect Processed Data
tweets = tweets[['English', 'Tags', 'Hashtags', 'Affiliation']].copy()

In [246]:
# Remove Duplciates
print(tweets.shape)
tweets.drop_duplicates()
tweets.shape

(86460, 4)


(86460, 4)

In [247]:
tweets.head()

Unnamed: 0,English,Tags,Hashtags,Affiliation
0,today senate dems vote proud support similar l...,,netneutrality savetheinternet,1
1,rt winter resident alta vista teacher one seve...,winterhavensun repdarrensoto,,1
2,rt noted hurricane maria left approximately bi...,nbclatino repdarrensoto,,1
3,rt meeting thanks taking time meet ed marucci ...,nalcabpolicy repdarrensoto latinoleader,nalcabpolicy2018…,1
4,rt hurricane season starts june st puerto rico...,vegalteno pwr4puertorico repdarrensoto espaill...,,1


## Test

In [16]:
input_str = "Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"
text_lowercase(input_str) 

"hey, did you know that the summer break is coming? amazing right !! it's only 5 more days !!"

In [17]:
input_str = "There are 3 balls in this bag, and 12 in the other one."
remove_numbers(input_str) 

'There are  balls in this bag, and  in the other one.'

In [19]:
input_str = "Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"
remove_punctuation(input_str) 

'Hey did you know that the summer break is coming Amazing right  Its only 5 more days '

In [20]:
input_str = "   we don't need   the given questions"
remove_whitespace(input_str) 

"we don't need the given questions"

In [194]:
example_text = "This is a sample sentence and we are going to remove the stopwords from this."
remove_stopwords(example_text) 

'This sample sentence going remove stopwords .'

In [197]:
text = 'data science uses scientific methods algorithms and many types of processes'
stem_words(text) 

'data scienc use scientif method algorithm and mani type of process'

In [198]:
text = 'data science uses scientific methods algorithms and many types of processes'
lemmatize_word(text) 

'data science use scientific methods algorithms and many type of process'

# Bag-of-Words

In [249]:
import heapq
from sklearn.linear_model import LogisticRegression

In [250]:
vocab = list(set(' '.join(tweets['English'].values).split()))
len(vocab) * tweets.shape[0] # too large to store in memory

9708334020

In [251]:
def MostFreq(corpus):
    # Overall word frequency
    wordfreq = {}
    for sentence in corpus:
        tokens = sentence.split()
        for token in tokens:
            if token not in wordfreq.keys():
                wordfreq[token] = 1
            else:
                wordfreq[token] += 1
    most_freq = heapq.nlargest(300, wordfreq, key=wordfreq.get)
    return most_freq
    
def Sent2Vec(mostFreq, corpus):
    sentence_vectors = []
    for sentence in corpus:
        sentence_tokens = sentence.split()
        sent_vec = []
        for token in most_freq:
            if token in sentence_tokens:
                sent_vec.append(1)
            else:
                sent_vec.append(0)
        sentence_vectors.append(sent_vec)
    return sentence_vectors

In [252]:
column = 'English'
most_freq = MostFreq(tweets[column].values) # mostly filler words
sentence_vectors = Sent2Vec(most_freq, tweets[column].values)
X = np.asarray(sentence_vectors)
y = np.asarray(tweets['Affiliation'].values)

## Logistic Regression 

In [226]:

#print(X.shape)
#print(y.shape)
clf = LogisticRegression(random_state=0).fit(X, y)
print(clf.score(X, y))

0.6347906546379829


In [None]:
# removing stop words improved accuracy from 61% to 62%

# TF-IDF

In [208]:
def tfidf (corpus):
    most_freq = MostFreq(corpus)
    W = np.zeros((len(most_freq), tweets.shape[0]))
    doc = [0] * tweets.shape[0]
    for i in range(len(most_freq)):
        if i % 200:
            print('hi')
        term = most_freq[i]
        for idx, tweet in tweets.iterrows():
            Tf = tweet[column].count(term)
            if Tf > 0:
                doc[idx] = 1
            W[i, idx] = Tf
        df = math.log(len(doc) / (sum(doc) + 1))
        W[i, :] *= df
    return W.T

start = time.time()
column = 'English'
X = tfidf(tweets[column].values)
print('Finish Time: {}'.format(time.time() - start))

hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
Finish Time: 1306.0875182151794


In [217]:
y = np.asarray(tweets['Affiliation'].values)

In [219]:
test = X.copy()

In [220]:
norm = np.linalg.norm(test)

In [222]:
test /= norm

In [231]:
test

array([[0.00334267, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00334267, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00334267, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.0022075 , ..., 0.        , 0.        ,
        0.        ]])

In [223]:
clf = LogisticRegression(random_state=0).fit(test, y)
print(clf.score(X, y))

0.5399722414989591


In [228]:
from sklearn.svm import LinearSVC

In [230]:
clf = LinearSVC(random_state=0).fit(test, y)
print(clf.score(test, y))

0.527087670599121


# Support Vector Machines

# Naive Bayes

# Cross Validation

# Post Analysis: Accuracy, Precision, Recall, and F1-score

# Sentiment Analysis
    # Republican vs Democrat 
    # Positive vs Negative