In [1]:
from functools import reduce

import nltk
from nltk.stem import WordNetLemmatizer
import pandas as pd
import string
import re

In [9]:
#Inspecting data
full_corpus = pd.read_csv('/Users/User/221project/data/party_data2.csv', header=None, names = ['label', 'tweet'], usecols = [1, 2], nrows=60000)
# print("Input data has {} rows and {} columns".format(len(full_corpus), len(full_corpus.columns)))
# print(full_corpus.info())
full_corpus

Unnamed: 0,label,tweet
0,dem,get the facts about romney-ryan and
1,dem,today i am celebrating the freedom of every am...
2,rep,don was a major difference maker in he will mo...
3,dem,today i am introducing the mommies act legisla...
4,dem,live from toledo watch hillary speak on her pl...
5,rep,shhh election-watching elites convinced crowds...
6,rep,u.s senators including senate maj ldr senator ...
7,rep,today we thank senator kirk for the impact he ...
8,rep,social media tech companies are no longer cens...
9,dem,when i was coming up i remember how badly i wa...


In [11]:

# Separating messages into dem and rep
dem_text = []
rep_text = []

def separate_msgs():
    for index, column in full_corpus.iterrows():
        label = column[0]
        message_text = column[1]
        if label == 'dem':
            dem_text.append(message_text)
        elif label == 'rep':
            rep_text.append(message_text)

separate_msgs()

# Preprocessing of text

#removing punctuation marks from the email messages
def remove_msg_punctuations(email_msg):
    puntuation_removed_msg = "".join([word for word in email_msg if word not in string.punctuation])
    return puntuation_removed_msg

#converting text into lowercase and word tokenizing
def tokenize_into_words(text):
    tokens = re.split('\W+', text)
    return tokens

#lemmatizing
word_lemmatizer = WordNetLemmatizer()
def lemmatization(tokenized_words):
    lemmatized_text = [word_lemmatizer.lemmatize(word)for word in tokenized_words]
    return ' '.join(lemmatized_text)

def preprocessing_msgs(corpus):
    categorized_text = pd.DataFrame(corpus)
    categorized_text['non_punc_message_body'] = categorized_text[0].apply(lambda msg: remove_msg_punctuations(msg))
    categorized_text['tokenized_msg_body'] = categorized_text['non_punc_message_body'].apply(lambda msg: tokenize_into_words(msg.lower()))
    categorized_text['lemmatized_msg_words'] = categorized_text['tokenized_msg_body'].apply(lambda word_list: lemmatization(word_list))
    return categorized_text['lemmatized_msg_words']

# Extracting features i.e. n-grams
def feature_extraction(preprocessed_text):
    bigrams = []
    unigrams_lists = []
    for msg in preprocessed_text:
        # adding end of and start of a message
        msg = '<s> ' +msg +' </s>'
        unigrams_lists.append(msg.split())
    unigrams = [uni_list for sub_list in unigrams_lists for uni_list in sub_list]
    bigrams.extend(nltk.bigrams(unigrams))
    return bigrams

# removing bigrams only with stop words
stopwords = nltk.corpus.stopwords.words('english')
def filter_stopwords_bigrams(bigram_list):
    filtered_bigrams = []
    for bigram in bigram_list:
        if bigram[0] in stopwords and bigram[1] in stopwords:
            continue
        filtered_bigrams.append(bigram)
    return filtered_bigrams

# Acquiring frequencies of features
def dem_bigram_feature_frequency():
    # features frequency for dem messages
    dem_bigrams = feature_extraction(preprocessing_msgs(dem_text))
    dem_bigram_frequency = nltk.FreqDist(filter_stopwords_bigrams(dem_bigrams))
    return dem_bigram_frequency

def rep_bigram_feature_frequency():
    # features frequency for rep messages
    rep_bigrams = feature_extraction(preprocessing_msgs(rep_text))
    rep_bigram_frequency = nltk.FreqDist(filter_stopwords_bigrams(rep_bigrams))
    return rep_bigram_frequency

# calculating bigram probabilities
def bigram_probability(message):
    probability_d = 1
    probability_r = 1
    # preprocessing input messages
    punc_removed_message = "".join(word for word in message if word not in string.punctuation)
    punc_removed_message = '<s> ' +punc_removed_message +' </s>'
    tokenized_msg = re.split('\s+', punc_removed_message)
    lemmatized_msg = [word_lemmatizer.lemmatize(word)for word in tokenized_msg]
    # bigrams for message
    bigrams_for_msg = list(nltk.bigrams(lemmatized_msg))
    # stop words removed unigrams for vocabulary
    dem_unigrams = [word for word in feature_extraction(preprocessing_msgs(dem_text)) if word not in stopwords]
    rep_unigrams = [word for word in feature_extraction(preprocessing_msgs(rep_text)) if word not in stopwords]
    # frequecies of bigrams extracted
    dem_frequency = dem_bigram_feature_frequency()
    rep_frequency  = rep_bigram_feature_frequency()
    print('========================== Calculating Probabilities ==========================')
    
    print('----------- Dem Freuquencies ------------')
    for bigram in bigrams_for_msg:
        # probability of first word in bigram
        dem_probability_denominator = 0
        # probability of bigram (smoothed)
        dem_probability_of_bigram = dem_frequency[bigram] + 1
        print(bigram, ' occurs ', dem_probability_of_bigram)
        for (first_unigram, second_unigram) in filter_stopwords_bigrams(dem_unigrams):
            dem_probability_denominator += 1
            if(first_unigram == bigram[0]):
                dem_probability_denominator += dem_frequency[first_unigram, second_unigram]
        probability = dem_probability_of_bigram / dem_probability_denominator
        probability_d *= probability
    print('\n')
    print('-----------  Freuquencies ------------')
    for bigram in bigrams_for_msg:
        # probability of first word in bigram
        rep_probability_denominator = 0
        # probability of bigram (smoothed)
        rep_probability_of_bigram = rep_frequency[bigram] + 1
        print(bigram, ' occurs ', rep_probability_of_bigram)
        for (first_unigram, second_unigram) in filter_stopwords_bigrams(rep_unigrams):
            rep_probability_denominator += 1
            if(first_unigram == bigram[0]):
                rep_probability_denominator += rep_frequency[first_unigram, second_unigram]
        probability = rep_probability_of_bigram / rep_probability_denominator
        probability_r *= probability
    print('\n')
    print('Dem Probability: ' +str(probability_d))
    print('Rep Probability: ' +str(probability_r))
    print('\n')
    if(probability_d >= probability_r):
        print('\"' +message +'\" is a Dem message')
    else:
        print('\"' +message +'\" is a Rep message')
    print('\n')
bigram_probability('Women should have access to abortion')
bigram_probability('Donald trump is the best president ever')


----------- Dem Freuquencies ------------
(('<s>', 'Women'), ' occurs ', 1)
(('Women', 'should'), ' occurs ', 1)
(('should', 'have'), ' occurs ', 1)
(('have', 'access'), ' occurs ', 49)
(('access', 'to'), ' occurs ', 426)
(('to', 'abortion'), ' occurs ', 30)
(('abortion', '</s>'), ' occurs ', 35)


-----------  Freuquencies ------------
(('<s>', 'Women'), ' occurs ', 1)
(('Women', 'should'), ' occurs ', 1)
(('should', 'have'), ' occurs ', 1)
(('have', 'access'), ' occurs ', 20)
(('access', 'to'), ' occurs ', 127)
(('to', 'abortion'), ' occurs ', 18)
(('abortion', '</s>'), ' occurs ', 51)


Dem Probability: 0
Rep Probability: 0


"Women should have access to abortion" is a Dem message


----------- Dem Freuquencies ------------
(('<s>', 'Donald'), ' occurs ', 1)
(('Donald', 'trump'), ' occurs ', 1)
(('trump', 'is'), ' occurs ', 228)
(('is', 'the'), ' occurs ', 1)
(('the', 'best'), ' occurs ', 175)
(('best', 'president'), ' occurs ', 1)
(('president', 'ever'), ' occurs ', 1)
(('ever', '<