In [None]:
pip install --upgrade nltk

# Part 1

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re
import string
import math

import warnings
warnings.filterwarnings("ignore")

In [2]:
col_names = ["target", "ids", "date", "flag", "user", "text"]
data = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
            names = col_names,
            encoding = "ISO-8859-1")
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
data.shape

(1600000, 6)

In [4]:
target_df = data['target'].value_counts()
target_df

target
0    800000
4    800000
Name: count, dtype: int64

In [5]:
data.target = data.target.map({0 : 0,
                               4 : 1})

In [6]:
data0 = data[data['target'] == 0].sample(n=5000)
data1 = data[data['target'] == 1].sample(n=5000)

In [7]:
data = pd.concat([data0, data1])
data = data.drop(['ids', 'date', 'flag','user'], axis = 1)

In [8]:
X, y  = data['text'].tolist(), data['target'].to_numpy()

In [9]:
def preprocess_twitter(text):
    # make text lowercase
    text = str(text).lower()
    
    # remove stop words
    stopwords_list = stopwords.words('english')
    STOPWORDS = set(stopwords_list)
    text = " ".join([word for word in str(text).split() if word not in STOPWORDS])
    
    # remove emails
    text = re.sub('@[^\s]+', ' ', text)
    
    # remove urls
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',text)
    
    # remove numbers
    text = re.sub('[0-9]+', '', text)
    
    # remove punctuations
    english_punctuations = string.punctuation
    translator = str.maketrans('', '', english_punctuations)
    return text.translate(translator)

In [10]:
def stemming(docs):
    ps = PorterStemmer()

    stemmed_docs = []

    for sentence in docs:
        words = word_tokenize(sentence)

        # using reduce to apply stemmer to each word and join them back into a string
        stemmed_sentence = [ps.stem(w) for w in words]

        stemmed_docs.append(stemmed_sentence)
    return stemmed_docs

In [76]:
X = [preprocess_twitter(text) for text in X]

In [77]:
X = stemming(X)

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Part 2

In [79]:
Unknown_token = "<UNK>"
def find_vocab(docs):
    vocab = set()
    for doc in docs:
        s = set(doc)
        vocab = vocab.union(s)
    vocab.add(Unknown_token) 
    return vocab

In [80]:
def count_dict(doc, vocab):
    count_words = dict.fromkeys(vocab, 0)
    for word in doc:
        count_words[word]+=1
    return count_words

def find_term_freq_matrix(docs, vocab):
    term_freq = []
    for doc in docs:
        count_words = count_dict(doc, vocab)
        term_freq.append(count_words)
    
    return pd.DataFrame(term_freq)

In [81]:
train_vocab = find_vocab(X_train)

In [82]:
train_term_freq_df = find_term_freq_matrix(X_train, train_vocab)

In [83]:
train_term_freq_df

Unnamed: 0,dooooooooooooooooooooooooooo,clubi,radiu,thk,scene,ole,wthe,rochest,academyquot,christin,...,carolina,impromtu,shatter,badminton,beachim,regal,obviou,smilequot,twitpic,indien
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
# replace unknown words in test data with <UNK>
for doc in X_test:
    i = 0
    while i < len(doc):
        word = doc[i]
        if word not in train_vocab:
            doc.remove(word)
            doc.append(Unknown_token)
            i-=1
        i+=1

In [86]:
test_term_freq_df = find_term_freq_matrix(X_test, train_vocab)

# Part 3

In [90]:
def find_word_count(docs, vocab):
    word_count = dict.fromkeys(vocab, 0)
    for word in vocab:
        for doc in docs:
            if word in doc:
                word_count[word] += 1
    return word_count

In [91]:
#Term Frequency
def termfreq(doc, word):
    return 1+math.log10(doc[word]) if doc[word] > 0 else 0

In [97]:
#Inverse Document Frequency
def inverse_doc_freq(N, word_count, word):
    df = word_count[word]
    if df == 0:
        return 0
    return math.log10(N/df)

In [102]:
def find_tf_idf_matrix(term_freq_df, word_count):
    tf_idf_mat = []
    N = len(term_freq_df)
    progress_bar = tqdm(N, total=N, position=0, leave=True)
    
    for index, row in term_freq_df.iterrows():
        tf_idf_doc = dict.fromkeys(term_freq_df.columns.values, 0)
        
        for word in term_freq_df.columns.values:
            tf = termfreq(row, word)
            idf = inverse_doc_freq(N, word_count, word)
            tf_idf_doc[word] = tf*idf
            
        tf_idf_mat.append(tf_idf_doc)
        progress_bar.update(1)
        
    return pd.DataFrame(tf_idf_mat)

In [103]:
train_word_count = find_word_count(X_train, train_vocab)

In [104]:
test_word_count = find_word_count(X_test, train_vocab)

In [105]:
train_tf_idf = find_tf_idf_matrix(train_term_freq_df, train_word_count)

100%|██████████| 8000/8000 [11:13<00:00, 11.88it/s]


In [106]:
test_tf_idf = find_tf_idf_matrix(test_term_freq_df, test_word_count)

100%|██████████| 2000/2000 [02:50<00:00, 11.76it/s]


In [107]:
train_tf_idf

Unnamed: 0,dooooooooooooooooooooooooooo,clubi,radiu,thk,scene,ole,wthe,rochest,academyquot,christin,...,carolina,impromtu,shatter,badminton,beachim,regal,obviou,smilequot,twitpic,indien
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Part 4

In [111]:
def find_ppmi_matrix(term_freq_df, word_count):
    ppmi_mat = []
    total_words = sum(term_freq_df.sum())
    N = len(term_freq_df)
    progress_bar = tqdm(N, total=N, position=0, leave=True)
    
    for index, row in term_freq_df.iterrows():
        ppmi_doc = dict.fromkeys(term_freq_df.columns.values, 0)
        
        for word in term_freq_df.columns.values:
            pwc = row[word]/total_words
            pw = word_count[word]/total_words
            pc = row.sum()/total_words
            t = pwc/(pw*pc)
            ppmi = 0
            if t != 0:
                ppmi = max(0, math.log2(t))
            ppmi_doc[word] = ppmi
            
        ppmi_mat.append(ppmi_doc)
        progress_bar.update(1)
        
    return pd.DataFrame(ppmi_mat)

In [112]:
train_ppmi = find_ppmi_matrix(train_term_freq_df, train_word_count)

100%|██████████| 8000/8000 [1:28:41<00:00,  1.50it/s]


In [113]:
train_ppmi

Unnamed: 0,dooooooooooooooooooooooooooo,clubi,radiu,thk,scene,ole,wthe,rochest,academyquot,christin,...,carolina,impromtu,shatter,badminton,beachim,regal,obviou,smilequot,twitpic,indien
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
test_ppmi = find_ppmi_matrix(test_term_freq_df, test_word_count)

100%|██████████| 2000/2000 [53:26<00:00,  1.60s/it]


# Part 5

In [46]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## with term frequency embedding

In [87]:
mng = MultinomialNB()
y_pred = mng.fit(train_term_freq_df, y_train).predict(test_term_freq_df)

In [88]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [89]:
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1 score: {f1}')

precision: 0.7157784743991641
recall: 0.6933198380566802
f1 score: 0.7043701799485862


## with Tf-Idf embedding

In [108]:
mng_tf_idf = MultinomialNB()
y_pred_tf_idf = mng_tf_idf.fit(train_tf_idf, y_train).predict(test_tf_idf)

In [115]:
precision = precision_score(y_test, y_pred_tf_idf)
recall = recall_score(y_test, y_pred_tf_idf)
f1 = f1_score(y_test, y_pred_tf_idf)

In [116]:
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1 score: {f1}')

precision: 0.6909469302809573
recall: 0.6720647773279352
f1 score: 0.681375064135454


## with ppmi embedding

In [117]:
mng_ppmi = MultinomialNB()
y_pred_ppmi = mng_ppmi.fit(train_ppmi, y_train).predict(test_ppmi)

In [118]:
precision = precision_score(y_test, y_pred_ppmi)
recall = recall_score(y_test, y_pred_ppmi)
f1 = f1_score(y_test, y_pred_ppmi)

In [119]:
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1 score: {f1}')

precision: 0.6892596454640251
recall: 0.6690283400809717
f1 score: 0.6789933230611197
