In [1]:
from google.colab import files
from collections import defaultdict
import pandas as pd
import numpy as np
import io
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
#nltk.download () # Download all -> press d, type all, type quit after
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, accuracy_score, f1_score, confusion_matrix, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Locations of the training and validation CSV splits.
train_path, val_path = "/content/training.csv", "/content/validation.csv"

# Parse each split into its own DataFrame.
train_df, val_df = (pd.read_csv(csv_file) for csv_file in (train_path, val_path))

In [3]:
def tokenize(column):
    """Split one text value into alphabetic word tokens.

    Args:
        column: The text of a single row (despite the name, this is one cell
            value, not a whole column).

    Returns:
        list[str]: The tokens produced by ``nltk.word_tokenize`` for which
        ``str.isalpha()`` is true, in their original order. Non-string input
        (for example the NaN pandas yields for an empty CSV cell, or None)
        gives an empty list.
    """
    # The tokenizer only accepts strings; a missing cell has no tokens.
    if not isinstance(column, str):
        return []
    return [token for token in nltk.word_tokenize(column) if token.isalpha()]

In [4]:
#stop = stopwords.words('english')

#train_df['without_stopwords'] = train_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
#val_df['without_stopwords'] = val_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [5]:
# Tokenize the raw text of every row. `tokenize` is applied to the 'text'
# column directly instead of through a row-wise DataFrame.apply(axis=1): only
# that one column is needed, no per-row object has to be built, and the
# result is a Series of token lists even when a frame has no rows.
train_df['tokenized'] = train_df['text'].apply(tokenize)
val_df['tokenized'] = val_df['text'].apply(tokenize)

In [6]:
lmtzr = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the lemma of every token in ``tokens``, preserving order."""
    return [lmtzr.lemmatize(token) for token in tokens]


# NOTE(review): lemmatize() is called without a part-of-speech argument; the
# recorded output contains 'wa' (presumably from 'was') - confirm that this
# is acceptable for the downstream model.
train_df['lemmatized'] = train_df['tokenized'].apply(_lemmatize_tokens)
val_df['lemmatized'] = val_df['tokenized'].apply(_lemmatize_tokens)

# Show the lemmatized token lists of both splits, training first.
print(train_df['lemmatized'], val_df['lemmatized'], sep='\n')


0                             [i, didnt, feel, humiliated]
1        [i, can, go, from, feeling, so, hopeless, to, ...
2        [im, grabbing, a, minute, to, post, i, feel, g...
3        [i, am, ever, feeling, nostalgic, about, the, ...
4                                [i, am, feeling, grouchy]
                               ...                        
15995    [i, just, had, a, very, brief, time, in, the, ...
15996    [i, am, now, turning, and, i, feel, pathetic, ...
15997                [i, feel, strong, and, good, overall]
15998    [i, feel, like, this, wa, such, a, rude, comme...
15999    [i, know, a, lot, but, i, feel, so, stupid, be...
Name: lemmatized, Length: 16000, dtype: object
0       [im, feeling, quite, sad, and, sorry, for, mys...
1       [i, feel, like, i, am, still, looking, at, a, ...
2                   [i, feel, like, a, faithful, servant]
3               [i, am, just, feeling, cranky, and, blue]
4       [i, can, have, for, a, treat, or, if, i, am, f...
              

Stemming

In [7]:
stemmer = SnowballStemmer("english")


def _stem_and_join(tokens):
    """Stem each token and join the stems into one space-separated string.

    A value that is already a string is returned unchanged. Once this cell
    has run, the 'lemmatized' column holds joined strings rather than token
    lists; without this guard a second run of the cell would iterate over
    the characters of each string and stem them one by one.
    """
    if isinstance(tokens, str):
        return tokens
    return ' '.join(stemmer.stem(word) for word in tokens)


# The stemmed text is written back to 'lemmatized' because the vectorizer
# cell reads its input from that column.
train_df['lemmatized'] = train_df['lemmatized'].map(_stem_and_join)
val_df['lemmatized'] = val_df['lemmatized'].map(_stem_and_join)

display(train_df["lemmatized"])
display(val_df["lemmatized"])

0                                      i didnt feel humili
1        i can go from feel so hopeless to so damn hope...
2              im grab a minut to post i feel greedi wrong
3        i am ever feel nostalg about the fireplac i wi...
4                                        i am feel grouchi
                               ...                        
15995    i just had a veri brief time in the beanbag an...
15996    i am now turn and i feel pathet that i am stil...
15997                        i feel strong and good overal
15998    i feel like this wa such a rude comment and im...
15999    i know a lot but i feel so stupid becaus i can...
Name: lemmatized, Length: 16000, dtype: object

0       im feel quit sad and sorri for myself but ill ...
1       i feel like i am still look at a blank canva b...
2                             i feel like a faith servant
3                          i am just feel cranki and blue
4           i can have for a treat or if i am feel festiv
                              ...                        
1995    im have ssa examin tomorrow in the morn im qui...
1996    i constant worri about their fight against nat...
1997    i feel it import to share this info for those ...
1998    i truli feel that if you are passion enough ab...
1999    i feel like i just wan na buy ani cute make up...
Name: lemmatized, Length: 2000, dtype: object

Preprocessing...

In [8]:
# Bag-of-words counts: the vocabulary is learned from the training text only
# and then reused to encode the validation text.
# NOTE(review): stop_words='english' is applied to text that has already been
# stemmed (the stemmed output contains forms such as 'becaus' and 'veri') -
# confirm whether stemmed stop words are meant to survive.
cv = CountVectorizer(stop_words='english')
train_text = train_df["lemmatized"]
val_text = val_df["lemmatized"]
train_ppd_df = cv.fit_transform(train_text)
val_ppd_df = cv.transform(val_text)

# Despite the `_df` suffix, both results display as sparse matrices.
display(train_ppd_df, val_ppd_df)

<16000x10082 sparse matrix of type '<class 'numpy.int64'>'
	with 136700 stored elements in Compressed Sparse Row format>

<2000x10082 sparse matrix of type '<class 'numpy.int64'>'
	with 16253 stored elements in Compressed Sparse Row format>

In [9]:
# Pull the 'label' column out of each split as a NumPy array.
train_labels, val_labels = (
    np.array(split['label']) for split in (train_df, val_df)
)

#trainX,testX,trainY,testY = train_test_split(train_ppd_df,train_df.label)
#display(testX)

In [10]:

# Fit a multinomial Naive Bayes classifier on the training counts and predict
# a label for every validation document.
mnb = MultinomialNB()
mnb.fit(train_ppd_df, train_labels)
predictions_NB = mnb.predict(val_ppd_df)

#mnb.fit(trainX,trainY)
#predictions_NB = mnb.predict(testX)

# scikit-learn metrics take (y_true, y_pred). Both arguments are passed by
# keyword so the accuracy call cannot be read with the roles swapped and
# matches the confusion_matrix call below.
print("Accuracy Score -> ", accuracy_score(y_true=val_labels, y_pred=predictions_NB)*100)
confusion_matrix(y_true=val_labels, y_pred=predictions_NB)

Accuracy Score ->  78.4


array([[512,  25,   0,   6,   6,   1],
       [ 40, 647,   5,   8,   4,   0],
       [ 29,  77,  68,   3,   1,   0],
       [ 45,  33,   2, 188,   7,   0],
       [ 35,  24,   0,   7, 144,   2],
       [ 29,  32,   1,   1,   9,   9]])