In [3]:
from collections import defaultdict
import pandas as pd
import numpy as np
import io
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
# nltk.download('all') # Download only required or all -> press d, type all, type quit after
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords # not using these stopwords, still may be useful in the future
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, accuracy_score, f1_score, confusion_matrix, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

In [4]:
train_path = "content/training.csv"
val_path = "content/validation.csv"
test_path = "content/test.csv"

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

In [5]:
def tokenize(column):
    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]  

In [6]:
train_df['tokenized'] = train_df.apply(lambda x: tokenize(x['text']), axis=1)
val_df['tokenized'] = val_df.apply(lambda x: tokenize(x['text']), axis=1)
test_df['tokenized'] = test_df.apply(lambda x: tokenize(x['text']), axis=1)

In [7]:
lmtzr = WordNetLemmatizer()
train_df['lemmatized'] = train_df['tokenized'].apply(
                    lambda lst:[lmtzr.lemmatize(word) for word in lst])
val_df['lemmatized'] = val_df['tokenized'].apply(
                   lambda lst:[lmtzr.lemmatize(word) for word in lst])
test_df['lemmatized'] = test_df['tokenized'].apply(
                   lambda lst:[lmtzr.lemmatize(word) for word in lst])

print(train_df['lemmatized'])
print(val_df['lemmatized'])
print(test_df['lemmatized'])

0                             [i, didnt, feel, humiliated]
1        [i, can, go, from, feeling, so, hopeless, to, ...
2        [im, grabbing, a, minute, to, post, i, feel, g...
3        [i, am, ever, feeling, nostalgic, about, the, ...
4                                [i, am, feeling, grouchy]
                               ...                        
15995    [i, just, had, a, very, brief, time, in, the, ...
15996    [i, am, now, turning, and, i, feel, pathetic, ...
15997                [i, feel, strong, and, good, overall]
15998    [i, feel, like, this, wa, such, a, rude, comme...
15999    [i, know, a, lot, but, i, feel, so, stupid, be...
Name: lemmatized, Length: 16000, dtype: object
0       [im, feeling, quite, sad, and, sorry, for, mys...
1       [i, feel, like, i, am, still, looking, at, a, ...
2                   [i, feel, like, a, faithful, servant]
3               [i, am, just, feeling, cranky, and, blue]
4       [i, can, have, for, a, treat, or, if, i, am, f...
              

In [8]:
train_labels = np.array(train_df['label'])
val_labels = np.array(val_df['label'])
test_labels = np.array(test_df['label'])

In [9]:
Encoder = LabelEncoder()
train_labels = Encoder.fit_transform(train_labels)
val_labels = Encoder.fit_transform(val_labels)
test_labels = Encoder.fit_transform(test_labels)

In [20]:
stop = ['a', 'an', 'the', 'and', 'is', 'are', 'am', 'for', 'in', 'of', 'at', 'on']
Tfidf_vect = TfidfVectorizer(max_features=5000, stop_words=stop)
train_X_Tfidf = Tfidf_vect.fit_transform(train_df['lemmatized'].apply(lambda x: ' '.join(x)))
test_X_Tfidf = Tfidf_vect.transform(test_df['lemmatized'].apply(lambda x: ' '.join(x)))
val_X_Tfidf = Tfidf_vect.transform(val_df['lemmatized'].apply(lambda x: ' '.join(x)))

In [21]:
SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma=2**1)
SVM.fit(train_X_Tfidf, train_labels)
predictions = SVM.predict(test_X_Tfidf)

In [22]:
print("Accuracy Score -> ", accuracy_score(predictions, test_labels)*100)
confusion_matrix(y_true=test_labels, y_pred=predictions)

Accuracy Score ->  76.25


array([[525,  48,   0,   5,   3,   0],
       [ 11, 675,   7,   0,   1,   1],
       [ 17,  97,  43,   2,   0,   0],
       [ 46,  83,   0, 145,   1,   0],
       [ 39,  56,   0,   5, 124,   0],
       [ 10,  29,   0,   0,  14,  13]], dtype=int64)