In [None]:
# !pip install pandas
# !pip install gensim
# !pip install nltk
# !pip install num2words
# !pip install symspellpy
# nltk.download('punkt')
# !pip install inflect
# !pip install seaborn

In [1]:
import warnings
# Suppress DeprecationWarning messages so they do not clutter the cell outputs below.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import pandas as pd
import pkg_resources
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from decimal import Decimal
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix

In [3]:
# # !pip install fasttext
import fasttext

In [4]:
# Read the preprocessed dataset into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='preprocessed.csv')

# Word embedding using Fast Text.

I selected FastText because it is faster than word2vec and GloVe for training on large corpora: it uses a hierarchical softmax approach that reduces the complexity of the training process.

You may download the pre-trained vector file here: https://fasttext.cc/docs/en/crawl-vectors.html

In [None]:
# Decompress the downloaded archive to obtain cc.es.300.bin, the file loaded in the next cell.
!gunzip cc.es.300.bin.gz

In [5]:
# Load the pre-trained fastText binary; `model` is used below to embed each text.
# NOTE(review): the "es" in cc.es.300.bin presumably denotes the Spanish vectors, while the
# sample rows displayed further down look like English text — confirm the intended language.
model = fasttext.load_model('cc.es.300.bin')



In [6]:
def get_text_vector(sentence):
    """Return the fastText sentence embedding for a single text.

    Parameters
    ----------
    sentence : str
        Text to embed. Non-string values (for example the float NaN that
        ``pd.read_csv`` produces for an empty cell) are treated as an empty
        string instead of raising.

    Returns
    -------
    The vector produced by ``model.get_sentence_vector`` for the cleaned text.
    """
    # Missing cells arrive as non-string objects, which fastText cannot process.
    if not isinstance(sentence, str):
        sentence = ""
    # fastText handles one line at a time and rejects embedded newlines,
    # so flatten them to spaces; strings without "\n" pass through unchanged.
    single_line = sentence.replace("\n", " ")
    return model.get_sentence_vector(single_line)

In [7]:
# Embed every row's text and store the resulting vectors in a new column.
sentence_vectors = df['text'].apply(get_text_vector)
df['text_vectors'] = sentence_vectors

In [8]:
# Preview the rows at positions 2 and 3 to check that text_vectors is populated.
df.iloc[2:4]

Unnamed: 0,text,class,char_length,token_length,text_vectors
2,finally thousand hear thousand bad year swear ...,0,66,10,"[0.0036457763, 0.0013083905, 0.05972732, 0.079..."
3,need help help hard,1,33,6,"[-0.04169218, -0.02779561, 0.084972635, 0.0743..."


In [9]:
# Features are the per-text embedding vectors; the target is the class label.
X, y = df['text_vectors'], df['class']

# Let's use Naive Bayes as our baseline model given its simplicity, before we try something more complex.

# 1) Naive Bayes

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [11]:
from sklearn.naive_bayes import GaussianNB

In [12]:
# Baseline model: Gaussian Naive Bayes fitted on the training embeddings.
# list(...) turns the Series of per-row vectors into a list of rows for scikit-learn.
clf = GaussianNB()
clf.fit(X=list(X_train), y=y_train)

GaussianNB()

In [13]:
# Score the Naive Bayes baseline on the held-out split.
y_pred = clf.predict(list(X_test))

# Flatten the 2x2 confusion matrix into its four counts.
clf_tn, clf_fp, clf_fn, clf_tp = confusion_matrix(y_test, y_pred).ravel()

# Derive every metric first, then report them together.
clf_accuracy = (clf_tn + clf_tp) / (clf_tn + clf_fp + clf_fn + clf_tp)
clf_precision = clf_tp / (clf_fp + clf_tp)
clf_recall = clf_tp / (clf_fn + clf_tp)
clf_f1_score = 2 * clf_precision * clf_recall / (clf_recall + clf_precision)

print(" Accuracy", round(clf_accuracy, 4))
print(
    " Precision", round(clf_precision, 4), "\n",
    "Recall", round(clf_recall, 4), "\n",
    "F1", round(clf_f1_score, 4),
)

 Accuracy 0.7918
 Precision 0.7576 
 Recall 0.8603 
 F1 0.8057


Our model performs decently well. Let's try other models.

# Let us see if our data points are linearly separable. If there is a line (in general, a hyperplane) that can separate the data points of the two classes, then the problem is linear

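We will be using a linear Support Vector Machine to check.
If the training accuracy is 100%, the data is linearly separable. Otherwise it may not be; note that a soft-margin SVM can misclassify some points even when a separating hyperplane exists, so an accuracy below 100% is only an indication.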

# 2) LinearSVC

In [14]:
from sklearn.svm import LinearSVC

In [20]:
# Fit a linear SVM and use its accuracy on the training split as a rough separability check.
# NOTE(review): LinearSVC is used with its default regularisation here, so a training
# accuracy below 1.0 does not by itself prove the data is non-separable.
svm = LinearSVC(random_state=0)
svm.fit(list(X_train), y_train)
accuracy = svm.score(list(X_train), y_train)

print("Based on training")
if accuracy != 1.0:
    # At least one training point is misclassified: show the score with the verdict.
    print(accuracy)
    print("The dataset might not linearly separable.")
else:
    print("The dataset is linearly separable.")

Based on training
0.8958408543517735
The dataset might not linearly separable.


In [21]:
# Score the already-fitted linear SVM on the held-out split.
# NOTE(review): test accuracy measures generalisation; a value below 1.0 here is not
# evidence about linear separability of the data — confirm the intended wording.
accuracy = svm.score(list(X_test), y_test)

print("Test on test data")
if accuracy != 1.0:
    print(accuracy)
    print("Not linearly separable.")
else:
    print("Linearly separable.")

Test on test data
0.8933574723048407
Not linearly separable.


In [22]:
# Score the default linear SVM on the held-out split.
y_pred = svm.predict(list(X_test))

# Flatten the 2x2 confusion matrix into its four counts.
clf_tn, clf_fp, clf_fn, clf_tp = confusion_matrix(y_test, y_pred).ravel()

# Derive every metric first, then report them together.
clf_accuracy = (clf_tn + clf_tp) / (clf_tn + clf_fp + clf_fn + clf_tp)
clf_precision = clf_tp / (clf_fp + clf_tp)
clf_recall = clf_tp / (clf_fn + clf_tp)
clf_f1_score = 2 * clf_precision * clf_recall / (clf_recall + clf_precision)

print(" Accuracy", round(clf_accuracy, 4))
print(
    " Precision", round(clf_precision, 4), "\n",
    "Recall", round(clf_recall, 4), "\n",
    "F1", round(clf_f1_score, 4),
)

 Accuracy 0.8934
 Precision 0.8776 
 Recall 0.9151 
 F1 0.896


We can note that LinearSVC still performs decently, though. Let us try to fine-tune it.

In [107]:
from sklearn.model_selection import RandomizedSearchCV

In [108]:
# Candidate regularisation strengths for the linear SVM.
# NOTE(review): the grid jumps from 1 straight to 1000 (no 10 or 100) — confirm the gap is intended.
param_grid = {'C': [0.01, 0.1, 1, 1000,10000,100000]}

# Base estimator whose C is tuned (named gs_clf like the other search cells in this notebook).
gs_clf = LinearSVC(random_state=0)

# RandomizedSearchCV defaults to n_iter=10, which is more than the 6 candidates above:
# scikit-learn then warns and falls back to trying each candidate once. Asking for
# exactly one iteration per candidate keeps the same search without the warning.
clf = RandomizedSearchCV(
    gs_clf,
    param_grid,
    n_iter=len(param_grid['C']),
    random_state=0,
    cv=5,
    n_jobs=-1,
)

In [109]:
# Run the 5-fold cross-validated search over C on the training split.
clf.fit(list(X_train),y_train)



RandomizedSearchCV(cv=5, estimator=LinearSVC(random_state=0), n_jobs=-1,
                   param_distributions={'C': [0.01, 0.1, 1, 1000, 10000,
                                              100000]},
                   random_state=0)

In [111]:
# Display the C value selected by the search.
clf.best_params_

{'C': 1}

In [18]:
# Refit the linear SVM with the C value reported by the search.
svm = LinearSVC(random_state=0, C=1)
svm.fit(X=list(X_train), y=y_train)

LinearSVC(C=1, random_state=0)

In [23]:
# Score the refitted linear SVM on the held-out split.
y_pred = svm.predict(list(X_test))

# Flatten the 2x2 confusion matrix into its four counts.
clf_tn, clf_fp, clf_fn, clf_tp = confusion_matrix(y_test, y_pred).ravel()

# Derive every metric first, then report them together.
clf_accuracy = (clf_tn + clf_tp) / (clf_tn + clf_fp + clf_fn + clf_tp)
clf_precision = clf_tp / (clf_fp + clf_tp)
clf_recall = clf_tp / (clf_fn + clf_tp)
clf_f1_score = 2 * clf_precision * clf_recall / (clf_recall + clf_precision)

print(" Accuracy", round(clf_accuracy, 4))
print(
    " Precision", round(clf_precision, 4), "\n",
    "Recall", round(clf_recall, 4), "\n",
    "F1", round(clf_f1_score, 4),
)

 Accuracy 0.8934
 Precision 0.8776 
 Recall 0.9151 
 F1 0.896


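No difference after tuning: the search selected C=1, which is the value LinearSVC already uses by default, so the refitted model is the same as before.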

# Since a fine-tuned linear classifier still cannot reach 100% accuracy, there is a good chance that this is a non-linear problem, so we will explore non-linear models next

# 3) Decision Tree

In [24]:
from sklearn.tree import DecisionTreeClassifier

In [25]:
# First non-linear model: a shallow decision tree (depth capped at 3).
clf = DecisionTreeClassifier(
    max_depth=3,
    random_state=0,
)

In [26]:
# Fit the depth-3 decision tree on the training split.
clf.fit(list(X_train),y_train)

DecisionTreeClassifier(max_depth=3, random_state=0)

In [27]:
# Score the depth-3 decision tree on the held-out split.
y_pred = clf.predict(list(X_test))

# Flatten the 2x2 confusion matrix into its four counts.
clf_tn, clf_fp, clf_fn, clf_tp = confusion_matrix(y_test, y_pred).ravel()

# Derive every metric first, then report them together.
clf_accuracy = (clf_tn + clf_tp) / (clf_tn + clf_fp + clf_fn + clf_tp)
clf_precision = clf_tp / (clf_fp + clf_tp)
clf_recall = clf_tp / (clf_fn + clf_tp)
clf_f1_score = 2 * clf_precision * clf_recall / (clf_recall + clf_precision)

print(" Accuracy", round(clf_accuracy, 4))
print(
    " Precision", round(clf_precision, 4), "\n",
    "Recall", round(clf_recall, 4), "\n",
    "F1", round(clf_f1_score, 4),
)

 Accuracy 0.7899
 Precision 0.8265 
 Recall 0.7357 
 F1 0.7785


Try to tune Decision Tree

In [28]:
from sklearn.model_selection import RandomizedSearchCV
#GridSearch Takes Too Long

In [29]:
# Search space for the decision tree: depth cap plus the two minimum-sample constraints
# (3 x 3 x 3 = 27 combinations in total).
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
}
gs_clf = DecisionTreeClassifier(random_state=0)

# Randomized search with 5-fold cross-validation, using all available cores.
clf = RandomizedSearchCV(
    gs_clf,
    param_grid,
    cv=5,
    n_jobs=-1,
    random_state=0,
)

In [30]:
# Run the 5-fold cross-validated decision-tree search on the training split.
clf.fit(list(X_train),y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
                   n_jobs=-1,
                   param_distributions={'max_depth': [3, 5, 7],
                                        'min_samples_leaf': [1, 2, 3],
                                        'min_samples_split': [2, 4, 6]},
                   random_state=0)

In [31]:
# Display the decision-tree settings selected by the search.
clf.best_params_

{'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 7}

In [35]:
# Rebuild the decision tree with the settings reported by the search and fit it.
clf = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=4,
    min_samples_leaf=1,
    random_state=0,
)
clf.fit(list(X_train), y_train)

# Score the tuned tree on the held-out split.
y_pred = clf.predict(list(X_test))

# Flatten the 2x2 confusion matrix into its four counts.
clf_tn, clf_fp, clf_fn, clf_tp = confusion_matrix(y_test, y_pred).ravel()

# Derive every metric first, then report them together.
clf_accuracy = (clf_tn + clf_tp) / (clf_tn + clf_fp + clf_fn + clf_tp)
clf_precision = clf_tp / (clf_fp + clf_tp)
clf_recall = clf_tp / (clf_fn + clf_tp)
clf_f1_score = 2 * clf_precision * clf_recall / (clf_recall + clf_precision)

print(" Accuracy", round(clf_accuracy, 4))
print(
    " Precision", round(clf_precision, 4), "\n",
    "Recall", round(clf_recall, 4), "\n",
    "F1", round(clf_f1_score, 4),
)

 Accuracy 0.8395
 Precision 0.8487 
 Recall 0.8278 
 F1 0.8381


# 4) Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier

Using what we learned from tuning the decision tree, let's start with the same parameters.

In [37]:
# Random forest that reuses the tree settings found for the single decision tree.
clf = RandomForestClassifier(
    max_depth=7,
    min_samples_split=4,
    min_samples_leaf=1,
    random_state=0,
)

In [38]:
# Fit the random forest on the training split.
clf.fit(list(X_train),y_train)

RandomForestClassifier(max_depth=7, min_samples_split=4, random_state=0)

In [39]:
# Score the random forest on the held-out split.
y_pred = clf.predict(list(X_test))

# Flatten the 2x2 confusion matrix into its four counts.
clf_tn, clf_fp, clf_fn, clf_tp = confusion_matrix(y_test, y_pred).ravel()

# Derive every metric first, then report them together.
clf_accuracy = (clf_tn + clf_tp) / (clf_tn + clf_fp + clf_fn + clf_tp)
clf_precision = clf_tp / (clf_fp + clf_tp)
clf_recall = clf_tp / (clf_fn + clf_tp)
clf_f1_score = 2 * clf_precision * clf_recall / (clf_recall + clf_precision)

print(" Accuracy", round(clf_accuracy, 4))
print(
    " Precision", round(clf_precision, 4), "\n",
    "Recall", round(clf_recall, 4), "\n",
    "F1", round(clf_f1_score, 4),
)

 Accuracy 0.8762
 Precision 0.8791 
 Recall 0.8735 
 F1 0.8763


The results look promising. Let us fine-tune the model.

In [42]:
# Only the number of trees is tuned; the tree-shape settings stay fixed.
param_grid = {'n_estimators':[100,200,300]}
gs_clf = RandomForestClassifier(
    max_depth=7,
    min_samples_split=4,
    min_samples_leaf=1,
    random_state=0,
)

# RandomizedSearchCV defaults to n_iter=10, which is more than the 3 candidates above:
# scikit-learn then warns and falls back to trying each candidate once. Asking for
# exactly one iteration per candidate keeps the same search without the warning.
clf = RandomizedSearchCV(
    gs_clf,
    param_grid,
    n_iter=len(param_grid['n_estimators']),
    cv=5,
    n_jobs=-1,
    random_state=0,
)

In [43]:
# Run the 5-fold cross-validated search over n_estimators on the training split.
clf.fit(list(X_train),y_train)



RandomizedSearchCV(cv=5,
                   estimator=RandomForestClassifier(max_depth=7,
                                                    min_samples_split=4,
                                                    random_state=0),
                   n_jobs=-1,
                   param_distributions={'n_estimators': [100, 200, 300]},
                   random_state=0)

In [45]:
# Display the number of trees selected by the search.
clf.best_params_

{'n_estimators': 300}

In [47]:
# Rebuild the forest with the tree count reported by the search and fit it.
clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    min_samples_split=4,
    min_samples_leaf=1,
    random_state=0,
)
clf.fit(X=list(X_train), y=y_train)

RandomForestClassifier(max_depth=7, min_samples_split=4, n_estimators=300,
                       random_state=0)

In [48]:
# Score the 300-tree random forest on the held-out split.
y_pred = clf.predict(list(X_test))

# Flatten the 2x2 confusion matrix into its four counts.
clf_tn, clf_fp, clf_fn, clf_tp = confusion_matrix(y_test, y_pred).ravel()

# Derive every metric first, then report them together.
clf_accuracy = (clf_tn + clf_tp) / (clf_tn + clf_fp + clf_fn + clf_tp)
clf_precision = clf_tp / (clf_fp + clf_tp)
clf_recall = clf_tp / (clf_fn + clf_tp)
clf_f1_score = 2 * clf_precision * clf_recall / (clf_recall + clf_precision)

print(" Accuracy", round(clf_accuracy, 4))
print(
    " Precision", round(clf_precision, 4), "\n",
    "Recall", round(clf_recall, 4), "\n",
    "F1", round(clf_f1_score, 4),
)

 Accuracy 0.8769
 Precision 0.8799 
 Recall 0.8741 
 F1 0.877
