# Baseline ML models


In [79]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Reading datasets and creating test train split

In [80]:
data = pd.read_csv('dataset.csv',sep=',',names=['Msg','Tag'])
data1 = pd.read_csv('dataset_POS.csv',sep=',',names=['Msg','Tag'])
data2 = pd.read_csv('dataset_stemmed.csv',sep=',',names=['Msg','Tag'])

data.head()

Unnamed: 0,Msg,Tag
0,The thing disgusting White woman groid White w...,1
1,Americans acting like know talking,0
2,Also intrested check webpage info european ame...,0
3,I think need take stand homes across country a...,0
4,I think connection homosexuality Christianity ...,1


In [81]:
data_x=data["Msg"]
data_y=data["Tag"]
cv = CountVectorizer()
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=4)

data1_x=data1["Msg"]
data1_y=data1["Tag"]
cv = CountVectorizer()
x1_train, x1_test, y1_train, y1_test = train_test_split(data1_x, data1_y, test_size=0.2, random_state=4)

data2_x=data2["Msg"]
data2_y=data2["Tag"]
cv = CountVectorizer()
x2_train, x2_test, y2_train, y2_test = train_test_split(data2_x, data2_y, test_size=0.2, random_state=4)

x_train.head()

8752    Also stupid white women watch Oprah realize fa...
9141                     Jews insane literal demonic DNA 
5426                           I go I post soon possible 
2197    Hey live smithfield lived newport news jeffers...
8687    Manic street preachers also red scum basically...
Name: Msg, dtype: object

In [82]:
cv = CountVectorizer()
x_traincv = cv.fit_transform(["bayhdb hab ujhanvuz sniugnv","shnf nfuje test test","test anhf janbd whbj"])
x_traincv.toarray()
cv.get_feature_names()

['anhf',
 'bayhdb',
 'hab',
 'janbd',
 'nfuje',
 'shnf',
 'sniugnv',
 'test',
 'ujhanvuz',
 'whbj']

# Defining and fitting count vectorizer

In [83]:
cv1 = CountVectorizer()
x_traincv=cv1.fit_transform(x_train.values.astype('U'))
a=x_traincv.toarray()
a[0]

cv11 = CountVectorizer()
x1_traincv=cv11.fit_transform(x1_train.values.astype('U'))
a1=x1_traincv.toarray()
a1[0]

cv12 = CountVectorizer()
x2_traincv=cv12.fit_transform(x2_train.values.astype('U'))
a2=x2_traincv.toarray()
a2[0]

array([0, 0, 0, ..., 0, 0, 0])

In [84]:
x_testcv=cv1.transform(x_test.values.astype('U'))
x_testcv.toarray()

x1_testcv=cv11.transform(x1_test.values.astype('U'))
x1_testcv.toarray()

x2_testcv=cv12.transform(x2_test.values.astype('U'))
x2_testcv.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# 1) Naive Bayes classification model

In [85]:
mnb = MultinomialNB()
mnb1 = MultinomialNB()
mnb2 = MultinomialNB()

y_train=y_train.astype('int')
y_train

y1_train=y1_train.astype('int')
y1_train

y2_train=y2_train.astype('int')
y2_train

8752     0
9141     1
5426     0
2197     0
8687     1
        ..
6017     0
709      0
10679    0
8366     0
1146     0
Name: Tag, Length: 8755, dtype: int64

In [86]:
mnb.fit(x_traincv,y_train)
mnb1.fit(x1_traincv,y1_train)
mnb2.fit(x2_traincv,y2_train)



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [87]:
predictions=mnb.predict(x_testcv)
print(predictions)
a=np.array(y_test)

predictions1=mnb1.predict(x1_testcv)
print(predictions1)
a1=np.array(y1_test)

predictions2=mnb2.predict(x2_testcv)
print(predictions2)
a2=np.array(y2_test)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


# Calculating accuracies of the Naive Bayes model


In [88]:
count = 0
for i in range (len(predictions)):
    if predictions[i]==a[i]:
        count=count+1

count1 = 0
for i in range (len(predictions1)):
    if predictions1[i]==a1[i]:
        count1=count1+1

count2 = 0
for i in range (len(predictions2)):
    if predictions2[i]==a2[i]:
        count2=count2+1

In [89]:
print("Accuracy of Naive bayes with dataset")
print(count/len(predictions))
print("Accuracy of Naive bayes with POS tagged dataset")
print(count1/len(predictions1))
print("Accuracy of Naive bayes with stemmed and POS tagged dataset")
print(count2/len(predictions2))
print(" ")
print("F1 score of Naive bayes with dataset")
print(f1_score(a, predictions, average='macro'))
print("F1 score of Naive bayes with POS tagged dataset")
print(f1_score(a1, predictions1, average='macro'))
print("F1 score of Naive bayes with stemmed and POS tagged dataset")
print(f1_score(a2, predictions2, average='macro'))
print(" ")
print("Precision score of Naive bayes with dataset")
print(precision_score(a, predictions, average='macro'))
print("Precision score of Naive bayes with POS tagged dataset")
print(precision_score(a1, predictions1, average='macro'))
print("Pricision score of Naive bayes with stemmed and POS tagged dataset")
print(precision_score(a2, predictions2, average='macro'))
print(" ")
print("Recall score of Naive bayes with dataset")
print(recall_score(a, predictions, average='macro'))
print("Recall score of Naive bayes with POS tagged dataset")
print(recall_score(a1, predictions1, average='macro'))
print("Recall score of Naive bayes with stemmed and POS tagged dataset")
print(recall_score(a2, predictions2, average='macro'))

Accuracy of Naive bayes with dataset
0.8780264961169484
Accuracy of Naive bayes with POS tagged dataset
0.8793969849246231
Accuracy of Naive bayes with stemmed and POS tagged dataset
0.8784833257195066
 
F1 score of Naive bayes with dataset
0.5777016701529257
F1 score of Naive bayes with POS tagged dataset
0.5673913043478261
F1 score of Naive bayes with stemmed and POS tagged dataset
0.5566990687447849
 
Precision score of Naive bayes with dataset
0.7493289048637335
Precision score of Naive bayes with POS tagged dataset
0.7787907686439062
Pricision score of Naive bayes with stemmed and POS tagged dataset
0.7784954160254882
 
Recall score of Naive bayes with dataset
0.5613263501868887
Recall score of Naive bayes with POS tagged dataset
0.554525989092564
Recall score of Naive bayes with stemmed and POS tagged dataset
0.5479326603848191


# 2) SVM classification model

In [90]:
svm = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
svm1 = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
svm2 = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))

svm.fit(x_traincv,y_train)
svm1.fit(x1_traincv,y1_train)
svm2.fit(x2_traincv,y2_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [91]:
predictions=svm.predict(x_testcv)
print(predictions)
a=np.array(y_test)

predictions1=svm1.predict(x1_testcv)
print(predictions1)
a1=np.array(y1_test)

predictions2=svm2.predict(x2_testcv)
print(predictions2)
a2=np.array(y2_test)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [92]:
count = 0
for i in range (len(predictions)):
    if predictions[i]==a[i]:
        count=count+1

count1 = 0
for i in range (len(predictions1)):
    if predictions1[i]==a1[i]:
        count1=count1+1

count2 = 0
for i in range (len(predictions2)):
    if predictions2[i]==a2[i]:
        count2=count2+1

In [93]:
print("Accuracy of SVM with dataset")
print(count/len(predictions))
print("Accuracy of SVM with POS tagged dataset")
print(count1/len(predictions1))
print("Accuracy of SVM with stemmed and POS tagged dataset")
print(count2/len(predictions2))
print(" ")
print("Accuracy of SVM with dataset")
print(f1_score(a, predictions, average='macro'))
print("Accuracy of SVM with POS tagged dataset")
print(f1_score(a1, predictions1, average='macro'))
print("Accuracy of SVM with stemmed and POS tagged dataset")
print(f1_score(a2, predictions2, average='macro'))
print(" ")
print("Precision score of SVM with dataset")
print(precision_score(a, predictions, average='macro'))
print("Precision score of SVM with POS tagged dataset")
print(precision_score(a1, predictions1, average='macro'))
print("Pricision score of SVM with stemmed and POS tagged dataset")
print(precision_score(a2, predictions2, average='macro'))
print(" ")
print("Recall score of SVM with dataset")
print(recall_score(a, predictions, average='macro'))
print("Recall score of SVM with POS tagged dataset")
print(recall_score(a1, predictions1, average='macro'))
print("Recall score of SVM with stemmed and POS tagged dataset")
print(recall_score(a2, predictions2, average='macro'))

Accuracy of SVM with dataset
0.8725445408862494
Accuracy of SVM with POS tagged dataset
0.8725445408862494
Accuracy of SVM with stemmed and POS tagged dataset
0.8725445408862494
 
Accuracy of SVM with dataset
0.47644373577481447
Accuracy of SVM with POS tagged dataset
0.4730012037432532
Accuracy of SVM with stemmed and POS tagged dataset
0.47644373577481447
 
Precision score of SVM with dataset
0.8113844393592677
Precision score of SVM with POS tagged dataset
0.9362139917695473
Pricision score of SVM with stemmed and POS tagged dataset
0.8113844393592677
 
Recall score of SVM with dataset
0.5050760237844775
Recall score of SVM with POS tagged dataset
0.50355871886121
Recall score of SVM with stemmed and POS tagged dataset
0.5050760237844775


# 3) Logistic regression Classification model

In [94]:
lr = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')
lr1 = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')
lr2 = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')

lr.fit(x_traincv,y_train)
lr1.fit(x1_traincv,y1_train)
lr2.fit(x2_traincv,y2_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [95]:
predictions=lr.predict(x_testcv)
print(predictions)
a=np.array(y_test)

predictions1=lr1.predict(x1_testcv)
print(predictions1)
a1=np.array(y1_test)

predictions2=lr2.predict(x2_testcv)
print(predictions2)
a2=np.array(y2_test)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [96]:
count = 0
for i in range (len(predictions)):
    if predictions[i]==a[i]:
        count=count+1

count1 = 0
for i in range (len(predictions1)):
    if predictions1[i]==a1[i]:
        count1=count1+1

count2 = 0
for i in range (len(predictions2)):
    if predictions2[i]==a2[i]:
        count2=count2+1

In [97]:
print("Accuracy of LR with dataset")
print(count/len(predictions))
print("Accuracy of LR with POS tagged dataset")
print(count1/len(predictions1))
print("Accuracy of LR with stemmed and POS tagged dataset")
print(count2/len(predictions2))
print(" ")
print("Accuracy of LR with dataset")
print(f1_score(a, predictions, average='macro'))
print("Accuracy of LR with POS tagged dataset")
print(f1_score(a1, predictions1, average='macro'))
print("Accuracy of LR with stemmed and POS tagged dataset")
print(f1_score(a2, predictions2, average='macro'))
print(" ")
print("Precision score of LR with dataset")
print(precision_score(a, predictions, average='macro'))
print("Precision score of LR with POS tagged dataset")
print(precision_score(a1, predictions1, average='macro'))
print("Pricision score of LR with stemmed and POS tagged dataset")
print(precision_score(a2, predictions2, average='macro'))
print(" ")
print("Recall score of LR with dataset")
print(recall_score(a, predictions, average='macro'))
print("Recall score of LR with POS tagged dataset")
print(recall_score(a1, predictions1, average='macro'))
print("Recall score of LR with stemmed and POS tagged dataset")
print(recall_score(a2, predictions2, average='macro'))

Accuracy of LR with dataset
0.8766560073092736
Accuracy of LR with POS tagged dataset
0.8734582000913659
Accuracy of LR with stemmed and POS tagged dataset
0.8743718592964824
 
Accuracy of LR with dataset
0.6367095298977183
Accuracy of LR with POS tagged dataset
0.5960676159547006
Accuracy of LR with stemmed and POS tagged dataset
0.6191133324095499
 
Precision score of LR with dataset
0.7201042372243547
Precision score of LR with POS tagged dataset
0.7052683694713857
Pricision score of LR with stemmed and POS tagged dataset
0.7098678410432989
 
Recall score of LR with dataset
0.6090939442094347
Recall score of LR with POS tagged dataset
0.5753961592694554
Recall score of LR with stemmed and POS tagged dataset
0.59412792736334


# 4) Decision Trees Classification model

In [98]:
dt = tree.DecisionTreeClassifier()
dt1 = tree.DecisionTreeClassifier()
dt2 = tree.DecisionTreeClassifier()

dt.fit(x_traincv,y_train)
dt1.fit(x1_traincv,y1_train)
dt2.fit(x2_traincv,y2_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [99]:
predictions=dt.predict(x_testcv)
print(predictions)
a=np.array(y_test)

predictions1=dt1.predict(x1_testcv)
print(predictions1)
a1=np.array(y1_test)

predictions2=dt2.predict(x2_testcv)
print(predictions2)
a2=np.array(y2_test)

[1 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 1]


In [100]:
count = 0
for i in range (len(predictions)):
    if predictions[i]==a[i]:
        count=count+1

count1 = 0
for i in range (len(predictions1)):
    if predictions1[i]==a1[i]:
        count1=count1+1

count2 = 0
for i in range (len(predictions2)):
    if predictions2[i]==a2[i]:
        count2=count2+1

In [101]:
print("Accuracy of decision trees with dataset")
print(count/len(predictions))
print("Accuracy of decision trees with POS tagged dataset")
print(count1/len(predictions1))
print("Accuracy of decision trees with stemmed and POS tagged dataset")
print(count2/len(predictions2))
print(" ")
print("Accuracy of decision trees with dataset")
print(f1_score(a, predictions, average='macro'))
print("Accuracy of decision trees with POS tagged dataset")
print(f1_score(a1, predictions1, average='macro'))
print("Accuracy of decision trees with stemmed and POS tagged dataset")
print(f1_score(a2, predictions2, average='macro'))
print(" ")
print("Precision score of decision trees with dataset")
print(precision_score(a, predictions, average='macro'))
print("Precision score of decision trees with POS tagged dataset")
print(precision_score(a1, predictions1, average='macro'))
print("Pricision score of decision trees with stemmed and POS tagged dataset")
print(precision_score(a2, predictions2, average='macro'))
print(" ")
print("Recall score of decision trees with dataset")
print(recall_score(a, predictions, average='macro'))
print("Recall score of decision trees with POS tagged dataset")
print(recall_score(a1, predictions1, average='macro'))
print("Recall score of decision trees with stemmed and POS tagged dataset")
print(recall_score(a2, predictions2, average='macro'))

Accuracy of decision trees with dataset
0.8579259936043856
Accuracy of decision trees with POS tagged dataset
0.8579259936043856
Accuracy of decision trees with stemmed and POS tagged dataset
0.8542713567839196
 
Accuracy of decision trees with dataset
0.6394379549866136
Accuracy of decision trees with POS tagged dataset
0.596474256954484
Accuracy of decision trees with stemmed and POS tagged dataset
0.6247314965737432
 
Precision score of decision trees with dataset
0.6643687953770929
Precision score of decision trees with POS tagged dataset
0.6440535339515485
Pricision score of decision trees with stemmed and POS tagged dataset
0.6508532414471057
 
Recall score of decision trees with dataset
0.6241438931041429
Recall score of decision trees with POS tagged dataset
0.5801420503293866
Recall score of decision trees with stemmed and POS tagged dataset
0.6099090176593029
