# Oversampling using smote

### Importing prerequisite libraries

In [105]:
import numpy as np
import pandas as pd
import imblearn
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from sklearn.datasets import make_gaussian_quantiles

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

### Loading datasets and dropping nulls

In [81]:
data = pd.read_csv('dataset.csv',sep=',',names=['Msg','Tag'])
data1 = pd.read_csv('dataset_POS.csv',sep=',',names=['Msg','Tag'])
data2 = pd.read_csv('dataset_stemmed.csv',sep=',',names=['Msg','Tag'])

In [82]:
data.dropna(inplace=True)

In [83]:
data1.dropna(inplace=True)

In [84]:
data2.dropna(inplace=True)

In [91]:
data_X = data['Msg']
data_Y = data['Tag']
data1_X = data1['Msg']
data1_Y = data1['Tag']
data2_X = data2['Msg']
data2_Y = data2['Tag']

#Over sampling hate speech class


In [101]:
cv = CountVectorizer()
x_cv=cv.fit_transform(data_X.values.astype('U'))

print(Counter(data_Y))

oversample = SMOTE()
x, y = oversample.fit_sample(x_cv, data_Y)

print(Counter(y))

cv1 = CountVectorizer()
x1_cv=cv1.fit_transform(data1_X.values.astype('U'))

print(Counter(data1_Y))

oversample = SMOTE()
x1, y1 = oversample.fit_sample(x1_cv, data1_Y)

print(Counter(y1))

cv2 = CountVectorizer()
x2_cv=cv2.fit_transform(data2_X.values.astype('U'))

print(Counter(data2_Y))

oversample = SMOTE()
x2, y2 = oversample.fit_sample(x2_cv, data2_Y)

print(Counter(y2))


Counter({0: 9504, 1: 1434})




Counter({1: 9504, 0: 9504})
Counter({0: 9338, 1: 1426})
Counter({1: 9338, 0: 9338})
Counter({0: 9497, 1: 1434})
Counter({1: 9497, 0: 9497})


# SVM on oversampled data

In [95]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.2, random_state=4)
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.2, random_state=4)


In [96]:
svm = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
svm1 = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
svm2 = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))

svm.fit(x_train,y_train)
svm1.fit(x1_train,y1_train)
svm2.fit(x2_train,y2_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [107]:
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []
ans['Precision'] = []

prediction = svm.predict(x_test)
f1 = f1_score(y_test, prediction, average='weighted')
acc = accuracy_score(y_test, prediction)
rec = recall_score(y_test, prediction, average = 'macro')
pre = precision_score(y_test, prediction, average='macro')

ans['model'].append("Naive dataset")
ans['F1-score'].append(f1)
ans['Recall'].append(rec)
ans['Accuracy'].append(acc)
ans['Precision'].append(pre)

prediction = svm1.predict(x1_test)
f1 = f1_score(y1_test, prediction, average='weighted')
acc = accuracy_score(y1_test, prediction)
rec = recall_score(y1_test, prediction, average = 'macro')
pre = precision_score(y1_test, prediction, average='macro')

ans['model'].append("Dataset with POS tag")
ans['F1-score'].append(f1)
ans['Recall'].append(rec)
ans['Accuracy'].append(acc)
ans['Precision'].append(pre)

prediction = svm2.predict(x2_test)
f1 = f1_score(y2_test, prediction, average='weighted')
acc = accuracy_score(y2_test, prediction)
rec = recall_score(y2_test, prediction, average = 'macro')
pre = precision_score(y2_test, prediction, average='macro')

ans['model'].append("Stemmed dataset with POS tag")
ans['F1-score'].append(f1)
ans['Recall'].append(rec)
ans['Accuracy'].append(acc)
ans['Precision'].append(pre)

In [108]:
confusion_matrix(y2_test, prediction)

array([[1345,  517],
       [ 245, 1692]])

In [99]:
import operator
final = pd.DataFrame(ans)
final

Unnamed: 0,model,F1-score,Recall,Accuracy,Precision
0,Naive dataset,0.820561,0.820633,0.821147,0.824876
1,Dataset with POS tag,0.800868,0.802533,0.801927,0.809116
2,Stemmed dataset with POS tag,0.798098,0.797929,0.799421,0.805935


# LR on oversampled data


In [102]:
lr = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')
lr1 = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')
lr2 = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')

lr.fit(x_train,y_train)
lr1.fit(x1_train,y1_train)
lr2.fit(x2_train,y2_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [103]:
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []
ans['Precision'] = []

prediction = lr.predict(x_test)
f1 = f1_score(y_test, prediction, average='weighted')
acc = accuracy_score(y_test, prediction)
rec = recall_score(y_test, prediction, average = 'macro')
pre = precision_score(y_test, prediction, average='macro')

ans['model'].append("Naive dataset")
ans['F1-score'].append(f1)
ans['Recall'].append(rec)
ans['Accuracy'].append(acc)
ans['Precision'].append(pre)

prediction = lr1.predict(x1_test)
f1 = f1_score(y1_test, prediction, average='weighted')
acc = accuracy_score(y1_test, prediction)
rec = recall_score(y1_test, prediction, average = 'macro')
pre = precision_score(y1_test, prediction, average='macro')

ans['model'].append("Dataset with POS tag")
ans['F1-score'].append(f1)
ans['Recall'].append(rec)
ans['Accuracy'].append(acc)
ans['Precision'].append(pre)

prediction = lr2.predict(x2_test)
f1 = f1_score(y2_test, prediction, average='weighted')
acc = accuracy_score(y2_test, prediction)
rec = recall_score(y2_test, prediction, average = 'macro')
pre = precision_score(y2_test, prediction, average='macro')

ans['model'].append("Stemmed dataset with POS tag")
ans['F1-score'].append(f1)
ans['Recall'].append(rec)
ans['Accuracy'].append(acc)
ans['Precision'].append(pre)

In [104]:
import operator
final = pd.DataFrame(ans)
final

Unnamed: 0,model,F1-score,Recall,Accuracy,Precision
0,Naive dataset,0.846852,0.846887,0.847449,0.852379
1,Dataset with POS tag,0.832485,0.833807,0.833244,0.839973
2,Stemmed dataset with POS tag,0.838055,0.837594,0.838905,0.844822
