# Oversampling using smote


### Importing prerequisite libraries

In [1]:
import numpy as np
import pandas as pd
import imblearn
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from sklearn.datasets import make_gaussian_quantiles

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix



### Loading datasets and dropping nulls

In [6]:
data = pd.read_csv('data_raw.csv',sep=',',names=['Msg','Tag'])


In [3]:
# data.dropna(inplace=True)

In [7]:
data_X = data['Msg']
data_Y = data['Tag']

#Over sampling hate speech class


In [8]:
cv = CountVectorizer()
x_cv=cv.fit_transform(data_X.values.astype('U'))

print(Counter(data_Y))

oversample = SMOTE()
x, y = oversample.fit_sample(x_cv, data_Y)

print(Counter(y))



Counter({1: 19190, 2: 4163, 0: 1430})




Counter({1: 19190, 2: 19190, 0: 19190})


# SVM on oversampled data

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)


In [10]:
svm = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))

svm.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [11]:
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []
ans['Precision'] = []

prediction = svm.predict(x_test)
f1 = f1_score(y_test, prediction, average='weighted')
acc = accuracy_score(y_test, prediction)
rec = recall_score(y_test, prediction, average = 'macro')
pre = precision_score(y_test, prediction, average='macro')

ans['model'].append("Naive dataset")
ans['F1-score'].append(f1)
ans['Recall'].append(rec)
ans['Accuracy'].append(acc)
ans['Precision'].append(pre)


In [12]:
confusion_matrix(y_test, prediction)

array([[3492,  206,  135],
       [ 577, 3171,  114],
       [1055,  614, 2150]])

In [13]:
import operator
final = pd.DataFrame(ans)
final

Unnamed: 0,model,F1-score,Recall,Accuracy,Precision
0,Naive dataset,0.759822,0.765029,0.765416,0.790748


# LR on oversampled data


In [14]:
lr = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')

lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []
ans['Precision'] = []

prediction = lr.predict(x_test)
f1 = f1_score(y_test, prediction, average='weighted')
acc = accuracy_score(y_test, prediction)
rec = recall_score(y_test, prediction, average = 'macro')
pre = precision_score(y_test, prediction, average='macro')

ans['model'].append("Naive dataset")
ans['F1-score'].append(f1)
ans['Recall'].append(rec)
ans['Accuracy'].append(acc)
ans['Precision'].append(pre)


In [16]:
confusion_matrix(y_test, prediction)

array([[3372,  190,  271],
       [ 289, 3431,  142],
       [ 635,   78, 3106]])

In [17]:
import operator
final = pd.DataFrame(ans)
final

Unnamed: 0,model,F1-score,Recall,Accuracy,Precision
0,Naive dataset,0.861377,0.860477,0.860604,0.865034
