In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics
import numpy as np

In [2]:
data = pd.read_csv("master_dataset/processed_data.csv")
list(data.columns)
# drop unwanted features

# Dropped all categorical data 

data = data.drop(['title', 'text', 'text_without_stopwords', 'title_without_stopwords','syllables', 'polarity_category', 'overall_content', 'polarity_category_Neutral',
 'polarity_category_Positive'], axis=1)
#'Topic 1 Probability', 'Topic 2 Probability', 'Topic 3 Probbility' , 'Topic 4 Probability' ,'Topic 5 Probability',
#'title_word_count', 'title_sentence_count', 'title_average_word_length','title_punctuation_count', 'title_stopwords_count'  
# 'polarity_category_Neutral' , 'polarity_category_Positive'

In [3]:
#dataset is slightly imbalanced so we will perform upsampling to balance the dataset.
data['class'].value_counts()

0    21196
1    17462
Name: class, dtype: int64

In [4]:
list(data.columns)

['class',
 'text_word_count',
 'title_word_count',
 'text_sentence_count',
 'title_sentence_count',
 'text_average_word_length',
 'title_average_word_length',
 'text_punctuation_count',
 'title_punctuation_count',
 'text_stopwords_count',
 'title_stopwords_count',
 'flesch_readability',
 'subjectivity',
 'polarity',
 'Topic 1 Probability',
 'Topic 2 Probability',
 'Topic 3 Probbility',
 'Topic 4 Probability',
 'Topic 5 Probability']

In [5]:
data.describe()

Unnamed: 0,class,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,title_average_word_length,text_punctuation_count,title_punctuation_count,text_stopwords_count,title_stopwords_count,flesch_readability,subjectivity,polarity,Topic 1 Probability,Topic 2 Probability,Topic 3 Probbility,Topic 4 Probability,Topic 5 Probability
count,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0
mean,0.451705,411.374515,12.161209,15.05083,1.067877,5.138347,5.592058,53.119147,1.715635,155.440685,1.136789,0.006674,0.402753,0.056997,0.190953,0.324903,0.125233,0.18536,0.173552
std,0.497669,322.881353,3.765686,11.985707,0.270277,1.875575,1.430398,63.542364,1.452448,123.275458,1.295738,1.003826,0.124806,0.10517,0.281023,0.312565,0.210573,0.258829,0.234861
min,0.0,1.0,1.0,1.0,1.0,2.25,3.0,0.0,0.0,0.0,0.0,-83.616811,0.0,-1.0,0.000376,0.000246,0.00012,0.000303,0.000268
25%,0.0,220.0,10.0,7.0,1.0,4.866522,5.0,26.0,1.0,80.0,0.0,-0.169506,0.337127,0.0,0.002949,0.010969,0.001305,0.003113,0.003024
50%,0.0,376.0,11.0,13.0,1.0,5.075727,5.5,45.0,2.0,142.0,1.0,0.050549,0.405745,0.054757,0.030471,0.231605,0.006765,0.056382,0.056373
75%,1.0,522.0,14.0,20.0,1.0,5.272987,6.066667,67.0,3.0,201.0,2.0,0.267479,0.475,0.108144,0.282926,0.584998,0.157612,0.272065,0.268291
max,1.0,8436.0,45.0,321.0,4.0,149.0,149.0,7295.0,26.0,3017.0,15.0,2.614284,1.0,1.0,0.997819,0.998037,0.996771,0.998911,0.998767


In [6]:
#first split the dataset into training and test sets
x = data.iloc[:,1:]
y = data.iloc[:,:1]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state = 4222)


x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size = 0.25, random_state = 4222)


#balance x_train with oversampling
from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler(sampling_strategy='majority')
# oversample = RandomOverSampler(sampling_strategy = 1)
x_train,y_train = undersample.fit_resample(x_train, y_train)
data = pd.concat([x_train,y_train],axis = 1)

#check that train set is oversampled
data['class'].value_counts()

0    10477
1    10477
Name: class, dtype: int64

In [7]:
x_train

Unnamed: 0,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,title_average_word_length,text_punctuation_count,title_punctuation_count,text_stopwords_count,title_stopwords_count,flesch_readability,subjectivity,polarity,Topic 1 Probability,Topic 2 Probability,Topic 3 Probbility,Topic 4 Probability,Topic 5 Probability
0,90,10,2,1,5.181818,5.100000,14,3,33,2,-0.476010,0.120833,-0.083333,0.007685,0.501388,0.480050,0.005723,0.005154
1,81,9,2,1,5.769231,6.888889,10,3,24,1,-0.762253,0.295833,-0.037500,0.878409,0.004836,0.003309,0.108512,0.004934
2,576,9,19,1,5.054250,6.555556,59,2,221,1,0.093578,0.317672,0.040250,0.421561,0.299320,0.000958,0.001563,0.276598
3,272,10,12,1,4.970480,5.200000,25,1,106,3,0.311116,0.180072,0.029348,0.398165,0.002402,0.344981,0.002684,0.251768
4,179,8,7,1,4.994186,6.000000,21,0,60,1,0.231496,0.131944,-0.070833,0.064624,0.003595,0.002468,0.003989,0.925325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20949,565,16,10,1,4.742857,5.533333,46,1,256,0,-0.267928,0.368203,0.122754,0.002057,0.364870,0.000975,0.630648,0.001450
20950,1809,32,144,1,4.418500,4.875000,307,3,772,9,0.852104,0.467778,-0.001175,0.015731,0.304634,0.000364,0.000592,0.678679
20951,418,24,15,2,4.597087,4.500000,37,7,190,7,0.432620,0.487291,0.011708,0.002322,0.914441,0.062097,0.019515,0.001625
20952,677,14,25,1,4.559398,5.538462,86,2,281,0,0.490826,0.482386,0.237027,0.001734,0.258729,0.274193,0.419005,0.046338


In [18]:
# Since we will be performing SVC, and SVC works better on scaled data, we will be scaling all our to ensure that the model runs smoothly
# We will use the Min Max scaler

scaler = StandardScaler()

#
cols = data.iloc[:,0:-1].columns

# Normalize the data , since the data is not normally distributed we will use minmaxscaler
x_train[cols] = scaler.fit_transform(x_train[cols]) 

x_test[cols] = scaler.transform(x_test[cols])
x_validation[cols] = scaler.transform(x_validation[cols])



Index(['text_word_count', 'title_word_count', 'text_sentence_count',
       'title_sentence_count', 'text_average_word_length',
       'title_average_word_length', 'text_punctuation_count',
       'title_punctuation_count', 'text_stopwords_count',
       'title_stopwords_count', 'flesch_readability', 'subjectivity',
       'polarity'],
      dtype='object')

In [9]:
x_train.describe()

Unnamed: 0,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,title_average_word_length,text_punctuation_count,title_punctuation_count,text_stopwords_count,title_stopwords_count,flesch_readability,subjectivity,polarity,Topic 1 Probability,Topic 2 Probability,Topic 3 Probbility,Topic 4 Probability,Topic 5 Probability
count,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0
mean,-2.72358e-16,-1.469626e-14,-1.049149e-15,1.318155e-14,3.506602e-16,-5.883907e-16,5.669958e-16,-1.152284e-14,-1.282103e-16,-5.718273e-14,4.1647930000000005e-17,-8.679598e-16,2.170058e-16,-8.458761e-16,-2.004616e-16,-2.291153e-16,4.957219e-16,4.741549e-16
std,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024
min,-1.262791,-2.931693,-1.177856,-0.2603761,-1.58302,-1.631343,-0.7889646,-1.18951,-1.253342,-0.8174526,-81.44533,-3.249188,-9.880134,-0.6507792,-1.081593,-0.5953841,-0.7061261,-0.7530992
25%,-0.576049,-0.6210811,-0.6745617,-0.2603761,-0.1488547,-0.4064443,-0.4056151,-0.499258,-0.5973439,-0.8174526,-0.1875058,-0.5300807,-0.5370229,-0.6415922,-0.9941468,-0.5899856,-0.6954905,-0.7409203
50%,-0.1131114,-0.1076119,-0.1712675,-0.2603761,-0.03330501,-0.05647327,-0.125475,0.1909935,-0.1093455,-0.05603271,0.04384685,0.02645817,-0.02007071,-0.5659798,-0.252653,-0.5597992,-0.4992014,-0.4922527
75%,0.3314313,0.6625919,0.415909,-0.2603761,0.07729046,0.2934978,0.2136419,0.8812451,0.362653,0.7053872,0.2718537,0.5781048,0.480972,0.2499913,0.8508144,0.1583353,0.3209549,0.4319645
max,24.59733,8.36463,25.6645,10.38186,77.06481,97.58545,106.7701,11.92527,22.88258,10.60385,2.427969,4.7284,8.806089,2.985158,2.093628,4.195803,3.241261,3.465194


Linear SVM

In [10]:
# Linear SVM baseline model

linearSVC = SVC(kernel='linear',random_state=4222)
linearSVC.fit(x_train, np.ravel(y_train))
y_predval_linearSVC = linearSVC.predict(x_validation)

#validation metrics
print("Performance on Validation set:")
print("Accuracy:", metrics.accuracy_score(y_validation, y_predval_linearSVC))
print("Precision:", metrics.precision_score(y_validation, y_predval_linearSVC))
print("Recall:",metrics.recall_score(y_validation, y_predval_linearSVC))
print("F1_score:", metrics.f1_score(y_validation, y_predval_linearSVC))
print("-------------------------------")


#test metrics
y_pred_linearSVC = linearSVC.predict(x_test)
print("Performance on  Test set:")
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_linearSVC))
print("Precision:", metrics.precision_score(y_test, y_pred_linearSVC))
print("Recall:",metrics.recall_score(y_test, y_pred_linearSVC))
print("F1_score:", metrics.f1_score(y_test, y_pred_linearSVC))


Performance on Validation set:
Accuracy: 0.9644335230212105
Precision: 0.9687773562882988
Recall: 0.9518348623853211
F1_score: 0.9602313810556761
-------------------------------
Performance on  Test set:
Accuracy: 0.9593895499224004
Precision: 0.9663053032522707
Recall: 0.943094080640549
F1_score: 0.9545586107091172


In [11]:
from sklearn.feature_selection import RFE


# Recursive Feature Selection

svc_lin=SVC(kernel='linear',random_state=4222)
svm_rfe_model=RFE(estimator=svc_lin)
svm_rfe_model_fit=svm_rfe_model.fit(x_train,np.ravel(y_train))
feat_index = pd.Series(data = svm_rfe_model_fit.ranking_, index = x_train.columns)
signi_feat_rfe = feat_index[feat_index==1].index

print('Significant features from RFE',signi_feat_rfe)


Significant features from RFE Index(['text_word_count', 'title_word_count', 'text_sentence_count',
       'text_average_word_length', 'text_punctuation_count',
       'text_stopwords_count', 'title_stopwords_count', 'flesch_readability',
       'Topic 1 Probability'],
      dtype='object')


In [12]:
# New Data after feature selection
x_train_new = x_train[signi_feat_rfe]
x_validation_new = x_validation[signi_feat_rfe]
x_test_new = x_test[signi_feat_rfe]



In [13]:
#hyperparameter tuning with gridsearch for SVM  

grid_params = {
    'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001]
    }

scorer = metrics.make_scorer(metrics.f1_score)

gridCV = GridSearchCV(SVC(kernel='linear',random_state = 4222), param_grid = grid_params, cv = 5, scoring = scorer, n_jobs=-1)

gridCV.fit(x_train_new,np.ravel(y_train))

print("Best Hyper Parameters: ", gridCV.best_params_)

Best Hyper Parameters:  {'C': 1, 'gamma': 1}


In [19]:
finalSVC = SVC( kernel= 'linear', C= 1, gamma= 1, random_state = 4222)
finalSVC.fit(x_train_new, np.ravel(y_train))
y_predval_finalSVC = finalSVC.predict(x_validation_new)


#validation metrics
print("Performance on Validation set:")
print("Accuracy:", metrics.accuracy_score(y_validation, y_predval_finalSVC))
print("Precision:", metrics.precision_score(y_validation, y_predval_finalSVC))
print("Recall:",metrics.recall_score(y_validation, y_predval_finalSVC))
print("F1_score:", metrics.f1_score(y_validation, y_predval_finalSVC))
print("-------------------------------")


#test metrics
y_predtest_linearSVC = finalSVC.predict(x_test_new)
print("Performance on  Test set:")
print("Accuracy:", metrics.accuracy_score(y_test, y_predtest_linearSVC))
print("Precision:", metrics.precision_score(y_test, y_predtest_linearSVC))
print("Recall:",metrics.recall_score(y_test, y_predtest_linearSVC))
print("F1_score:", metrics.f1_score(y_test, y_predtest_linearSVC))


Performance on Validation set:
Accuracy: 0.9580962234868081
Precision: 0.9666666666666667
Recall: 0.939506880733945
F1_score: 0.9528932829310846
-------------------------------
Performance on  Test set:
Accuracy: 0.9502069322296948
Precision: 0.9600827912477824
Recall: 0.9285101515584787
F1_score: 0.9440325628725105
