In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics
import numpy as np

In [3]:
data = pd.read_csv("master_dataset/processed_data.csv")
list(data.columns)
# drop unwanted features


data = data.drop(['title', 'text', 'text_without_stopwords', 'title_without_stopwords','syllables', 
                  'polarity_category', 'overall_content', 'polarity_category_Neutral' , 'polarity_category_Positive',
                  'text_average_word_length',  'title_average_word_length',  'text_punctuation_count',
                    'title_punctuation_count',  'text_stopwords_count',  'title_stopwords_count',], axis=1)

#'title_word_count', 'title_sentence_count', 'title_average_word_length','title_punctuation_count', 'title_stopwords_count'  
# 'polarity'

In [4]:
#dataset is slightly imbalanced so we will perform upsampling to balance the dataset.
data['class'].value_counts()

0    21196
1    17462
Name: class, dtype: int64

In [5]:
list(data.columns)

['class',
 'text_word_count',
 'title_word_count',
 'text_sentence_count',
 'title_sentence_count',
 'flesch_readability',
 'subjectivity',
 'polarity',
 'Topic 1 Probability',
 'Topic 2 Probability',
 'Topic 3 Probbility',
 'Topic 4 Probability',
 'Topic 5 Probability']

In [6]:
data.describe()

Unnamed: 0,class,text_word_count,title_word_count,text_sentence_count,title_sentence_count,flesch_readability,subjectivity,polarity,Topic 1 Probability,Topic 2 Probability,Topic 3 Probbility,Topic 4 Probability,Topic 5 Probability
count,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0
mean,0.451705,411.374515,12.161209,15.05083,1.067877,0.006674,0.402753,0.056997,0.190953,0.324903,0.125233,0.18536,0.173552
std,0.497669,322.881353,3.765686,11.985707,0.270277,1.003826,0.124806,0.10517,0.281023,0.312565,0.210573,0.258829,0.234861
min,0.0,1.0,1.0,1.0,1.0,-83.616811,0.0,-1.0,0.000376,0.000246,0.00012,0.000303,0.000268
25%,0.0,220.0,10.0,7.0,1.0,-0.169506,0.337127,0.0,0.002949,0.010969,0.001305,0.003113,0.003024
50%,0.0,376.0,11.0,13.0,1.0,0.050549,0.405745,0.054757,0.030471,0.231605,0.006765,0.056382,0.056373
75%,1.0,522.0,14.0,20.0,1.0,0.267479,0.475,0.108144,0.282926,0.584998,0.157612,0.272065,0.268291
max,1.0,8436.0,45.0,321.0,4.0,2.614284,1.0,1.0,0.997819,0.998037,0.996771,0.998911,0.998767


In [7]:
#first split the dataset into training and test sets
x = data.iloc[:,1:]
y = data.iloc[:,:1]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state = 4222)


x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size = 0.25, random_state = 4222)


#balance x_train with oversampling
from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler(sampling_strategy='majority')
# oversample = RandomOverSampler(sampling_strategy = 1)
x_train,y_train = undersample.fit_resample(x_train, y_train)
data = pd.concat([x_train,y_train],axis = 1)

#check that train set is oversampled
data['class'].value_counts()

0    10477
1    10477
Name: class, dtype: int64

In [8]:
# Since we will be performing SVC, and SVC works better on scaled data, we will be scaling all our to ensure that the model runs smoothly
# We will use the Min Max scaler

scaler = StandardScaler()

# Since polarity_category_Neutral and polarity_category_Positive are already from 0 to 1 scale, we will not have to further normalize it.
cols = data.iloc[:,0:-1].columns

# Normalize the data , since the data is not normally distributed we will use minmaxscaler
x_train[cols] = scaler.fit_transform(x_train[cols]) 

x_test[cols] = scaler.transform(x_test[cols])
x_validation[cols] = scaler.transform(x_validation[cols])



In [9]:
x_train.describe()

Unnamed: 0,text_word_count,title_word_count,text_sentence_count,title_sentence_count,flesch_readability,subjectivity,polarity,Topic 1 Probability,Topic 2 Probability,Topic 3 Probbility,Topic 4 Probability,Topic 5 Probability
count,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0,20954.0
mean,-1.485984e-16,6.43725e-14,-1.452869e-16,-3.251644e-15,2.527858e-17,1.047745e-15,3.8421220000000006e-17,2.980446e-16,-7.83999e-16,-9.346081000000001e-17,-7.456413e-17,5.200415e-16
std,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024
min,-1.265605,-2.931249,-1.181811,-0.2599167,-81.44205,-3.257379,-9.876762,-0.6496807,-1.083736,-0.5955218,-0.706937,-0.7531941
25%,-0.5730816,-0.6205526,-0.6757172,-0.2599167,-0.1882355,-0.5262506,-0.5364375,-0.6405202,-0.9950502,-0.5901131,-0.6963361,-0.7411363
50%,-0.1113991,-0.1070645,-0.1696237,-0.2599167,0.04439102,0.02714473,-0.02145563,-0.5656766,-0.2508576,-0.5576516,-0.4995789,-0.4908026
75%,0.331816,0.6631676,0.4208187,-0.2599167,0.2727143,0.5783038,0.480964,0.2483683,0.8481717,0.1572635,0.3226032,0.4335337
max,24.69634,8.365489,25.80984,10.40515,2.428584,4.736168,8.803887,2.98798,2.095495,4.204596,3.225915,3.478053


Linear SVM

In [10]:
# Linear SVM baseline model

linearSVC = SVC(kernel='linear',random_state=4222)
linearSVC.fit(x_train, np.ravel(y_train))
y_predval_linearSVC = linearSVC.predict(x_validation)

#validation metrics
print("Performance on Validation set:")
print("Accuracy:", metrics.accuracy_score(y_validation, y_predval_linearSVC))
print("Precision:", metrics.precision_score(y_validation, y_predval_linearSVC))
print("Recall:",metrics.recall_score(y_validation, y_predval_linearSVC))
print("F1_score:", metrics.f1_score(y_validation, y_predval_linearSVC))
print("-------------------------------")


#test metrics
y_pred_linearSVC = linearSVC.predict(x_test)
print("Performance on  Test set:")
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_linearSVC))
print("Precision:", metrics.precision_score(y_test, y_pred_linearSVC))
print("Recall:",metrics.recall_score(y_test, y_pred_linearSVC))
print("F1_score:", metrics.f1_score(y_test, y_pred_linearSVC))

Performance on Validation set:
Accuracy: 0.911148473874806
Precision: 0.9011744485820682
Recall: 0.9019495412844036
F1_score: 0.9015618283421695
-------------------------------
Performance on  Test set:
Accuracy: 0.9084324883600621
Precision: 0.9022209402942025
Recall: 0.8944809837003146
F1_score: 0.8983342906375646


In [11]:
#hyperparameter tuning with gridsearch for SVM  

grid_params = {
    'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001]
    }

scorer = metrics.make_scorer(metrics.f1_score)

gridCV = GridSearchCV(SVC(kernel='linear',random_state=4222), param_grid = grid_params, cv = 5, scoring = scorer, n_jobs=-1)

gridCV.fit(x_train,np.ravel(y_train))

print("Best Hyper Parameters: ", gridCV.best_params_)

Best Hyper Parameters:  {'C': 0.1, 'gamma': 1}


In [13]:
finalSVC = SVC( kernel= 'linear', C= 0.1, gamma= 1, random_state = 4222)
finalSVC.fit(x_train, np.ravel(y_train))
y_predval_finalSVC = finalSVC.predict(x_validation)


#validation metrics
print("Performance on Validation set:")
print("Accuracy:", metrics.accuracy_score(y_validation, y_predval_finalSVC))
print("Precision:", metrics.precision_score(y_validation, y_predval_finalSVC))
print("Recall:",metrics.recall_score(y_validation, y_predval_finalSVC))
print("F1_score:", metrics.f1_score(y_validation, y_predval_finalSVC))
print("-------------------------------")


#test metrics
y_predtest_linearSVC = finalSVC.predict(x_test)
print("Performance on  Test set:")
print("Accuracy:", metrics.accuracy_score(y_test, y_predtest_linearSVC))
print("Precision:", metrics.precision_score(y_test, y_predtest_linearSVC))
print("Recall:",metrics.recall_score(y_test, y_predtest_linearSVC))
print("F1_score:", metrics.f1_score(y_test, y_predtest_linearSVC))

Performance on Validation set:
Accuracy: 0.9110191412312467
Precision: 0.9016064257028112
Recall: 0.9010894495412844
F1_score: 0.9013478634929739
-------------------------------
Performance on  Test set:
Accuracy: 0.9081738230729436
Precision: 0.9019325064897606
Recall: 0.8941950243065485
F1_score: 0.8980470993681793
