In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics
import numpy as np

In [2]:
data = pd.read_csv("master_dataset/processed_data.csv")
list(data.columns)
# drop unwanted features


data = data.drop(['title', 'text', 'text_without_stopwords', 'title_without_stopwords','syllables', 
                  'polarity_category', 'overall_content', 'polarity_category_Neutral' , 'polarity_category_Positive',
                  'Topic 1 Probability', 'Topic 2 Probability', 'Topic 3 Probbility' , 'Topic 4 Probability' ,'Topic 5 Probability'], axis=1)

#'title_word_count', 'title_sentence_count', 'title_average_word_length','title_punctuation_count', 'title_stopwords_count'  
# 'polarity'

In [3]:
#dataset is slightly imbalanced so we will perform upsampling to balance the dataset.
data['class'].value_counts()

0    21196
1    17462
Name: class, dtype: int64

In [4]:
list(data.columns)

['class',
 'text_word_count',
 'title_word_count',
 'text_sentence_count',
 'title_sentence_count',
 'text_average_word_length',
 'title_average_word_length',
 'text_punctuation_count',
 'title_punctuation_count',
 'text_stopwords_count',
 'title_stopwords_count',
 'flesch_readability',
 'subjectivity',
 'polarity']

In [5]:
data.describe()

Unnamed: 0,class,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,title_average_word_length,text_punctuation_count,title_punctuation_count,text_stopwords_count,title_stopwords_count,flesch_readability,subjectivity,polarity
count,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0
mean,0.451705,411.374515,12.161209,15.05083,1.067877,5.138347,5.592058,53.119147,1.715635,155.440685,1.136789,0.006674,0.402753,0.056997
std,0.497669,322.881353,3.765686,11.985707,0.270277,1.875575,1.430398,63.542364,1.452448,123.275458,1.295738,1.003826,0.124806,0.10517
min,0.0,1.0,1.0,1.0,1.0,2.25,3.0,0.0,0.0,0.0,0.0,-83.616811,0.0,-1.0
25%,0.0,220.0,10.0,7.0,1.0,4.866522,5.0,26.0,1.0,80.0,0.0,-0.169506,0.337127,0.0
50%,0.0,376.0,11.0,13.0,1.0,5.075727,5.5,45.0,2.0,142.0,1.0,0.050549,0.405745,0.054757
75%,1.0,522.0,14.0,20.0,1.0,5.272987,6.066667,67.0,3.0,201.0,2.0,0.267479,0.475,0.108144
max,1.0,8436.0,45.0,321.0,4.0,149.0,149.0,7295.0,26.0,3017.0,15.0,2.614284,1.0,1.0


In [9]:
# Since we will be performing SVC, and SVC works better on scaled data, we will be scaling all our to ensure that the model runs smoothly
# We will use the Min Max scaler

min_max_scaler = MinMaxScaler()


cols = data.iloc[:,1:].columns

#cols = ['text_word_count','title_word_count','text_sentence_count','title_sentence_count','text_average_word_length',
#        'title_average_word_length','text_punctuation_count','title_punctuation_count',
#        'text_stopwords_count','title_stopwords_count','flesch_readability']

# Normalize the data , since the data is not normally distributed we will use minmaxscaler
data[cols] = min_max_scaler.fit_transform(data[cols]) 



In [10]:
data.describe()

Unnamed: 0,class,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,title_average_word_length,text_punctuation_count,title_punctuation_count,text_stopwords_count,title_stopwords_count,flesch_readability,subjectivity,polarity
count,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0,38658.0
mean,0.451705,0.048651,0.253664,0.043909,0.022626,0.019682,0.017754,0.007282,0.065986,0.051522,0.075786,0.96976,0.402753,0.528498
std,0.497669,0.038279,0.085584,0.037455,0.090092,0.012781,0.009797,0.00871,0.055863,0.04086,0.086383,0.011641,0.124806,0.052585
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.025963,0.204545,0.01875,0.0,0.01783,0.013699,0.003564,0.038462,0.026516,0.0,0.967717,0.337127,0.5
50%,0.0,0.044458,0.227273,0.0375,0.0,0.019255,0.017123,0.006169,0.076923,0.047067,0.066667,0.970269,0.405745,0.527378
75%,1.0,0.061766,0.295455,0.059375,0.0,0.0206,0.021005,0.009184,0.115385,0.066622,0.133333,0.972785,0.475,0.554072
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
#first split the dataset into training and test sets
x = data.iloc[:,1:]
y = data.iloc[:,:1]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state = 1)

#balance x_train with oversampling
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy = 1)
x_train,y_train = oversample.fit_resample(x_train, y_train)
data = pd.concat([x_train,y_train],axis = 1)

#check that train set is oversampled
data['class'].value_counts()


1    14879
0    14879
Name: class, dtype: int64

Linear SVM

In [12]:
# Linear SVM baseline model

linearSVC = SVC(kernel='linear',random_state=1)
linearSVC.fit(x_train, np.ravel(y_train))
y_pred_linearSVC = linearSVC.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_linearSVC))
print("Precision:", metrics.precision_score(y_test, y_pred_linearSVC))
print("Recall:",metrics.recall_score(y_test, y_pred_linearSVC))
print("F1_score:", metrics.f1_score(y_test, y_pred_linearSVC))


Accuracy: 0.9493016037247801
Precision: 0.9712793733681462
Recall: 0.9157356561257337
F1_score: 0.9426900584795321


In [13]:
#hyperparameter tuning with gridsearch for SVM  

grid_params = {
    'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001]
    }

scorer = metrics.make_scorer(metrics.f1_score)

gridCV = GridSearchCV(SVC(kernel='linear',random_state=1), param_grid = grid_params, cv = 5, scoring = scorer, n_jobs=-1)

gridCV.fit(x_train,np.ravel(y_train))

print("Best Hyper Parameters: ", gridCV.best_params_)

Best Hyper Parameters:  {'C': 100, 'gamma': 1}


In [14]:
finalSVC = SVC( kernel= 'linear', C= 100, gamma= 1, random_state = 1)
finalSVC.fit(x_train, np.ravel(y_train))
y_pred_finalSVC = finalSVC.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_finalSVC))
print("Precision:", metrics.precision_score(y_test, y_pred_finalSVC))
print("Recall:",metrics.recall_score(y_test, y_pred_finalSVC))
print("F1_score:", metrics.f1_score(y_test, y_pred_finalSVC))


Accuracy: 0.9552509053285049
Precision: 0.9716719492868463
Recall: 0.9288013633781481
F1_score: 0.949753122277084
