In [81]:
import pandas as pd
data = pd.read_csv("master_dataset/processed_data.csv")
#list(data.columns)
#drop unwanted features
data = data.drop(['title', 'text', 'subject', 'date', 'text_without_stopwords', 'title_without_stopwords','syllables', 'polarity_category', 'overall_content', 'polarity'], axis=1)
#'Topic 1 Probability', 'Topic 2 Probability', 'Topic 3 Probbility' , 'Topic 4 Probability' ,'Topic 5 Probability',
#'title_word_count', 'title_sentence_count', 'title_average_word_length','title_punctuation_count', 'title_stopwords_count'  

In [78]:
#dataset is slightly imbalanced so we will perform upsampling to balance the dataset.
data['class'].value_counts()

0    21196
1    17462
Name: class, dtype: int64

In [69]:
list(data.columns)

['class',
 'text_word_count',
 'title_word_count',
 'text_sentence_count',
 'title_sentence_count',
 'text_average_word_length',
 'title_average_word_length',
 'text_punctuation_count',
 'title_punctuation_count',
 'text_stopwords_count',
 'title_stopwords_count',
 'flesch_readability',
 'subjectivity',
 'Topic 1 Probability',
 'Topic 2 Probability',
 'Topic 3 Probbility',
 'Topic 4 Probability',
 'Topic 5 Probability',
 'polarity_category_Neutral',
 'polarity_category_Positive']

In [83]:
#first split the dataset into training and test sets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
x = data.iloc[:,1:]
y = data.iloc[:,:1]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state = 1)

#balance x_train with oversampling
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy = 1)
x_train,y_train = oversample.fit_resample(x_train, y_train)
data = pd.concat([x_train,y_train],axis = 1)

#check that train set is oversampled
data['class'].value_counts()

#Ensemble methods such as XGBoost and AdaBoost do not require feature scaling.
type(y_train)

pandas.core.frame.DataFrame

AdaBoost

In [85]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
import numpy as np

#Ada Boost baseline model
# we will use unscaled x_train and x_test sets here.

ada_boost = AdaBoostClassifier( random_state = 1)
ada_boost.fit(x_train, np.ravel(y_train))
y_pred_ada_boost = ada_boost.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_ada_boost))
print("Precision:", metrics.precision_score(y_test, y_pred_ada_boost))
print("Recall:",metrics.recall_score(y_test, y_pred_ada_boost))
print("F1_score:", metrics.f1_score(y_test, y_pred_ada_boost))


Accuracy: 0.9686152784962925
Precision: 0.9643059490084985
Recall: 0.9668623366786594
F1_score: 0.9655824508320726


In [72]:
#hyperparameter tuning with gridsearch for Ada Boost
from sklearn.model_selection import GridSearchCV

grid_params = {
    'n_estimators': [50, 100, 200,500,1000],
    'learning_rate': [0.01, 0.1, 0.5, 1, 10],
    'algorithm': ['SAMME', 'SAMME.R'],
}
scorer = metrics.make_scorer(metrics.f1_score)

gridCV = GridSearchCV(AdaBoostClassifier(random_state = 1), param_grid = grid_params, cv = 5, scoring = scorer, n_jobs=-1)
gridCV.fit(x_train,np.ravel(y_train))
print("Best Hyper Parameters: ", gridCV.best_params_)

  y = column_or_1d(y, warn=True)


Best Hyper Parameters:  {'algorithm': 'SAMME.R', 'learning_rate': 1, 'n_estimators': 1000}


In [87]:
ada_boost = AdaBoostClassifier( algorithm= 'SAMME.R', learning_rate= 1, n_estimators= 1000, random_state = 1)
ada_boost.fit(x_train, np.ravel(y_train))
y_pred_ada_boost = ada_boost.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_ada_boost))
print("Precision:", metrics.precision_score(y_test, y_pred_ada_boost))
print("Recall:",metrics.recall_score(y_test, y_pred_ada_boost))
print("F1_score:", metrics.f1_score(y_test, y_pred_ada_boost))


Accuracy: 0.9745645801000172
Precision: 0.9752192146397255
Recall: 0.9687559174398788
F1_score: 0.971976821506602
