In [1]:
# pip install imblearn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics 
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import mean_squared_error as MSE 
import imblearn
from imblearn.under_sampling import RandomUnderSampler

In [2]:
data = pd.read_csv("processed_data.csv")

# seed for random state
seed = 49

In [3]:
#drop unwanted features
data = data.drop(['text','title','text_without_stopwords', 'title_without_stopwords','syllables', 'polarity_category', 'overall_content', 'polarity'], axis=1)
x = data.iloc[:,1:]
y = data['class']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state = seed)

# undersampling to make up for slightly imbalanced data
sample = RandomUnderSampler(sampling_strategy='majority')
x_train,y_train = sample.fit_resample(x_train, y_train)

In [4]:
# Gradient Boosting
gb = GradientBoostingClassifier(random_state = seed)
gb.fit(x_train, y_train)
ypred_gb = gb.predict(x_test)

accuracy_gb = metrics.accuracy_score(y_test, ypred_gb) 
precision_gb = metrics.precision_score(y_test, ypred_gb)
recall_gb = metrics.recall_score(y_test, ypred_gb)
f1_score_gb = metrics.f1_score(y_test, ypred_gb) 

print("Gradient Boosting Accuracy:", accuracy_gb)
print("Gradient Boosting Precision:", precision_gb)
print("Gradient Boosting Recall:", recall_gb)
print("Gradient Boosting F1_score:", f1_score_gb)

Gradient Boosting Accuracy: 0.9769787894464563
Gradient Boosting Precision: 0.9809847878302642
Gradient Boosting Recall: 0.9660950128129312
Gradient Boosting F1_score: 0.9734829675240839


In [10]:
# GB Hyperparameter Tuning 
gb_tuning = GradientBoostingClassifier(random_state = seed)
grid_params = {
   'n_estimators': [100,200,300,350],
   'max_depth' : [4,5,6,7],
   'learning_rate' : [0.2,0.3,0.4]}
gridCV = GridSearchCV(estimator=gb_tuning, param_grid=grid_params)
gridCV.fit(x_train, y_train)
gridCV.best_params_ 

{'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 350}

In [11]:
# Gradient Boosting with tuned hyperparameters
gb = GradientBoostingClassifier(random_state = seed, n_estimators=350, max_depth=4, learning_rate=0.2)
gb.fit(x_train, y_train)
ypred_gb = gb.predict(x_test)

accuracy_gb = metrics.accuracy_score(y_test, ypred_gb) 
precision_gb = metrics.precision_score(y_test, ypred_gb)
recall_gb = metrics.recall_score(y_test, ypred_gb)
f1_score_gb = metrics.f1_score(y_test, ypred_gb) 

print("Gradient Boosting Accuracy:", accuracy_gb)
print("Gradient Boosting Precision:", precision_gb)
print("Gradient Boosting Recall:", recall_gb)
print("Gradient Boosting F1_score:", f1_score_gb)

Gradient Boosting Accuracy: 0.9812898775650974
Gradient Boosting Precision: 0.98328025477707
Gradient Boosting Recall: 0.9737827715355806
Gradient Boosting F1_score: 0.9785084678617412


In [4]:
# Random Forest 
rf = RandomForestClassifier(random_state=seed)
rf.fit(x_train, y_train)
ypred_rf = rf.predict(x_test)

accuracy_rf = metrics.accuracy_score(y_test, ypred_rf) 
precision_rf = metrics.precision_score(y_test, ypred_rf)
recall_rf = metrics.recall_score(y_test, ypred_rf)
f1_score_rf = metrics.f1_score(y_test, ypred_rf) 

print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Precision:", precision_rf)
print("Random Forest Recall:", recall_rf)
print("Random Forest F1_score:", f1_score_rf)

Random Forest Accuracy: 0.9755992412484911
Random Forest Precision: 0.9776625448743518
Random Forest Recall: 0.9662921348314607
Random Forest F1_score: 0.9719440864479033


In [8]:
# RF Hyperparameter Tuning 
rf_tuning = RandomForestClassifier(random_state = seed)
grid_params = {
   'n_estimators': [100,200,300,350],
   'max_features': ['auto', None],
   'max_depth' : [5,6,7,8]}
gridCV = GridSearchCV(estimator=rf_tuning, param_grid=grid_params, cv=5)
gridCV.fit(x_train, y_train)
gridCV.best_params_ 

{'max_depth': 8, 'max_features': 'auto', 'n_estimators': 300}

In [12]:
# Random Forest with tuned hyperparameters
rf = RandomForestClassifier(random_state=seed, n_estimators=300, max_features='auto', max_depth=8)
rf.fit(x_train, y_train)
ypred_rf = rf.predict(x_test)

accuracy_rf = metrics.accuracy_score(y_test, ypred_rf) 
precision_rf = metrics.precision_score(y_test, ypred_rf)
recall_rf = metrics.recall_score(y_test, ypred_rf)
f1_score_rf = metrics.f1_score(y_test, ypred_rf) 

print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Precision:", precision_rf)
print("Random Forest Recall:", recall_rf)
print("Random Forest F1_score:", f1_score_rf)

Random Forest Accuracy: 0.9730988101396792
Random Forest Precision: 0.9771497294046904
Random Forest Recall: 0.960969840331165
Random Forest F1_score: 0.9689922480620156
