In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics 
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error as MSE 

In [2]:
data = pd.read_csv("processed_data.csv")

# seed for random state
seed = 49

In [3]:
#drop unwanted features
data = data.drop(['title', 'text', 'text_without_stopwords', 'title_without_stopwords','syllables', 'polarity_category', 'overall_content', 'polarity'], axis=1)
x = data.iloc[:,1:]
y = data['class']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state = seed)

In [4]:
# Gradient Boosting
gb = GradientBoostingClassifier(random_state = seed)
gb.fit(x_train, y_train)
ypred_gb = gb.predict(x_test)

accuracy_gb = metrics.accuracy_score(y_test, ypred_gb) 
precision_gb = metrics.precision_score(y_test, ypred_gb)
recall_gb = metrics.recall_score(y_test, ypred_gb)
f1_score_gb = metrics.f1_score(y_test, ypred_gb) 

print("GB Accuracy:", accuracy_gb)
print("GB Precision:", precision_gb)
print("GB Recall:", recall_gb)
print("GB F1_score:", f1_score_gb)

GB Accuracy: 0.9773236764959475
GB Precision: 0.9840982286634461
GB Recall: 0.9637295485905776
GB F1_score: 0.9738073897022209


In [9]:
# GB Hyperparameter Tuning 
gb_tuning = GradientBoostingClassifier(random_state = seed)
grid_params = {
   'n_estimators': [50,100,150],
   'max_depth' : [3,4,5],
   'learning_rate' : [0.2,0.3,0.4]}
gridCV = GridSearchCV(estimator=gb_tuning, param_grid=grid_params)
gridCV.fit(x_train, y_train)
gridCV.best_params_ 

{'learning_rate': 0.3, 'max_depth': 4, 'n_estimators': 150}

In [11]:
# Gradient Boosting with tuned hyperparameters
gb = GradientBoostingClassifier(random_state = seed, n_estimators=150, max_depth=4, learning_rate=0.3)
gb.fit(x_train, y_train)
ypred_gb = gb.predict(x_test)

accuracy_gb = metrics.accuracy_score(y_test, ypred_gb) 
precision_gb = metrics.precision_score(y_test, ypred_gb)
recall_gb = metrics.recall_score(y_test, ypred_gb)
f1_score_gb = metrics.f1_score(y_test, ypred_gb) 

print("GB Accuracy:", accuracy_gb)
print("GB Precision:", precision_gb)
print("GB Recall:", recall_gb)
print("GB F1_score:", f1_score_gb)

GB Accuracy: 0.9799103293671323
GB Precision: 0.9818797291915572
GB Recall: 0.9720086733688152
GB F1_score: 0.9769192669638435


In [34]:
# Random Forest 
rf = RandomForestRegressor(random_state=seed)
rf.fit(x_train, y_train)
ypred_rf = rf.predict(x_test)

rmse_rf = MSE(y_test, ypred_rf)**0.5
print("RF rmse:", rmse_rf)

0.1423945624515253

In [16]:
# RF Hyperparameter Tuning 
rf_tuning = RandomForestRegressor(random_state = seed)
grid_params = {
   'n_estimators': [100,150,200],
   'max_features': ['auto', None],
   'max_depth' : [6,7,8]}
gridCV = GridSearchCV(estimator=rf_tuning, param_grid=grid_params, cv=5)
gridCV.fit(x_train, y_train)
gridCV.best_params_ 

{'max_depth': 8, 'max_features': 'auto', 'n_estimators': 150}

In [17]:
# Random Forest with tuned hyperparameters
rf = RandomForestRegressor(random_state=seed, n_estimators=150, max_features='auto', max_depth=8)
rf.fit(x_train, y_train)
ypred_rf = rf.predict(x_test)

rmse_rf = MSE(y_test, ypred_rf)**0.5
print("RF rmse:", rmse_rf)

RF rmse: 0.15406777979850128
