# Final Model

In [34]:
import os
import pandas as pd
from os import path
import numpy as np
from prettytable import PrettyTable     # https://pypi.org/project/prettytable/

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import randint

# https://www.datacamp.com/tutorial/random-forests-classifier-python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier

# https://seaborn.pydata.org/
import seaborn as sns
import matplotlib.pyplot as plt

In [35]:
path = os.getcwd()
text_features = pd.read_csv(path + "/book.csv")
text_features_header = list(text_features.columns)

text_features
text_features.shape

(54, 19)

In [36]:
def gen_table(prediction, tests):

    data = []
    i = 0
    book_titles = tests

    for test in book_titles:
        unit = []
        unit.append(text_features.loc[test]['Book Title'])
        unit.append(prediction[i][0])
        unit.append(prediction[i][1])
        data.append(unit)
        i += 1

    table = PrettyTable()
    table.title = 'Test Predictions'
    table.field_names = ['Test Title', 'Min Age', 'Max Age']
    table.align['Test Title'] = 'l'
    
    for row in data:
        table.add_row(row)

    return table

In [51]:
X = text_features.drop(columns=['Unnamed: 0', 'Book Title', 'MIN', 'MAX'])
y = text_features[['MIN', 'MAX']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# scaler here
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [80]:
rf = RandomForestClassifier(n_estimators=274, max_depth=17, random_state=42)

multi_output_rf = MultiOutputClassifier(rf)
multi_output_rf.fit(X_train_scaled, y_train)
y_train_pred = multi_output_rf.predict(X_train_scaled)
rf_pred = multi_output_rf.predict(X_test_scaled)

# tried to do kfold,,, dk if appropriate
folds = 5
kf = KFold(n_splits=folds, shuffle=True, random_state=13)
results = cross_val_score(multi_output_rf, X, y, cv=kf)

# test pred table
print(gen_table(rf_pred, X_test.index))

+-----------------------------------------------------+
|                   Test Predictions                  |
+---------------------------------+---------+---------+
| Test Title                      | Min Age | Max Age |
+---------------------------------+---------+---------+
| Ang Batang Maraming Bawal       |    6    |    12   |
| Ang Ating mga Ninuno - 1        |    3    |    10   |
| Bandilang Pilipino (6-10)       |    6    |    10   |
| Ang Pipit-Puso ni Emperador Wu  |    3    |    10   |
| Plaridel 1                      |    6    |    10   |
| Si Lupito at ang Barrio Sirkero |    8    |    16   |
| Andres Bonifacio - 1            |    6    |    10   |
| Mga Hiyas ng Kalayaan           |    6    |    10   |
| Kilusang Propaganda - 1         |    6    |    10   |
| Ang Unang Barangay (6-10)       |    6    |    10   |
| Ang Dyip ni Mang Tomas          |    8    |    16   |
+---------------------------------+---------+---------+


In [81]:
# PERFORMANCE REPORTS
# PERFORMANCE REPORTS
# PERFORMANCE REPORTS

print('-------------------------------------------------------')
print('Performance Metrics')
print('-------------------------------------------------------')
training_accuracy = accuracy_score(y_train.values.ravel(), y_train_pred.ravel()) * 100
print("Training Accuracy:", training_accuracy)

test_accuracy = accuracy_score(y_test.values.ravel(), rf_pred.ravel()) * 100
print("Test Accuracy:", test_accuracy)

print(f'Cross-Validation Results (Accuracy): ')
fold = 1
for value in results:
    print(f"Fold {fold}: {value}")
    fold += 1

# before HPT 59.272727
# after HPT  62.909090
print(f'Mean Accuracy: {(results.mean()) * 100}')

print('-------------------------------------------------------')

# classification report, MIN
print("Classification Report for MIN:")
print(classification_report(y_test['MIN'], rf_pred[:, 0]))
conf_matrix_min = confusion_matrix(y_test['MIN'], rf_pred[:, 0])
print("Confusion Matrix for MIN:")
print(conf_matrix_min)

print('-------------------------------------------------------')

# classification report, MAX
print("Classification Report for MAX:")
print(classification_report(y_test['MAX'], rf_pred[:, 1]))
conf_matrix_max = confusion_matrix(y_test['MAX'], rf_pred[:, 1])
print("Confusion Matrix for MAX:")
print(conf_matrix_max)

-------------------------------------------------------
Performance Metrics
-------------------------------------------------------
Training Accuracy: 100.0
Test Accuracy: 95.45454545454545
Cross-Validation Results (Accuracy): 
Fold 1: 0.8181818181818182
Fold 2: 0.36363636363636365
Fold 3: 0.7272727272727273
Fold 4: 0.6363636363636364
Fold 5: 0.6
Mean Accuracy: 62.909090909090914
-------------------------------------------------------
Classification Report for MIN:
              precision    recall  f1-score   support

           3       0.50      1.00      0.67         1
           6       1.00      0.88      0.93         8
           8       1.00      1.00      1.00         2

    accuracy                           0.91        11
   macro avg       0.83      0.96      0.87        11
weighted avg       0.95      0.91      0.92        11

Confusion Matrix for MIN:
[[1 0 0]
 [1 7 0]
 [0 0 2]]
-------------------------------------------------------
Classification Report for MAX:
        

In [70]:
# HYPERPARAMETER TUNING
# HYPERPARAMETER TUNING
# HYPERPARAMETER TUNING
# https://www.datacamp.com/tutorial/random-forests-classifier-python

# hyperparameter search space
param_dist = {
    'estimator__n_estimators': randint(50, 500),
    'estimator__max_depth': randint(1, 20)
}

base = RandomForestClassifier()
main = MultiOutputClassifier(base)

rand_search = RandomizedSearchCV(
    main,
    param_distributions=param_dist,
    n_iter=7,
    cv=5
)

rand_search.fit(X_train, y_train)

# best model
best_rf = rand_search.best_estimator_

# best hyperparams
print('Best hyperparameters:',  rand_search.best_params_)


Best hyperparameters: {'estimator__max_depth': 17, 'estimator__n_estimators': 274}


In [73]:
rf = RandomForestClassifier(n_estimators=274, max_depth=17, random_state=42)

multi_output_rf = MultiOutputClassifier(rf)
multi_output_rf.fit(X_train_scaled, y_train)

selector = SelectFromModel(multi_output_rf.estimators_[0], threshold='mean')
selector.fit(X_train_scaled, y_train)
best = selector.get_support()[1:]

# feature selection: RF feature importance
feature_importances = []
for var in multi_output_rf.estimators_:
    feature_importances.append(var.feature_importances_)

avg_feature_importance = np.mean(feature_importances, axis=0)

features = text_features.columns[2:-2]
combined = zip(features, avg_feature_importance[1:])
for pair in list(combined):
    print(pair)

('Word Count', 0.06057465744189547)
('Sentence Count', 0.044502962783569175)
('AVG Word Length', 0.08769177232772082)
('AVG Sentence Length', 0.09159809780530653)
('Total Syllables', 0.07532602165374805)
('MONOSYLL', 0.06753635581557664)
('POLYSYLL', 0.044765539764439366)
('NTR', 0.0652408312424354)
('VTR', 0.06897256645015494)
('TTR', 0.07586138734377075)
('Root TTR', 0.07256281736398726)
('Corrected TTR', 0.07480755090540973)
('BiLog TTR', 0.03675402965015751)
('LD', 0.05597619705626397)
