# Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV


# Import data

In [2]:
# Read the preprocessed dataset (its 'genre' column is used as the target later)
data = pd.read_csv(filepath_or_buffer="data/preprocess_data_1.csv")

# Read the BERT feature table and show it as this cell's output
bert_features = pd.read_csv(filepath_or_buffer="data/bertFeatures_1.csv")
bert_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,37.146960,-0.718577,-73.298584,-67.357710,-53.100407,21.316463,41.753353,32.956703,0.400308,-6.732258,...,-42.584038,-44.468742,-58.364820,53.291164,4.001782,54.231130,65.188090,84.791145,41.844980,53.529175
1,33.885796,13.979010,-85.719350,-30.451246,-22.090153,4.564034,31.165920,46.367527,-45.197884,-4.584962,...,-70.876810,-29.125689,-56.661724,68.347630,-14.737951,47.097120,39.724976,132.571990,53.459076,32.982380
2,16.787160,29.406092,-48.264774,-38.957745,-64.305210,38.198917,5.805443,29.524828,-17.051638,-50.592262,...,-86.319290,-12.413198,-68.498245,68.740620,15.706626,7.832871,24.119993,63.330020,-2.067029,37.960247
3,35.850006,42.962036,-75.509800,-48.785090,-41.254383,-1.976429,-6.982839,15.469737,-31.976854,-40.112244,...,-71.183754,-40.468624,-63.892910,65.396100,-26.214056,62.399140,22.835870,70.418625,57.963970,40.734090
4,29.469852,34.854473,-71.579410,-56.628280,-25.895311,10.295429,3.647528,6.661787,-56.010277,-16.745693,...,-95.782936,-47.623943,-66.134270,49.544070,-15.663713,82.616540,51.079155,34.097990,34.966217,50.609196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3097,40.814007,52.365460,-66.301216,-5.888998,-25.481980,12.881906,-20.610170,29.777680,-70.850876,-23.516727,...,-95.068090,-3.134943,-21.826483,52.187405,4.000219,-8.243583,61.727478,65.101850,13.242323,-11.598402
3098,79.484070,14.683929,-55.400757,-36.410700,-51.086018,22.655785,23.772587,18.880114,-26.694736,-28.209026,...,-95.518690,-6.570502,-43.297688,36.865738,-26.593466,-11.854276,15.762242,30.530457,-21.491936,6.241181
3099,34.106094,46.009880,-107.840490,-21.282597,-41.868920,19.889317,27.115509,53.853207,-42.030037,-11.185226,...,-100.019050,-14.799921,-43.575123,80.095276,-46.723160,51.911724,5.770129,22.068426,4.152709,21.077639
3100,31.388390,6.374999,-67.875260,-34.819275,-34.774790,49.245262,7.877729,72.808310,-33.193504,-47.335880,...,-97.020836,-23.411839,-46.369392,119.432790,-8.986543,49.455700,-7.393787,114.400400,29.348710,32.622130


In [3]:
# Define X and y.
# The feature table displayed above carries an "Unnamed: 0" column that holds
# the saved row number (0, 1, 2, ...), not an embedding dimension. Leaving it
# in X would feed the row position to every model as a feature, so drop it.
# errors="ignore" makes this a no-op when the column is not present.
X = bert_features.drop(columns=["Unnamed: 0"], errors="ignore")
y = data['genre']

# Split data into Train and test: 30% held out, fixed seed so the split is
# the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Classifiers

### Random Forest Classifier

In [4]:
# Create the parameter grid based on the results of random search
# (1 x 4 x 2 x 3 x 3 x 4 = 288 candidates)
param_grid_rfc = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Create a base model.
# The forest is seeded so that the grid-search ranking, the selected
# parameters and the reported test scores are the same on every re-run
# (same seed value as the train/test split).
rf = RandomForestClassifier(random_state=1)

# Instantiate the grid search model: 3-fold CV, all CPU cores, per-fit logging
gridSearchRF = GridSearchCV(estimator = rf,
                           param_grid = param_grid_rfc,
                           cv = 3,
                           n_jobs = -1,
                           verbose = 2)

# Fit the grid search to the training data only
gridSearchRF.fit(X_train, y_train)

# Best parameters from the grid
gridSearchRF.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


{'bootstrap': True,
 'max_depth': 90,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 12,
 'n_estimators': 1000}

In [5]:
# Winning random-forest model selected by the grid search
bestGridModelRF = gridSearchRF.best_estimator_

# Label the held-out test rows with it
y_pred_RF = bestGridModelRF.predict(X_test)

# Share of test rows predicted correctly
rf_test_accuracy = accuracy_score(y_test, y_pred_RF)
print("Accuracy from Random Forest Classifier:", rf_test_accuracy)

# Precision / recall / F1 broken down by genre
rf_test_report = classification_report(y_test, y_pred_RF)
print(rf_test_report)

Accuracy from Random Forest Classifier: 0.6100966702470462
              precision    recall  f1-score   support

       crime       0.83      0.05      0.09       106
     fantasy       0.61      0.83      0.70       193
     history       0.73      0.67      0.70       139
      horror       0.89      0.23      0.36       140
     science       0.82      0.60      0.69       142
    thriller       0.49      0.91      0.64       211

    accuracy                           0.61       931
   macro avg       0.73      0.55      0.53       931
weighted avg       0.70      0.61      0.56       931



### MLP Classifier

In [6]:
# Candidate MLP settings (3 x 2 x 2 x 2 x 2 = 48 combinations)
param_grid_mlp = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)

# Seeded base network, limited to 300 training iterations
mlpc = MLPClassifier(max_iter=300, random_state=1)

# Exhaustive search over the grid with 10-fold CV on all CPU cores
gridSearchMLP = GridSearchCV(mlpc, param_grid_mlp, cv=10, n_jobs=-1)

# Run the search on the training data
gridSearchMLP.fit(X_train, y_train)

# Show the best combination found
gridSearchMLP.best_params_

{'activation': 'tanh',
 'alpha': 0.05,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'solver': 'adam'}

In [7]:
# Winning MLP selected by the grid search
bestGridModelMLP = gridSearchMLP.best_estimator_

# Label the held-out test rows with it
y_pred_MLP = bestGridModelMLP.predict(X_test)

# Share of test rows predicted correctly
mlp_test_accuracy = accuracy_score(y_test, y_pred_MLP)
print("Accuracy from MLP Classifier:", mlp_test_accuracy)

# Precision / recall / F1 broken down by genre
mlp_test_report = classification_report(y_test, y_pred_MLP)
print(mlp_test_report)

Accuracy from MLP Classifier: 0.6659505907626209
              precision    recall  f1-score   support

       crime       0.65      0.49      0.56       106
     fantasy       0.68      0.70      0.69       193
     history       0.70      0.72      0.71       139
      horror       0.53      0.67      0.59       140
     science       0.74      0.70      0.72       142
    thriller       0.70      0.65      0.68       211

    accuracy                           0.67       931
   macro avg       0.67      0.66      0.66       931
weighted avg       0.67      0.67      0.67       931



### Support Vector Classifier

In [8]:
# Candidate SVC settings: 5 values of C x 2 kernels = 10 combinations
param_grid_svc = dict(
    C=[0.1, 1, 10, 100, 1000],
    kernel=['linear', 'rbf'],
)

# Base support vector classifier with default settings
svc = SVC()

# 3-fold grid search; the best candidate is refitted on the whole training
# set, and verbose=3 logs the score and timing of every fit
gridSearchSVC = GridSearchCV(svc, param_grid_svc, cv=3, verbose=3, refit=True)

# Run the search on the training data
gridSearchSVC.fit(X_train, y_train)

# Show the best combination found
gridSearchSVC.best_params_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END ..............C=0.1, kernel=linear;, score=0.659 total time=   0.2s
[CV 2/3] END ..............C=0.1, kernel=linear;, score=0.620 total time=   0.2s
[CV 3/3] END ..............C=0.1, kernel=linear;, score=0.607 total time=   0.2s
[CV 1/3] END .................C=0.1, kernel=rbf;, score=0.452 total time=   0.6s
[CV 2/3] END .................C=0.1, kernel=rbf;, score=0.454 total time=   0.5s
[CV 3/3] END .................C=0.1, kernel=rbf;, score=0.456 total time=   0.5s
[CV 1/3] END ................C=1, kernel=linear;, score=0.659 total time=   0.2s
[CV 2/3] END ................C=1, kernel=linear;, score=0.620 total time=   0.2s
[CV 3/3] END ................C=1, kernel=linear;, score=0.607 total time=   0.2s
[CV 1/3] END ...................C=1, kernel=rbf;, score=0.692 total time=   0.4s
[CV 2/3] END ...................C=1, kernel=rbf;, score=0.691 total time=   0.4s
[CV 3/3] END ...................C=1, kernel=rbf;

{'C': 1, 'kernel': 'rbf'}

In [9]:
# Winning SVC selected by the grid search
bestGridModelSVC = gridSearchSVC.best_estimator_

# Label the held-out test rows with it
y_pred_SVC = bestGridModelSVC.predict(X_test)

# Share of test rows predicted correctly
svc_test_accuracy = accuracy_score(y_test, y_pred_SVC)
print("Accuracy from SVC Classifier:", svc_test_accuracy)

# Precision / recall / F1 broken down by genre
svc_test_report = classification_report(y_test, y_pred_SVC)
print(svc_test_report)

Accuracy from SVC Classifier: 0.6981740064446831
              precision    recall  f1-score   support

       crime       0.68      0.38      0.48       106
     fantasy       0.72      0.77      0.74       193
     history       0.72      0.78      0.75       139
      horror       0.77      0.57      0.66       140
     science       0.79      0.74      0.76       142
    thriller       0.61      0.79      0.69       211

    accuracy                           0.70       931
   macro avg       0.71      0.67      0.68       931
weighted avg       0.71      0.70      0.69       931



### AdaBoost Classifier

In [10]:
# Create the SVC used as AdaBoost's weak learner.
# probability=True enables predict_proba; its internal calibration is
# randomised, so the SVC is seeded to keep results reproducible.
svc = SVC(probability=True, kernel='rbf', random_state=1)

# Create adaboost classifier object.
# The weak learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn releases, while the
# first positional parameter is the estimator in both, so this works on old
# and new versions alike. The booster is seeded for reproducible runs.
abc = AdaBoostClassifier(svc, n_estimators=500, learning_rate=1, random_state=1)

# Train Adaboost Classifier
modelABC = abc.fit(X_train, y_train)

# Predict the response for test dataset
y_pred_abc = modelABC.predict(X_test)

# Model Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred_abc))

# Print Classification Report
print(classification_report(y_test, y_pred_abc))

Accuracy: 0.6240601503759399
              precision    recall  f1-score   support

       crime       0.59      0.37      0.45       106
     fantasy       0.59      0.73      0.65       193
     history       0.70      0.76      0.72       139
      horror       0.74      0.35      0.48       140
     science       0.79      0.61      0.69       142
    thriller       0.53      0.76      0.63       211

    accuracy                           0.62       931
   macro avg       0.66      0.60      0.60       931
weighted avg       0.65      0.62      0.61       931



### SGD Classifier

In [11]:
# Create sgd classifier object.
# SGD shuffles the training data between epochs, so without a seed the
# fitted model and the accuracy reported below change on every run.
# Seeded with the same value as the train/test split.
sgdc = SGDClassifier(max_iter=1000, tol=0.01, random_state=1)

# Train SGD Classifier
modelSGD = sgdc.fit(X_train, y_train)

# Predict the response for test dataset
y_pred_sgd = modelSGD.predict(X_test)

# Model Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred_sgd))

# Print Classification Report
print(classification_report(y_test, y_pred_sgd))

Accuracy: 0.6369495166487648
              precision    recall  f1-score   support

       crime       0.47      0.76      0.58       106
     fantasy       0.56      0.82      0.67       193
     history       0.73      0.74      0.74       139
      horror       0.84      0.36      0.51       140
     science       0.68      0.61      0.64       142
    thriller       0.78      0.54      0.64       211

    accuracy                           0.64       931
   macro avg       0.68      0.64      0.63       931
weighted avg       0.68      0.64      0.63       931

