# `sklearn` + Word2Vec

Book Genre Prediction using PyTorch and Word2Vec. Data available at [here](https://www.kaggle.com/datasets/athu1105/book-genre-prediction).

In [5]:
from pkg import data
from pkg import model

In [1]:
from pprint import pprint  # pretty-printer
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from skmultilearn.model_selection import IterativeStratification as MultilabelStratifiedKFold
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, auc, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import plotly.graph_objects as go
import itertools

In [2]:
def frequency_distribution(genres:pd.DataFrame, title='Frequency Distribution of Genres') -> pd.Series:
    """
    This function takes in a list of genres and returns a frequency distribution of the genres
    """
    genre_freq = pd.Series(genres).value_counts()
    genre_freq = genre_freq / genre_freq.sum()

    fig = go.Figure(data=[go.Bar(x=genre_freq.index, y=genre_freq.values)], layout_title_text=title, layout_xaxis_title_text='Genres', layout_yaxis_title_text='Frequency')
    fig.show()

    return genre_freq

def plot_large_confusion_matrix(y_true, y_pred, title='Confusion Matrix'):
    """
    This function takes in the true and predicted labels and plots a confusion matrix
    """
    cm = confusion_matrix(y_true, y_pred)
    cm = cm / cm.sum(axis=1)[:, np.newaxis]

    fig = go.Figure(data=[go.Heatmap(z=cm, x=[f'Predicted {i}' for i in range(cm.shape[1])], y=[f'True {i}' for i in range(cm.shape[0])])], layout_title_text=title, layout_xaxis_title_text='Predicted', layout_yaxis_title_text='True')
    fig.show()

In [3]:
books = data.Book()
embedding = data.Embedding(data=books, train=True)

frequency_distribution(books.genres)

NameError: name 'data' is not defined

In [96]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, precision_recall_curve

def train_model(X_train, y_train, model, param_grid, cv, scoring, n_jobs, verbose):
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs, verbose=verbose, n_iter=50, random_state=42)
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='macro')
    recall = recall_score(y, y_pred, average='macro')
    f1 = f1_score(y, y_pred, average='macro')
    y_prob = model.predict_proba(X) if hasattr(model, "predict_proba") else model.decision_function(X)
    return y_pred, y_prob, accuracy, precision, recall, f1

def train_and_evaluate_model(X, y, model, param_grid, cv, scoring, n_jobs, verbose):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    best_model = train_model(X_train, y_train, model, param_grid, cv, scoring, n_jobs, verbose)
    y_pred, y_prob, accuracy, precision, recall, f1 = evaluate_model(best_model, X_test, y_test)
    # y_pred, y_prob, accuracy, precision, recall, f1, roc_auc = evaluate_model(best_model, X_test, y_test)
    # plot_roc_curve(y_test, y_prob)
    return best_model, y_pred, y_prob, accuracy, precision, recall, f1

def plot_roc_curve(y_true, y_prob, n_classes, title='ROC Curve'):
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    y_true_bin = label_binarize(y_true, classes=list(range(n_classes)))
    
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    fig = go.Figure()
    colors = itertools.cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        fig.add_trace(go.Scatter(x=fpr[i], y=tpr[i], mode='lines', name='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]), line=dict(color=color)))
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random', line=dict(dash='dash')))
    fig.update_layout(title=title, xaxis=dict(title='False Positive Rate'), yaxis=dict(title='True Positive Rate'), showlegend=True)
    fig.show()

def plot_precision_recall_curve(y_true, y_prob, n_classes, title='Precision-Recall Curve'):
    precision = dict()
    recall = dict()
    y_true_bin = label_binarize(y_true, classes=list(range(n_classes)))

    fig = go.Figure()
    colors = itertools.cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        precision[i], recall[i], _ = precision_recall_curve(y_true_bin[:, i], y_prob[:, i])
        fig.add_trace(go.Scatter(x=recall[i], y=precision[i], mode='lines', name='Precision-Recall curve of class {0}'.format(i), line=dict(color=color)))
    fig.update_layout(title=title, xaxis=dict(title='Recall'), yaxis=dict(title='Precision'), showlegend=True)
    fig.show()

def plot_confusion_matrix(y_true, y_pred, classes, title='Confusion Matrix'):
    cm = confusion_matrix(y_true, y_pred)
    fig = go.Figure(data=go.Heatmap(z=cm, x=classes, y=classes, colorscale='Blues'))
    fig.update_layout(title=title, xaxis=dict(title='Predicted'), yaxis=dict(title='True'))
    fig.show()

In [97]:
le = LabelEncoder()
X = embedding.doc_vectors
y = le.fit_transform(books.genres)

# logreg = LogisticRegression(class_weight='balanced', max_iter=1000)
# rf = RandomForestClassifier(class_weight='balanced')
# svm = SVC(class_weight='balanced', probability=True)
# mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', early_stopping=True, validation_fraction=0.1, n_iter_no_change=10, tol=0.0001, verbose=True)
# xgb = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

In [98]:
import warnings
warnings.filterwarnings('ignore')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = {
    "Logistic Regression": (LogisticRegression(max_iter=1000, class_weight='balanced'), 
                            {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
                                'logisticregression__penalty': ['l2']}),
    "Random Forest": (RandomForestClassifier(class_weight='balanced'), 
                        {'randomforestclassifier__n_estimators': [50, 100, 200], 
                        'randomforestclassifier__max_depth': [None, 10, 20], 
                        'randomforestclassifier__min_samples_split': [2, 5, 10]}),
    "SVM": (SVC(class_weight='balanced', probability=True), 
            {'svc__C': [0.1, 1, 10], 
                'svc__kernel': ['linear', 'rbf']}),
    "MLP": (MLPClassifier(max_iter=1000), 
            {'mlpclassifier__hidden_layer_sizes': [(50,), (100,), (50,50)], 
                'mlpclassifier__activation': ['tanh', 'relu'], 
                'mlpclassifier__solver': ['adam', 'sgd'], 
                'mlpclassifier__alpha': [0.0001, 0.05], 
                'mlpclassifier__learning_rate': ['constant','adaptive']})
}

results = {}
for model_name, (model, param_grid) in models.items():
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    roc_aucs = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        clf = make_pipeline(StandardScaler(), model)
        best_model, y_pred, y_prob, accuracy, precision, recall, f1 = train_and_evaluate_model(X_train, y_train, clf, param_grid, cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        # roc_aucs.append(roc_auc)
    
    avg_accuracy = np.mean(accuracies)
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    avg_f1 = np.mean(f1s)
    # avg_roc_auc = np.mean(roc_aucs)
    
    results[model_name] = {
        "Average Accuracy": avg_accuracy,
        "Average Precision": avg_precision,
        "Average Recall": avg_recall,
        "Average F1 Score": avg_f1,
        # "Average ROC AUC": avg_roc_auc,
        "Best Estimator": best_model
    }


for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Average Accuracy: {result['Average Accuracy']}")
    print(f"Average Precision: {result['Average Precision']}")
    print(f"Average Recall: {result['Average Recall']}")
    print(f"Average F1 Score: {result['Average F1 Score']}")
    print(f"Average ROC AUC: {result['Average ROC AUC']}")
    print(f"Best Estimator: {result['Best Estimator']}\n")

Fitting 5 folds for each of 7 candidates, totalling 35 fits



The total space of parameters 7 is smaller than n_iter=50. Running 7 iterations. For exhaustive searches, use GridSearchCV.




The total space of parameters 7 is smaller than n_iter=50. Running 7 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 7 candidates, totalling 35 fits



The total space of parameters 7 is smaller than n_iter=50. Running 7 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 7 candidates, totalling 35 fits



The total space of parameters 7 is smaller than n_iter=50. Running 7 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 7 candidates, totalling 35 fits



The total space of parameters 7 is smaller than n_iter=50. Running 7 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 7 candidates, totalling 35 fits



The total space of parameters 27 is smaller than n_iter=50. Running 27 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 27 candidates, totalling 135 fits



The total space of parameters 27 is smaller than n_iter=50. Running 27 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 27 candidates, totalling 135 fits



The total space of parameters 27 is smaller than n_iter=50. Running 27 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 27 candidates, totalling 135 fits



The total space of parameters 27 is smaller than n_iter=50. Running 27 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 27 candidates, totalling 135 fits



The total space of parameters 27 is smaller than n_iter=50. Running 27 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 27 candidates, totalling 135 fits



The total space of parameters 6 is smaller than n_iter=50. Running 6 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 6 candidates, totalling 30 fits



The total space of parameters 6 is smaller than n_iter=50. Running 6 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 6 candidates, totalling 30 fits



The total space of parameters 6 is smaller than n_iter=50. Running 6 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 6 candidates, totalling 30 fits



The total space of parameters 6 is smaller than n_iter=50. Running 6 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits



The total space of parameters 6 is smaller than n_iter=50. Running 6 iterations. For exhaustive searches, use GridSearchCV.


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Fitting 5 folds for each of 48 candidates, totalling 240 fits



The total space of parameters 48 is smaller than n_iter=50. Running 48 iterations. For exhaustive searches, use GridSearchCV.


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.


The total space of parameters 48 is smaller than n_iter=50. Running 48 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 48 candidates, totalling 240 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.


The total space of parameters 48 is smaller than n_iter=50. Running 48 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 48 candidates, totalling 240 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.


The total space of parameters 48 is smaller than n_iter=50. Running 48 iterations. For exhaustive searches, use GridSearchCV.



Fitting 5 folds for each of 48 candidates, totalling 240 fits




Fitting 5 folds for each of 48 candidates, totalling 240 fits



Training interrupted by user.


The total space of parameters 48 is smaller than n_iter=50. Running 48 iterations. For exhaustive searches, use GridSearchCV.



In [None]:
# model.py
import numpy as np 
import pandas as pd
from typing import Tuple
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from pkg.data import Book, Embedding
