## import the libraries

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import joblib

In [3]:
df = pd.read_csv("data/data.csv")

df =df.drop(columns = 'index')

In [4]:
df.head()

Unnamed: 0,title,genre,summary
0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...
1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ..."
2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...
3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...
4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...


In [5]:
# Clean the dataset
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AkshatRaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Clean the text by removing stopwords, punctuation, lower
def preprocess_text(data: pd.DataFrame, columns: str=None):
    if columns not in data.columns:
        raise Exception('This column does not exist in the dataframe')
    corpus = []
    stop_words = stopwords.words('english')
    for i in range(0, len(data)):
        summary = re.sub("[^A-Za-z]", " ", df[columns][i])
        summary = summary.lower()
        summary = summary.split()
        summary = [word for word in summary if not word in stop_words]
        ps = PorterStemmer()
        summary = [ps.stem(word) for word in summary]
        summary = " ".join(summary)
        corpus.append(summary)
    return corpus


corpus = preprocess_text(data=df, columns='summary')

In [7]:
# Converting the text to Bag of Words using TfidfVectorizer
tf_vectorizer =  TfidfVectorizer(max_features=38000)
tf_vectorizer.fit(corpus)
X_tf = tf_vectorizer.transform(corpus).toarray()

In [8]:
joblib.dump(tf_vectorizer, "encoder_vectorizer/tfidf_vectorizer.joblib")

['encoder_vectorizer/tfidf_vectorizer.joblib']

In [26]:
df['genre'].value_counts()

genre
thriller      1023
fantasy        876
science        647
history        600
horror         600
crime          500
romance        111
psychology     100
sports         100
travel         100
Name: count, dtype: int64

In [29]:
# Target variable
genre = df['genre'].value_counts()

encoder = LabelEncoder()
y_tf = encoder.fit_transform(df['genre'])

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 0)
# Seprating the dataset into training and test set


X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(X_tf, y_tf, test_size=0.25, random_state=0)
X_train_tf_res, y_train_res = sm.fit_resample(X_train_tf, y_train_tf)

In [30]:
joblib.dump(encoder, "encoder_vectorizer/encoder.joblib")

['encoder_vectorizer/encoder.joblib']

In [11]:
# # Training the model using different models

# models = {"Decision Tree" : DecisionTreeClassifier(),
#             "Xgboost": XGBClassifier(),
#             "RandomForest": RandomForestClassifier(),
#             "Gaussian": GaussianNB(),
#             "Multinomial": MultinomialNB(),
#             "Bernoulli": BernoulliNB(),
#             'Support Vector Machine': SVC()}

# accuracy = {}
# recall = {}
# confusion = {}
# precision = {}
# for name, model in models.items():
#     classifier = model.fit(X_train_tf, y_train_tf)
#     y_pred = classifier.predict(X_test_tf)
#     ac = accuracy_score(y_test_tf, y_pred)
#     cm = confusion_matrix(y_test_tf, y_pred)
#     re = recall_score(y_test_tf, y_pred, average='macro')
#     pre = precision_score(y_test_tf, y_pred, average='macro')
#     accuracy[name] = ac
#     recall[name] = re
#     confusion[name] = cm
#     precision[name] = pre



## Use XgboostClassifier

In [12]:
from sklearn import set_config

In [13]:
set_config(display='diagram')

In [34]:
xg = XGBClassifier(n_jobs = -1)
xg.fit(X_train_tf_res, y_train_res)

In [35]:
y_pred = xg.predict(X_test_tf)

ac_xg = accuracy_score(y_test_tf, y_pred)
print(xg)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=-1,
              num_parallel_tree=None, objective='multi:softprob', ...)


In [36]:
ac_xg

0.6540772532188841

In [37]:
joblib.dump(xg, "models/book_genre_prediction.joblib")

['models/book_genre_prediction.joblib']

In [38]:
def preprocess_text(text: str) -> str:
    """
    

    Parameters
    ----------
    text : str
        DESCRIPTION.

    Returns
    -------
    str
        DESCRIPTION.

    """
    text = re.sub("[^A-Za-z]", " ", text)
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    stop_words = stopwords.words('english')
    text = [ps.stem(word) for word in text if not word in set(stop_words)]
    text = " ".join(text)
    return text

In [39]:
text = "On the morning of their fifth wedding anniversary, Nick Dunne's wife, Amy, disappears. As the police investigation unfolds, Nick becomes the prime suspect. Told from alternating perspectives, this psychological thriller explores themes of deceit, manipulation, and the dark sides of marriage."

res = xg.predict(tf_vectorizer.transform([preprocess_text(text)]))
print(res)
encoder.inverse_transform(res)


[0]


array(['crime'], dtype=object)

In [10]:
ac_xg

0.6506437768240343

In [11]:
cm_xg = confusion_matrix(y_test_tf, y_pred)
cm_xg

array([[ 84,   4,   6,   3,   2,   0,   2,   0,  31,   0],
       [  2, 143,   8,  13,   0,   5,  17,   0,  17,   0],
       [  4,  13, 109,   7,   0,   0,   6,   0,  16,   3],
       [  3,  17,   9,  82,   0,   0,   7,   0,  31,   0],
       [  0,   0,   1,   0,   8,   0,   6,   1,   4,   0],
       [  0,   5,   0,   0,   0,   7,   1,   2,   9,   0],
       [  4,  16,  12,   8,   5,   0, 118,   0,   8,   0],
       [  0,   0,   1,   0,   0,   1,   0,  18,   5,   0],
       [ 22,   8,  16,  16,   0,   3,  13,   0, 179,   0],
       [  0,   2,   6,   0,   0,   0,   3,   0,   3,  10]], dtype=int64)

## Dump the model

In [None]:
# joblib.dump(tf_vectorizer, 'tfidf_vectorizer.joblib')
# joblib.dump(encoder, 'encoder.joblib')
# joblib.dump(xg, "book_genre_prediction_xg.joblib")

['book_genre_prediction_xg.joblib']

## Use BernoulliNB

In [15]:
bn = BernoulliNB()
bn.fit(X_train_tf, y_train_tf)

y_pred = bn.predict(X_test_tf)

ac_bn = accuracy_score(y_test_tf, y_pred)
ac_bn

0.44034334763948496

In [16]:
cm_bn = confusion_matrix(y_test_tf, y_pred)
cm_bn

array([[ 17,   0,   3,   2,   0,   0,   0,   0, 110,   0],
       [  0,  97,   3,   7,   0,   0,   7,   0,  91,   0],
       [  0,  16,  54,   4,   0,   0,   0,   0,  84,   0],
       [  0,  12,   3,  49,   0,   0,   1,   0,  84,   0],
       [  0,   0,   0,   0,   0,   0,   1,   0,  19,   0],
       [  0,   1,   0,   0,   0,   0,   0,   0,  23,   0],
       [  0,   7,  10,   5,   0,   0,  60,   0,  89,   0],
       [  0,   1,   0,   0,   0,   0,   0,   0,  24,   0],
       [  1,   3,   4,  11,   0,   0,   2,   0, 236,   0],
       [  0,   2,   0,   0,   0,   0,   0,   0,  22,   0]], dtype=int64)

In [None]:
joblib.dump(bn, "book_genre_prediction_bn.joblib")

['book_genre_prediction_bn.joblib']

## Gaussian NB

In [19]:
ng = GaussianNB()
ng.fit(X_train_tf, y_train_tf)

y_pred = ng.predict(X_test_tf)

ac_nb = accuracy_score(y_test_tf, y_pred)

In [20]:
ac_nb

0.48412017167381977

In [21]:
cm_gn = confusion_matrix(y_test_tf, y_pred)
cm_gn

array([[ 50,  10,  13,  16,   0,   0,   5,   0,  38,   0],
       [  2, 131,  17,  16,   0,   5,  13,   0,  21,   0],
       [  5,  16,  86,  11,   0,   0,  13,   0,  27,   0],
       [  5,  19,  20,  63,   0,   0,   8,   0,  34,   0],
       [  1,   3,   1,   1,   2,   0,   8,   0,   4,   0],
       [  0,   4,   3,   1,   0,   0,   2,   1,  13,   0],
       [  3,  32,  20,  11,   4,   0,  77,   0,  24,   0],
       [  1,   3,   2,   1,   0,   1,   0,   6,  11,   0],
       [ 23,  17,  25,  30,   0,   2,  12,   0, 148,   0],
       [  0,   2,  11,   3,   0,   0,   4,   0,   3,   1]], dtype=int64)

In [None]:
joblib.dump(ng, "book_genre_prediction_gaussian.joblib")

['book_genre_prediction_gaussian.joblib']

In [22]:
mn = MultinomialNB()
mn.fit(X_train_tf, y_train_tf)

y_pred = mn.predict(X_test_tf)

ac_mn = accuracy_score(y_test_tf, y_pred)

In [23]:
ac_mn

0.41974248927038627

In [24]:
cm_mn = confusion_matrix(y_test_tf, y_pred)
cm_mn

array([[  0,   1,   0,   0,   0,   0,   0,   0, 131,   0],
       [  0, 152,   0,   0,   0,   0,   4,   0,  49,   0],
       [  0,  17,  22,   0,   0,   0,   1,   0, 118,   0],
       [  0,  18,   1,  17,   0,   0,   0,   0, 113,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,  20,   0],
       [  0,   3,   0,   0,   0,   0,   0,   0,  21,   0],
       [  0,  13,   0,   0,   0,   0,  45,   0, 113,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,  25,   0],
       [  0,   4,   0,   0,   0,   0,   0,   0, 253,   0],
       [  0,   3,   0,   0,   0,   0,   0,   0,  21,   0]], dtype=int64)

In [None]:
joblib.dump(mn, "book_genre_prediction_multinomial.joblib")

['book_genre_prediction_multinomial.joblib']

## Support Vector Machine

In [None]:
C=1.0, kernel='linear', degree=3, gamma='auto'

In [None]:
sv = SVC(kernel="linear", C= 1, class_weight="balanced")
sv.fit(X_train_tf, y_train_tf)

y_pred = sv.predict(X_test_tf)

ac_sv = accuracy_score(y_test_tf, y_pred)
ac_sv

In [None]:
joblib.dump(sv, "book_genre_prediction_svc.joblib")

NameError: name 'sv' is not defined

In [None]:
# sv = SVC()
# sv.fit(X_train_tf, y_train_tf)

# y_pred = sv.predict(X_test_tf)

ac_sv = accuracy_score(y_test_tf, y_pred)
ac_sv

In [None]:
cm_sv = confusion_matrix(y_test_tf, y_pred)
cm_sv

array([[ 17,   0,   3,   2,   0,   0,   0,   0, 110,   0],
       [  0,  97,   3,   7,   0,   0,   7,   0,  91,   0],
       [  0,  16,  54,   4,   0,   0,   0,   0,  84,   0],
       [  0,  12,   3,  49,   0,   0,   1,   0,  84,   0],
       [  0,   0,   0,   0,   0,   0,   1,   0,  19,   0],
       [  0,   1,   0,   0,   0,   0,   0,   0,  23,   0],
       [  0,   7,  10,   5,   0,   0,  60,   0,  89,   0],
       [  0,   1,   0,   0,   0,   0,   0,   0,  24,   0],
       [  1,   3,   4,  11,   0,   0,   2,   0, 236,   0],
       [  0,   2,   0,   0,   0,   0,   0,   0,  22,   0]])

### Testing

In [None]:
test = df['summary'][200]

test_X = tf_vectorizer.transform([test])

In [None]:
xg = joblib.load("book_genre_prediction_xg.joblib")

In [None]:
xg.predict(test_X)[0]

3

 The plot concerns a young woman living in Ottawa named Jacky Rowan who, after a late-night encounter with a motorcycle-riding version of the Wild Hunt, picks up a red cap which enables her to see into the Faerie realms. She is soon drawn into a supernatural struggle between the weakened forces of the Seelie Court and their ominous enemies, the Host or Unseelie Court. She is regaled as the Jack of Kinrowan, a trickster figure who represents the Seelie Court's hope for victory against the forces of evil. With the help of her friend Kate Hazel and an array of faerie friends and allies she makes along the way (and a considerable amount of good luck), Jacky manages to rescue the kidnapped daughter of the Laird of Kinrowan and defeat the Unseelie Court, thus bringing peace and safety to the land.

In [None]:
test

" The plot concerns a young woman living in Ottawa named Jacky Rowan who, after a late-night encounter with a motorcycle-riding version of the Wild Hunt, picks up a red cap which enables her to see into the Faerie realms. She is soon drawn into a supernatural struggle between the weakened forces of the Seelie Court and their ominous enemies, the Host or Unseelie Court. She is regaled as the Jack of Kinrowan, a trickster figure who represents the Seelie Court's hope for victory against the forces of evil. With the help of her friend Kate Hazel and an array of faerie friends and allies she makes along the way (and a considerable amount of good luck), Jacky manages to rescue the kidnapped daughter of the Laird of Kinrowan and defeat the Unseelie Court, thus bringing peace and safety to the land."

In [None]:
encoder.inverse_transform(xg.predict(test_X))

array(['horror'], dtype=object)