# **Data Mining Techniques (Ex 2)**
# **| Readers and Books analysis, recommendation and classification system dev**

---






In [None]:
import os
import csv
import time
import urllib.request
import pandas as pd
import ast
import numpy as np
from google.colab import drive

# Mount Google Drive at /content/gdrive/ (DIR_PATH below points into this mount)
drive.mount('/content/gdrive/', force_remount=True)

In [None]:
# Replace it with your path
DIR_PATH = "/content/gdrive/MyDrive/"

# Path of the .csv file. os.path.join adds the separator only when it is
# missing, so DIR_PATH may be given with or without a trailing slash.
file_path = os.path.join(DIR_PATH, "books_1.Best_Books_Ever.csv")

* Creating a dataframe from the .csv file.




In [None]:
# Lift pandas' display limits: show all columns, the full column width and all rows
for display_option in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the .csv file into the base DataFrame that the later cells copy from
df = pd.read_csv(filepath_or_buffer=file_path)

# Show only the first 50 rows as a sample, to avoid a RAM crash
df.head(n=50)

### Question 1: Data preprocessing

* Creating 5 additional columns in the DataFrame, containing the separate values of the "*ratingsByStars*" column for each star/category (-> "*ratingStar_1, ratingStar_2, ratingStar_3, ratingStar_4, ratingStar_5*").

In [None]:
# Names of the new columns "ratingStar_i", i=1...5
split_columns = [f'ratingStar_{i+1}' for i in range(5)]

# Split the "ratingsByStars" strings into one column per value.
# Everything except digits and commas (brackets, quotes, spaces) is removed
# first, so each piece is a bare number. The columns are renamed afterwards:
# passing columns= to pd.DataFrame() together with a DataFrame *selects*
# columns by label instead of renaming them, which yields only NaN.
# NOTE(review): values are assigned in the order they appear in the string;
# confirm whether the source lists the 1-star or the 5-star count first.
ratings_df = (
    df['ratingsByStars']
    .str.replace(r'[^0-9,]', '', regex=True)
    .str.split(',', expand=True)
    .reindex(columns=range(len(split_columns)))   # exactly 5 columns, NaN-padded if fewer
    .apply(pd.to_numeric, errors='coerce')        # empty pieces become NaN
)
ratings_df.columns = split_columns

# Append the new columns to a fresh DataFrame (the original df is left untouched)
df1 = pd.concat([df, ratings_df], axis=1)

# Print the first 50 rows of the resulting DataFrame, as an example, to avoid RAM crash
df1.head(50)

* Creating an additional column "*genreSingle*" where we store the first genre value from the original "*genres*" column.

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

df2 = df.copy()

# Remove rows with NaN values in the columns that we are going to process in the sub-question
df2.dropna(subset=['genres'], inplace=True)

# Extract the first element from the "genres" column and save it into a new column "genreSingle".
# Brackets, quotes and surrounding blanks are stripped from both ends, so a
# single-genre value such as "[Fiction]" gives "Fiction" instead of keeping
# its closing bracket.
first_genre = df2['genres'].str.split(',').str[0]
df2['genreSingle'] = first_genre.str.strip(" []'\"")

# An empty genre list leaves nothing after stripping: such rows have no first genre
df2 = df2[df2['genreSingle'].str.len() > 0].copy()

# Print the first 50 rows of the resulting DataFrame, as an example, to avoid RAM crash
df2.head(50)

* Creating an additional column "*PublishYear*" where we store the publication year of the book, extracted from the original "*publishDate*" column.

In [None]:
import warnings
import pandas as pd

# Keep only the rows that have a value in "publishDate"; dropna returns a new
# DataFrame, so the original df is not modified
df3 = df.dropna(subset=['publishDate'])

# Parse "publishDate" as datetimes; values that cannot be parsed become NaT
parsed_publish_dates = pd.to_datetime(df3['publishDate'], errors='coerce')
df3['publishDate'] = parsed_publish_dates

# Store the year part of the parsed date in the new "PublishYear" column
df3['PublishYear'] = parsed_publish_dates.dt.year

# Show the first 50 rows only, as an example, to avoid RAM crash
df3.head(50)

### Question 2: Extracting data by querying the DataFrame

Before processing the data in each sub-question, we create a fresh copy of the original DataFrame and drop only the rows that have NaN values in the columns used by that sub-question. If we instead dropped rows from a single shared DataFrame (for NaN values in any column, or in the columns of an earlier sub-question), we would run into the following problem:
when the same modified DataFrame is reused in a later sub-question, rows removed earlier are lost, even though the columns processed in the new sub-question contain no NaN values for them!

2.   Find the 10 books with the most pages
     (Ποια είναι τα 10 βιβλία με τις περισσότερες σελίδες).

In [None]:
import numpy as np
import matplotlib.pyplot as plt

df_2_2 = df.copy()

# Remove rows with NaN values in the columns that we are going to process in the sub-question
df_2_2.dropna(subset=['title', 'pages'], inplace=True)

# Convert 'pages' column to numeric dtype (non-numeric values become NaN)
df_2_2['pages'] = pd.to_numeric(df_2_2['pages'], errors='coerce')

# Sort by 'pages' in descending order and select the top 10.
# Values that could not be converted are dropped first so that a NaN can
# never reach the int() conversion used for the bar labels.
top_10_books = (
    df_2_2.dropna(subset=['pages'])
    .sort_values('pages', ascending=False)
    .head(10)
)

# One numeric x position per book. Using the titles themselves as x values
# would merge books that share a title into a single bar, because a
# categorical axis gives equal strings the same position.
bar_positions = np.arange(len(top_10_books))

# Generate a color palette with a unique color for each book
colors = plt.cm.tab10(bar_positions)

# Plot the bar chart with colored bars
plt.figure(figsize=(10, 10))
plt.bar(bar_positions, top_10_books['pages'], color=colors)

# Customize the plot
plt.title('Top 10 Books with Most Pages')
plt.xlabel('Book Title')
plt.ylabel('Number of Pages')
# Label each position with its title; rotate and align x-axis labels
plt.xticks(bar_positions, top_10_books['title'], rotation=45, ha='right', fontsize=8)
plt.ticklabel_format(style='plain', axis='y')       # Display y-axis values in plain format

# Add total number of pages above each bar
for position, pages in zip(bar_positions, top_10_books['pages']):
    plt.text(position, pages, int(pages), ha='center', va='bottom', fontsize=8)

# Plotting
plt.tight_layout()
plt.show()



5.   Find the 10 writers with the most books in their writing history
     (Ποιοι είναι οι 10 συγγραφεις με τα περισσότερα βιβλία).



In [None]:
# Rows that have both a title and an author (dropna returns a new DataFrame)
df_2_5 = df.dropna(subset=['title', 'author'])

# Number of books per author string, keeping the 10 largest counts
books_per_author = df_2_5['author'].value_counts()
writer_counts = books_per_author.nlargest(10)

# One tab10 color per author
colors = plt.cm.tab10(np.arange(len(writer_counts)))

# Draw the bar chart
plt.figure(figsize=(10, 6))
writer_counts.plot(kind='bar', color=colors)

# Titles and axis labels
plt.title('Top 10 Authors with Most Books')
plt.xlabel('Author')
plt.ylabel('Number of Books')
plt.xticks(rotation=45, ha='right', fontsize=8)  # rotated, right-aligned x labels
plt.yticks(fontsize=8)                           # smaller y tick labels

# Write the book count on top of each bar
for bar_position, (_, book_count) in enumerate(writer_counts.items()):
    plt.text(bar_position, book_count, book_count, ha='center', va='bottom', fontsize=8)

# Render
plt.tight_layout()
plt.show()

6.   Find the 10 writers with the most critics in their writing history
     (Ποιοι είναι οι 10 συγγραφεις με τις περισσότερες κριτικές για τα βιβλία τους).

In [None]:
# Rows that have both an author and a rating count (dropna returns a new DataFrame)
df_2_6 = df.dropna(subset=['author', 'numRatings'])

# Working copy with an extra 'author_name' column: the part of 'author' before
# the first comma, i.e. the main author without editors, illustrators, etc.
main_author = df_2_6['author'].str.split(',').str[0]
df_copy = df_2_6.assign(author_name=main_author)

# Sum 'numRatings' per main author and keep the 10 largest totals
ratings_per_author = df_copy.groupby('author_name')['numRatings'].sum()
writer_total_critics = ratings_per_author.nlargest(10)

# One tab10 color per author
colors = plt.cm.tab10(np.arange(len(writer_total_critics)))

# Draw the bar chart
plt.figure(figsize=(10, 10))
writer_total_critics.plot(kind='bar', color=colors)

# Titles and axis labels
plt.title('Top 10 Authors with Most Critics')
plt.xlabel('Author')
plt.ylabel('Number of Critics')
plt.xticks(rotation=45, ha='right', fontsize=8)     # rotated, right-aligned x labels
plt.ticklabel_format(style='plain', axis='y')       # plain (non-scientific) y values

# Write the total on top of each bar
for bar_position, (_, total) in enumerate(writer_total_critics.items()):
    plt.text(bar_position, total, total, ha='center', va='bottom', fontsize=8)

# Render
plt.tight_layout()
plt.show()

# The working copy was only needed for the aggregation and the plot
del df_copy

8.   Find the languages mostly used in the books
     (Ποιες είναι οι πιο συχνές γλώσσες συγγραφής των βιβλίων + φθίνουσα κατάταξη αυτών).

In [None]:
import random

df_2_8 = df.copy()

# Remove rows with NaN values in the columns that we are going to process in the sub-question
df_2_8.dropna(subset=['title', 'language'], inplace=True)

# Count the number of books for each language (value_counts sorts in descending order)
language_counts = df_2_8['language'].value_counts()

# Get the unique languages and their corresponding counts
languages = language_counts.index
counts = language_counts.values

# Generate one random color per language existing in the dataframe
random_colors = ['#%06x' % random.randint(0, 0xFFFFFF) for _ in range(len(languages))]

# Plot one bar per language; bars sit at integer positions with a fixed width of 0.6
plt.figure(figsize=(10, 10))
bars = plt.bar(range(len(languages)), counts, width=0.6)

# Assign a different color to each bar, to better visualise data
for bar, bar_color in zip(bars, random_colors):
    bar.set_color(bar_color)

# Set y-axis scale to logarithmic for better visualization of small and very large numbers
plt.yscale('log')

# Customize the plot
plt.title('Number of Books per Language')
plt.xlabel('Language')
plt.ylabel('Number of Books')

# Rotate and align x-axis labels
plt.xticks(range(len(languages)), languages, rotation=90, ha='center', fontsize=8)

# Add the count above each bar
for i, count in enumerate(counts):
    plt.text(i, count, count, ha='center', va='bottom', fontsize=8, rotation=90)

# Plotting
plt.tight_layout()
plt.show()

9.   Find the 10 publishers with the most publications
     (Ποιοι είναι οι 10 εκδότες με τις περισσότερες εκδόσεις βιβλίων).

In [None]:
df_2_9 = df.copy()

# Remove rows with NaN values in the columns that we are going to process in the sub-question
df_2_9.dropna(subset=['publisher'], inplace=True)

# Count the publications of each publisher and keep the 10 largest counts
publisher_publications = df_2_9['publisher'].value_counts().nlargest(10)

# Generate a color palette with a unique color for each publisher
colors = plt.cm.tab10(np.arange(len(publisher_publications)))

# Plot the bar chart with colored bars
plt.figure(figsize=(10, 10))
publisher_publications.plot(kind='bar', color=colors)

# Customize the plot
plt.title('Top 10 Publishers with Most Publications')
plt.xlabel('Publisher')
plt.ylabel('Number of Publications')
plt.xticks(rotation=45, ha='right', fontsize=8)     # Rotate and align x-axis labels
plt.ticklabel_format(style='plain', axis='y')       # Display y-axis values in plain format

# Add the number of publications on top of each bar
for i, count in enumerate(publisher_publications):
    plt.text(i, count, count, ha='center', va='bottom', fontsize=8)

# Plotting
plt.tight_layout()
plt.show()

# Free memory: release the per-question copies. Names are removed only if they
# exist, so this cell can be re-run (or run without the earlier Question 2
# cells) without raising NameError.
for _frame_name in ('df_2_2', 'df_2_5', 'df_2_6', 'df_2_8', 'df_2_9'):
    globals().pop(_frame_name, None)

### Question 3: Recommendation System (RS)

In [None]:
import random # import again for code execution independence

# Build df_RS_all for the recommendation system: only the books whose
# 'language' is "English", and only their 'bookId' and 'description' columns
is_english = df['language'] == 'English'
rs_columns = ['bookId', 'description']
df_RS_all = df.loc[is_english, rs_columns]

# Show the first 50 rows only, as an example, to avoid RAM crash
df_RS_all.head(50)

The TF-IDF matrix (Term Frequency-Inverse Document Frequency) is a mathematical/numeric representation of text that provides information about the rarity of words and structures within the broader context of the text.

* Term Frequency (TF): It measures the frequency of occurrence of a term (e.g., words) within the text. When the TF of a term is high, it means that the term appears more frequently in the text.

* Inverse Document Frequency (IDF): It is a measure of the rarity of a term, not in a single document, but in a collection/group of texts (e.g., 100 books in a library). The higher the IDF value of a term, the rarer it is within the collection of texts, and therefore, it gains more importance in various data mining processes since its occurrence is considered "rare" and each finding carries greater "significance."

The TF-IDF score of each term is equal to the product of these two values (TF * IDF). The TF-IDF matrix consists of rows corresponding to different texts/documents and columns corresponding to different terms that appear in the collection (in some or all of the documents).

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Books without a description cannot be vectorized: drop them in place
df_RS_all.dropna(subset=['description'], inplace=True)

# Work on the first 15000 rows only, to avoid a RAM crash
df_RS = df_RS_all.iloc[:15000]

# Plain Python list of the description texts, in df_RS row order
descriptions = df_RS['description'].tolist()

# UNIGRAMS -> TF-IDF over single words, ignoring the default English stop words
tfidf_matrix_unigrams = TfidfVectorizer(
    ngram_range=(1, 1), stop_words='english'
).fit_transform(descriptions)

# BIGRAMS -> TF-IDF over pairs of consecutive words, ignoring the default English stop words
# (`vectorizer` is left bound to this bigram vectorizer)
vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')
tfidf_matrix_bigrams = vectorizer.fit_transform(descriptions)

* UNIGRAMS 100 most similar books

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Titles aligned row-for-row with df_RS, and therefore with the rows of the
# TF-IDF / similarity matrices. df_RS keeps the index labels of df, so the
# titles must be looked up by label; indexing the titles of the full df by
# matrix position would return unrelated books.
titles = df.loc[df_RS.index, 'title'].tolist()

# Calculate cosine similarity on the UNIGRAMS TF-IDF matrix
cosine_sim_unigrams = cosine_similarity(tfidf_matrix_unigrams, tfidf_matrix_unigrams)

print("> UNIGRAMS: 100 most similar books")
print("------------------------------------\n")

# Score every book by its highest similarity to any *other* book.
# The diagonal (similarity of a book with itself) is masked while the row
# maxima are taken and restored afterwards, so the matrix is left unchanged.
self_similarity = cosine_sim_unigrams.diagonal().copy()
np.fill_diagonal(cosine_sim_unigrams, -np.inf)
best_match_scores = cosine_sim_unigrams.max(axis=1)
np.fill_diagonal(cosine_sim_unigrams, self_similarity)

# Row positions of the 100 books with the highest best-match score, most similar first
similar_books_indices_unigrams = np.argsort(-best_match_scores, kind='stable')[:100]

# Create a dictionary to store the most similar books (row position in df_RS -> title)
similar_books_uni_dict = {}

# Print the most similar books using unigrams
for book_index in similar_books_indices_unigrams:
    similar_books_uni_dict[book_index] = titles[book_index]
    print(titles[book_index])

* BIGRAMS 100 most similar books

In [None]:
# Titles aligned row-for-row with df_RS, and therefore with the rows of the
# TF-IDF / similarity matrices. df_RS keeps the index labels of df, so the
# titles must be looked up by label, not by matrix position.
titles = df.loc[df_RS.index, 'title'].tolist()

# Calculate cosine similarity on the BIGRAMS TF-IDF matrix
cosine_sim_bigrams = cosine_similarity(tfidf_matrix_bigrams, tfidf_matrix_bigrams)

print("> BIGRAMS: 100 most similar books")
print("------------------------------------\n")

# Score every book by its highest similarity to any *other* book.
# The diagonal (similarity of a book with itself) is masked while the row
# maxima are taken and restored afterwards, so the matrix is left unchanged.
self_similarity = cosine_sim_bigrams.diagonal().copy()
np.fill_diagonal(cosine_sim_bigrams, -np.inf)
best_match_scores = cosine_sim_bigrams.max(axis=1)
np.fill_diagonal(cosine_sim_bigrams, self_similarity)

# Row positions of the 100 books with the highest best-match score, most similar first
similar_books_indices_bigrams = np.argsort(-best_match_scores, kind='stable')[:100]

# Create a dictionary to store the most similar books (row position in df_RS -> title)
similar_books_bi_dict = {}

# Print the most similar books using bigrams
for book_index in similar_books_indices_bigrams:
    similar_books_bi_dict[book_index] = titles[book_index]
    print(titles[book_index])

Unigrams and bigrams are levels of tokenization in language text processing that can be used to draw conclusions, such as linguistic associations or the recognition of lexical patterns.

Unigrams refer to the subdivision of the text into individual tokens (words). For example, the sentence "A comet fell to Earth" consists of 5 unigrams: "A", "comet", "fell", "to", "Earth".

Bigrams, on the other hand, involve dividing a text into tokens that consist of pairs of consecutive words. In the previous example, the bigrams would be: "A comet", "comet fell", "fell to", "to Earth".

* Functions for computing the N most similar books to a specific book using UNIGRAMS and BIGRAMS.

In [None]:
# function to find the N-most similar books, when a specific book is given as input, using UNIGRAMS
def unigrams_recommend(bookId, N):
    """Print the N books whose descriptions are most similar to the given book.

    Similarity is the cosine similarity between unigram TF-IDF vectors of the
    descriptions in the module-level ``df_RS``; titles and descriptions are
    read from the module-level ``df``.

    Args:
        bookId: value of the 'bookId' column identifying the query book;
            it must occur in ``df_RS``.
        N: number of recommendations to print.

    Raises:
        IndexError: if ``bookId`` does not occur in ``df_RS``.
    """
    # Row *position* of the book inside df_RS. The TF-IDF rows are positional,
    # whereas df_RS.index still carries the labels of the original df, so a
    # label must never be used to index the similarity scores.
    matching_positions = np.flatnonzero((df_RS['bookId'] == bookId).to_numpy())
    if matching_positions.size == 0:
        raise IndexError(f"bookId {bookId!r} not found in df_RS")
    book_position = matching_positions[0]

    # Looked up here so the function does not depend on a global set by the caller
    book_title = df.loc[df_RS.index[book_position], 'title']

    # Create a TF-IDF matrix for unigrams (same settings as the notebook's unigram matrix)
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
    tfidf_matrix_unigrams = vectorizer.fit_transform(df_RS['description'])

    # Only the similarities of the query book are needed: compare its row with
    # every row instead of building the full n x n matrix
    similarity_scores = cosine_similarity(
        tfidf_matrix_unigrams[book_position], tfidf_matrix_unigrams
    ).ravel()

    # Positions sorted by decreasing similarity, without the query book itself
    ranked_positions = np.argsort(-similarity_scores, kind='stable')
    ranked_positions = ranked_positions[ranked_positions != book_position][:N]

    # Print the recommendation results
    print(f"Recommending {N} books similar to: ' {book_title} '\n")

    for rank, position in enumerate(ranked_positions, start=1):
        # Translate the matrix position back to the df label before reading df
        book_label = df_RS.index[position]
        recommended_book_title = df.loc[book_label, 'title']
        recommended_book_description = df.loc[book_label, 'description']
        similarity_score = similarity_scores[position]
        print(f"{rank}. Recommended: {recommended_book_title}")
        print(f"   Description: {recommended_book_description}")
        print(f"   (score: {similarity_score})\n")

    print("---------------------------------------------------------\n")


# Try out the UNIGRAM recommendation function
print("----------------------- UNIGRAMS recommendation function -----------------------\n")

# Five book ids drawn from df_RS (fixed seed, so the draw is reproducible)
random_bookIds = df_RS['bookId'].sample(n=5, random_state=42).tolist()

# Run unigrams_recommend once for each sampled bookId
for count, bookId in enumerate(random_bookIds, start=1):
    # Title of the sampled book, looked up in the full df DataFrame
    title_matches = df.loc[df['bookId'] == bookId, 'title']
    book_title = title_matches.values[0]
    print(f"> ({count})")
    print(f"Random book selection: ' {book_title} '")

    # Ask for a random number (3 to 10) of similar books
    number_of_recommendations = random.randint(3, 10)
    unigrams_recommend(bookId, number_of_recommendations)

In [None]:
# function to find the N-most similar books, when a specific book is given as input, using BIGRAMS
def bigrams_recommend(bookId, N):
    """Print the N books whose descriptions are most similar to the given book.

    Similarity is the cosine similarity between bigram TF-IDF vectors of the
    descriptions in the module-level ``df_RS``; titles and descriptions are
    read from the module-level ``df``.

    Args:
        bookId: value of the 'bookId' column identifying the query book;
            it must occur in ``df_RS``.
        N: number of recommendations to print.

    Raises:
        IndexError: if ``bookId`` does not occur in ``df_RS``.
    """
    # Row *position* of the book inside df_RS. The TF-IDF rows are positional,
    # whereas df_RS.index still carries the labels of the original df, so a
    # label must never be used to index the similarity scores.
    matching_positions = np.flatnonzero((df_RS['bookId'] == bookId).to_numpy())
    if matching_positions.size == 0:
        raise IndexError(f"bookId {bookId!r} not found in df_RS")
    book_position = matching_positions[0]

    # Looked up here so the function does not depend on a global set by the caller
    book_title = df.loc[df_RS.index[book_position], 'title']

    # Create a TF-IDF matrix for bigrams; ngram_range=(2, 2) is what makes this
    # the bigram variant (the vectorizer default is unigrams)
    vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')
    tfidf_matrix_bigrams = vectorizer.fit_transform(df_RS['description'])

    # Only the similarities of the query book are needed: compare its row with
    # every row instead of building the full n x n matrix
    similarity_scores = cosine_similarity(
        tfidf_matrix_bigrams[book_position], tfidf_matrix_bigrams
    ).ravel()

    # Positions sorted by decreasing similarity, without the query book itself
    ranked_positions = np.argsort(-similarity_scores, kind='stable')
    ranked_positions = ranked_positions[ranked_positions != book_position][:N]

    # Print the recommendation results
    print(f"   Recommending {N} books similar to: ' {book_title} '\n")

    for rank, position in enumerate(ranked_positions, start=1):
        # Translate the matrix position back to the df label before reading df
        book_label = df_RS.index[position]
        recommended_book_title = df.loc[book_label, 'title']
        recommended_book_description = df.loc[book_label, 'description']
        similarity_score = similarity_scores[position]
        print(f"{rank}. Recommended: {recommended_book_title}")
        print(f"   Description: {recommended_book_description}")
        print(f"   (score: {similarity_score})\n")

    print("---------------------------------------------------------\n")


# test BIGRAM recommending function
print("----------------------- BIGRAMS recommendation function -----------------------\n")
random_bookIds = df_RS['bookId'].sample(n=5, random_state=30).tolist()

# Testing the bigrams_recommend function for each random bookId
for count, bookId in enumerate(random_bookIds, start=1):
    # Find the title of the book with the given bookId in the df DataFrame
    book_title = df.loc[df['bookId'] == bookId, 'title'].values[0]
    print(f"> ({count})")
    print(f"Random book selection: ' {book_title} '")

    # Recommend a random number (3 to 10) of similar books for the random book,
    # using the BIGRAM function that this cell is meant to exercise
    bigrams_recommend(bookId, random.randint(3, 10))

### Question 4: Classification
First, data preprocessing is performed for classification. Specifically, a new dataframe is created with the columns "*bookId*", "*description*", and "*genreSingle*", containing the rows of the original dataframe whose genre is one of the 10 most frequent genres. Then, the characters in the "*description*" are converted to lowercase, and punctuation marks and leading/trailing whitespace are removed from the "*description*".

In [None]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import pickle

# Fetch the "popular" nltk data bundle without progress output
nltk.download('popular', quiet=True)

# df2 is used because it already holds the genreSingle column.
# Discard the rows whose genreSingle contains the `]` character
has_closing_bracket = df2['genreSingle'].str.contains(']')
df2 = df2[~has_closing_bracket]

# The 10 most frequent genres, most frequent first
genre_frequencies = df2['genreSingle'].value_counts()
top_10_genres = genre_frequencies.nlargest(10).index.tolist()

# Meaning of each category: every genre simply maps to its own name
genre_meanings = dict(zip(top_10_genres, top_10_genres))

# Books of the top 10 genres, limited to the first 15000 rows to reduce the
# amount of data, keeping only bookId, description and genreSingle
in_top_genres = df2['genreSingle'].isin(top_10_genres)
clf_df = (
    df2[in_top_genres]
    .copy()
    .head(15000)
    .loc[:, ['bookId', 'description', 'genreSingle']]
)

# Clean data of description column
def clean_description(description):
    """Return a normalised, lowercase version of a book description.

    Every non-word character (anything other than letters, digits and the
    underscore) is replaced by a space, the text is lowercased, runs of
    whitespace are collapsed to a single space and leading/trailing
    whitespace is removed.

    Args:
        description: the raw description; non-string values are converted
            with str(). A missing value (None or NaN) is treated as an empty
            description instead of becoming the word "none"/"nan".

    Returns:
        The cleaned description as a string (possibly empty).
    """
    # NaN is the only float that is not equal to itself
    if description is None or (isinstance(description, float) and description != description):
        return ''

    description = re.sub(r'\W', ' ', str(description))    # replace all special characters by spaces
    description = description.lower()                     # convert description to lowercase
    description = re.sub(r'\s+', ' ', description)        # collapse extra spaces
    return description.strip()                            # drop leading/trailing space

# Replace every raw description with its cleaned form (see clean_description)
clf_df['description'] = clf_df['description'].apply(clean_description)

Then, the descriptions are converted into lists of words, on which we train a word2vec model. The model learns a 300-dimensional vector for each word, and we represent each description by the average of the vectors of its words. Finally, we use the pickle library to store the results in .pkl files so that, if these files already exist, the computations are not repeated. This means that after any data update the .pkl files must be deleted first.

In [None]:
# Turn every description into a list of tokens with nltk's word_tokenize
clf_df['description'] = clf_df['description'].apply(nltk.tokenize.word_tokenize)


# Train the Word2Vec model only when no saved model exists; otherwise reuse the saved one
if not os.path.exists('w2v_model.pkl'):
    # Train on the tokenized descriptions (300-dimensional vectors)
    w2v_model = Word2Vec(clf_df['description'], vector_size=300, window=5, min_count=1, workers=4)

    # Store the trained model so later runs can skip the training
    with open('w2v_model.pkl', 'wb') as file:
        pickle.dump(w2v_model, file)

else:
    # NOTE: unpickling executes code stored in the file; only load a
    # w2v_model.pkl that this notebook wrote itself
    with open('w2v_model.pkl', 'rb') as file:
        w2v_model = pickle.load(file)

# Calculate the average feature vector for each description
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean of the word vectors of the known words in `words`.

    Args:
        words: iterable of tokens.
        model: object whose ``wv[word]`` gives the vector of a word.
        vocabulary: container of the words that have a vector in `model`;
            tokens outside it are skipped.
        num_features: length of the returned vector.

    Returns:
        A float64 array of length `num_features`; all zeros when none of the
        tokens is in `vocabulary`.
    """
    # Tokens that actually have a vector
    known_words = [token for token in words if token in vocabulary]

    # Accumulate the vectors one after the other, starting from zeros
    vector_sum = np.zeros((num_features, ), dtype="float64")
    for token in known_words:
        vector_sum += model.wv[token]

    # Nothing to average: hand back the zero vector unchanged
    if not known_words:
        return vector_sum

    return vector_sum / len(known_words)


# Reuse the saved feature vectors when they exist and still match the data
description_features = None

if os.path.exists('description_features.pkl'):
    # NOTE: unpickling executes code stored in the file; only load a
    # description_features.pkl that this notebook wrote itself
    with open('description_features.pkl', 'rb') as file:
        description_features = pickle.load(file)

    # Features saved for a different set of descriptions no longer line up
    # row-for-row with clf_df (and its labels): discard them and recompute
    if len(description_features) != len(clf_df):
        description_features = None

if description_features is None:
    # Convert list of words from w2v model to a set for better performance
    vocabulary = set(w2v_model.wv.index_to_key)

    # Calculate the average for each tokenized description
    features = [average_word_vectors(tokenized_sentence, w2v_model, vocabulary, 300)
                    for tokenized_sentence in clf_df['description']]

    # Convert the list of np arrays to a DataFrame (one row per description)
    description_features = pd.DataFrame(features)

    # Save the feature vectors for later use
    with open('description_features.pkl', 'wb') as file:
        pickle.dump(description_features, file)

To split the data into train and test sets, we first need to convert the categorical values in the "*genreSingle*" column into numerical values. Then, we execute three different classification methods in the following order: Naive Bayes, SVM, and Random Forests. Specifically, for the SVM method, the GridSearchCV() function is used to select the appropriate combination of parameters from the initial set of parameters provided. After the calculations are completed, the results of the methods are printed, showing the precision, recall, and f1-score achieved by the models for each "*genre*".

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Create a label encoder object
le = LabelEncoder()

# Fit the encoder to the 'genreSingle' column and store the numeric codes
clf_df['genreSingle_encoded'] = le.fit_transform(clf_df['genreSingle'])

# Print each category number with its meaning, for reference in the following
# printings. The numbers are taken from the fitted encoder: LabelEncoder
# assigns codes in sorted label order (le.classes_), not in the frequency
# order of top_10_genres, so numbering top_10_genres would label the rows of
# the classification reports incorrectly.
for i, category in enumerate(le.classes_):
    meaning = genre_meanings.get(category, 'Unknown')
    print(f"Category {i}: {meaning}")

# Drop the original 'genreSingle' column
clf_df = clf_df.drop('genreSingle', axis=1)

# Split train and test sets (80-20) from genreSingle_encoded data
x = description_features            # averaged word vectors, one row per description
y = clf_df['genreSingle_encoded']   # the newly created numeric genreSingle_encoded column

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Naive Bayes
nb = GaussianNB()
nb.fit(x_train, y_train)
nb_predictions = nb.predict(x_test)

# Support Vector Machines
# One sub-grid per kernel: 'gamma' has no effect on the linear kernel, so
# crossing it with kernel='linear' would only repeat identical fits.
parameters = [
    {'kernel': ['linear'], 'C': [0.01, 0.1, 1, 10]},
    {'kernel': ['rbf'], 'gamma': [0.01, 0.1, 1, 10], 'C': [0.01, 0.1, 1, 10]},
]
svc = SVC()

# Use GridSearchCV (10-fold) to decide which parameters perform better;
# predict() then uses the best combination refitted on the training set
clf = GridSearchCV(svc, parameters, cv=10)
clf.fit(x_train, y_train)
svm_predictions = clf.predict(x_test)

# Random Forests
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
rf_predictions = rf.predict(x_test)

# Evaluate the three classifiers on the test set, in the order
# Naive Bayes, SVM, Random Forest: print a banner, then the classification report
reports = {}
for model_label, predictions in (
    ("Naive Bayes", nb_predictions),
    ("SVM", svm_predictions),
    ("Random Forest", rf_predictions),
):
    print(f"\n----------------------------{model_label} evaluation----------------------------\n")
    reports[model_label] = classification_report(y_test, predictions)
    print(reports[model_label])

# Keep each report available under its own name
nb_report = reports["Naive Bayes"]
svm_report = reports["SVM"]
rf_report = reports["Random Forest"]

from sklearn.model_selection import cross_validate

# 10-fold Cross Validation for each model
models = [nb, clf, rf]
model_names = ['Naive Bayes', 'SVM', 'Random Forest']

for model_name, model in zip(model_names, models):
    print("\n---------------------10-fold Cross Validation for", model_name, "---------------------\n")

    with warnings.catch_warnings():
        # A class that is never predicted in a fold makes precision undefined; silence that warning
        warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

        # Evaluate all four metrics in ONE cross-validation run: every fold is
        # fitted once and scored four times, instead of refitting the same
        # folds separately for each metric
        cv_results = cross_validate(
            model,
            description_features,
            clf_df['genreSingle_encoded'],
            cv=10,
            scoring=['precision_macro', 'recall_macro', 'f1_macro', 'accuracy'],
        )

    # Mean and spread (two standard deviations) of every metric over the 10 folds
    scores_precision = cv_results['test_precision_macro']
    print("Precision: %0.2f (+/- %0.2f)" % (scores_precision.mean(), scores_precision.std() * 2))

    scores_recall = cv_results['test_recall_macro']
    print("Recall: %0.2f (+/- %0.2f)" % (scores_recall.mean(), scores_recall.std() * 2))

    scores_f1 = cv_results['test_f1_macro']
    print("F1-score: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

    scores_accuracy = cv_results['test_accuracy']
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores_accuracy.mean(), scores_accuracy.std() * 2))
