In [4]:
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('punkt')


from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler

from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, HistGradientBoostingClassifier, RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# !pip install imbalanced-learn

### Data preprocessing

In [None]:
df = pd.read_excel("merged-armenian-books-dataset.xlsx")
df.head(200)

In [4]:
# Function to extract numeric value
def extract_numeric(price_with_currency):
    if isinstance(price_with_currency, str):
        parts = price_with_currency.split()
        numeric_part = parts[0]
        return numeric_part
    else:
        return None

# Function to extract text
def extract_text(reader):
    if isinstance(reader, str):
        return re.sub(r'\(\d+\)', '', reader).strip()
    else:
        return None

# Applying the function to the 'Price' column
df['Price'] = df['Price'].apply(extract_numeric)

# Converting the 'Price' column to numeric type, coercing errors to NaN
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

# Converting numeric values to integer type
df['Price'] = df['Price'].astype('Int64')

# Converting the 'Pages' column to numeric type, coercing errors to NaN
df['Pages'] = pd.to_numeric(df['Pages'], errors='coerce')

# Converting numeric values to integer type
df['Pages'] = df['Pages'].astype('Int64')

# Applying the function to the 'Reader' column
df['Reader'] = df['Reader'].apply(extract_text)

df['Number of Ratings'] = df['Number of Ratings'].apply(extract_numeric)

# Removing non-numeric characters (including commas) and convert to integer
df['Number of Ratings'] = df['Number of Ratings'].str.replace(r'\D', '', regex=True).astype('Int64')

df['Number of reviews'] = df['Number of reviews'].apply(extract_numeric)

df['Number of reviews'] = df['Number of reviews'].str.replace(r'\D', '', regex=True).astype('Int64')

df["Author"] = df["Author"].apply(extract_text)

# Removing content in brackets
df['Title'] = df['Title'].str.replace(r'\([^()]*\)', '', regex=True)

df["Description"] = df["Description"].str.replace(r'\([^()]*\)', '', regex=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6640 entries, 0 to 6639
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              6640 non-null   object 
 1   Author             6581 non-null   object 
 2   Price              1257 non-null   Int64  
 3   More Info          332 non-null    object 
 4   Description        5899 non-null   object 
 5   Genre              6640 non-null   object 
 6   Publisher          1629 non-null   object 
 7   Pages              1566 non-null   Int64  
 8   Reader             871 non-null    object 
 9   Avg. Rating        2719 non-null   float64
 10  Number of Ratings  2719 non-null   Int64  
 11  Number of reviews  2719 non-null   Int64  
dtypes: Int64(4), float64(1), object(7)
memory usage: 648.6+ KB


In [5]:
# Converting the 'More Info' column to strings
df['More Info'] = df['More Info'].astype(str)

# Extracting information for 'Publisher', 'Year of Publishing', 'Language', and 'Age group'
df['Publisher'] = df['More Info'].str.extract(r'ւթյուն-(.*?)-EAN', expand=False)
df['Year of Publishing'] = df['More Info'].str.extract(r'Տարեթիվ-(\d{4})-', expand=False)
df['Language'] = df['More Info'].str.extract(r'Լեզու-(.*?)-', expand=False)
df['Age group'] = df['More Info'].str.extract(r'Տարիք-(.*?)(?:-|$)', expand=False)
df['Pages'] = df['More Info'].str.extract(r'Էջեր-(\d+)', expand=False)

# Filling missing values with NaN
df[['Publisher', 'Year of Publishing', 'Language', 'Age group', 'Pages']] = df[['Publisher', 'Year of Publishing', 'Language', 'Age group', 'Pages']].replace('', pd.NA)

# Droping the 'More Info' column as it's no longer needed
df.drop(columns=['More Info'], inplace=True)

In [6]:
df.head(30)

Unnamed: 0,Title,Author,Price,Description,Genre,Publisher,Pages,Reader,Avg. Rating,Number of Ratings,Number of reviews,Year of Publishing,Language,Age group
0,Վեպ,Հովակիմյան Հովհաննես,,Հիսուսի կատուն,Ժամանակակից գրականություն,,,,,,,,,
1,«Ամառ առանց լուսաբաց»-ի թղթե թղթե թերթիկ,Agop J. Hacikyan,,Միջազգային բեսթսելլերի վերաթողարկում՝ ի հիշատա...,Վեպ,,,,4.4,78.0,13.0,,,
2,«Հինգի ակումբի» ճանապարհորդությունը,Էնիդ Բլայտոն,,««Հինգի ակումբի» ճանապարհորդությունը» անգլիացի...,Մանկական գրականություն,,,Կարինե Հովհաննիսյան,,,,,,
3,«Սրտով մարդը լեռնաշխարհում» և այլ վաղ պատմությ...,Վիլյամ Սարոյան,,«Սրտով մարդը լեռնաշխարհում» գրքում հավաքված են...,Դասական գրականություն,,,,4.31,218.0,11.0,,,
4,«Փարիզի ժամերի» կոշտ կազմը,Alex George,,Մի օր լույսերի քաղաքում Մի գիշեր կորցրած ժաման...,Ժամանակակից գրականություն,,,,3.7,14189.0,1991.0,,,
5,10 հայ ականավոր թագուհիներ,Արտակ Մովսիսյան,,,Պատմվածք,,,,4.44,9.0,3.0,,,
6,100 Սոցիալական Նորարարություններ Ֆինլանդիայից ...,Ilkka Taipale,,"Կարելի է մտածել, թե ինչ ընդհանուր բան ունեն խո...",Ոչ գեղարվեստական գրականություն,,,,3.55,99.0,14.0,,,
7,12-ից հետո կհանդիպենք,Պետրոսյան Խ․,,Խեղված ողնաշարը երկրի,Ժամանակակից գրականություն,,,,,,,,,
8,12-ից հետո կհանդիպենք,Պետրոսյան Խ․,,12-ից հետո կհանդիպենք,Ժամանակակից գրականություն,,,,,,,,,
9,150 Հոբելյանական ընտրանի,Հովհաննես Թումանյան,6900.0,«Ընտրանին» պատրաստվել է հայ գրականության դասակ...,Դասական գրականություն,,,,,,,,,


In [7]:
# Finding duplicates based on both 'Title' and 'Author'
duplicate_combination = df.duplicated(subset=['Title', 'Author'], keep=False)

# Subseting the DataFrame to show only the duplicates
duplicates_df_combination = df[duplicate_combination]

duplicates_df_combination

Unnamed: 0,Title,Author,Price,Description,Genre,Publisher,Pages,Reader,Avg. Rating,Number of Ratings,Number of reviews,Year of Publishing,Language,Age group
7,12-ից հետո կհանդիպենք,Պետրոսյան Խ․,,Խեղված ողնաշարը երկրի,Ժամանակակից գրականություն,,,,,,,,,
8,12-ից հետո կհանդիպենք,Պետրոսյան Խ․,,12-ից հետո կհանդիպենք,Ժամանակակից գրականություն,,,,,,,,,
16,20000 լյո ջրի տակ,Ժյուլ Վեռն,,Վերադարձ,Դասական գրականություն,,,,,,,,,
17,20000 լյո ջրի տակ,Ժյուլ Վեռն,,Սիդհարթա,Դասական գրականություն,,,,,,,,,
166,Ագնես,Պետեր Շտամ,2400,Սովորական թվացող սիրավեպն անկանխատեսելի ընթացք...,Սիրավեպ,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6554,Ջերմ մարմիններ,Isaac Marion,,Այժմ հիմնական շարժանկարը Summit Entertainment-...,Սիրավեպ,,,,3.95,45932,6493,,,
6579,Մեռած որպես դռան մեխ,Charlaine Harris,,Smalltown կոկտեյլ մատուցողուհի Սուկի Սթեքհաուս...,Սիրավեպ,,,,3.88,581723,13910,,,
6591,Այս սնամեջ ուխտերը,Lexi Ryan,,New York Times-ի բեսթսելերների հեղինակ Լեքսի Ռ...,Սիրավեպ,,,,4.20,58309,2681,,,
6602,Լուսին կանչեց,Patricia Briggs,,Mercedes Thompson մականունով Mercy-ը Volkswage...,Սիրավեպ,,,,3.80,254592,7324,,,


In [8]:
# Finding duplicates based only on 'Title'
duplicate_title = df.duplicated(subset=['Title'], keep=False)

# Subseting the DataFrame to show only the duplicates
duplicates_df_title = df[duplicate_title]

duplicates_df_title

Unnamed: 0,Title,Author,Price,Description,Genre,Publisher,Pages,Reader,Avg. Rating,Number of Ratings,Number of reviews,Year of Publishing,Language,Age group
7,12-ից հետո կհանդիպենք,Պետրոսյան Խ․,,Խեղված ողնաշարը երկրի,Ժամանակակից գրականություն,,,,,,,,,
8,12-ից հետո կհանդիպենք,Պետրոսյան Խ․,,12-ից հետո կհանդիպենք,Ժամանակակից գրականություն,,,,,,,,,
16,20000 լյո ջրի տակ,Ժյուլ Վեռն,,Վերադարձ,Դասական գրականություն,,,,,,,,,
17,20000 լյո ջրի տակ,Ժյուլ Վեռն,,Սիդհարթա,Դասական գրականություն,,,,,,,,,
28,731 օր քեզ համար,Տա Թևեր,,Տիեզերական սագա,Ժամանակակից գրականություն,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6579,Մեռած որպես դռան մեխ,Charlaine Harris,,Smalltown կոկտեյլ մատուցողուհի Սուկի Սթեքհաուս...,Սիրավեպ,,,,3.88,581723,13910,,,
6591,Այս սնամեջ ուխտերը,Lexi Ryan,,New York Times-ի բեսթսելերների հեղինակ Լեքսի Ռ...,Սիրավեպ,,,,4.20,58309,2681,,,
6602,Լուսին կանչեց,Patricia Briggs,,Mercedes Thompson մականունով Mercy-ը Volkswage...,Սիրավեպ,,,,3.80,254592,7324,,,
6629,Ընտրություն,Նիկոլաս Սփարկս,,1 Նյու Յորք Թայմսի բեսթսելլեր Նիկոլաս Սփարկսը ...,Սիրավեպ,,,,4.04,49514,1730,,,


In [10]:
df['Pages'] = df['Pages'].astype('Int64')
df['Year of Publishing'] = df['Year of Publishing'].astype('Int64')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6640 entries, 0 to 6639
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               6640 non-null   object 
 1   Author              6581 non-null   object 
 2   Price               1257 non-null   Int64  
 3   Description         5899 non-null   object 
 4   Genre               6640 non-null   object 
 5   Publisher           315 non-null    object 
 6   Pages               271 non-null    Int64  
 7   Reader              871 non-null    object 
 8   Avg. Rating         2719 non-null   float64
 9   Number of Ratings   2719 non-null   Int64  
 10  Number of reviews   2719 non-null   Int64  
 11  Year of Publishing  332 non-null    Int64  
 12  Language            331 non-null    object 
 13  Age group           329 non-null    object 
dtypes: Int64(5), float64(1), object(8)
memory usage: 758.8+ KB


In [11]:
unique_age_groups = df['Age group'].unique()
print(unique_age_groups)

[nan '16+' '12+' '14+' '8+' '6+' '10+' '18+' '17+']


In [12]:
# Removing the '+' sign from the 'Age group' column and converting it to integer
df['Age group'] = df['Age group'].str.replace('+', '').astype(float).astype('Int64')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6640 entries, 0 to 6639
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               6640 non-null   object 
 1   Author              6581 non-null   object 
 2   Price               1257 non-null   Int64  
 3   Description         5899 non-null   object 
 4   Genre               6640 non-null   object 
 5   Publisher           315 non-null    object 
 6   Pages               271 non-null    Int64  
 7   Reader              871 non-null    object 
 8   Avg. Rating         2719 non-null   float64
 9   Number of Ratings   2719 non-null   Int64  
 10  Number of reviews   2719 non-null   Int64  
 11  Year of Publishing  332 non-null    Int64  
 12  Language            331 non-null    object 
 13  Age group           329 non-null    Int64  
dtypes: Int64(6), float64(1), object(7)
memory usage: 765.3+ KB


### Doing word2vec conversion

In [15]:
# Preprocessing text data (tokenization and lowercasing)
text_columns = ['Title', 'Author', 'Description', 'Publisher', 'Reader', 'Language']
for column in text_columns:
    df[column] = df[column].apply(lambda x: word_tokenize(str(x).lower()))

# Training Word2Vec models for each text column
word2vec_models = {}
for column in text_columns:
    word2vec_model = Word2Vec(sentences=df[column], vector_size=1, window=5, min_count=1, workers=4)
    word2vec_models[column] = word2vec_model

# Vectorizing text data for each column
for column, word2vec_model in word2vec_models.items():
    def document_vector(doc):
        vecs = [word2vec_model.wv[word] for word in doc if word in word2vec_model.wv]
        # If no vectors found (e.g., all words are out-of-vocabulary), return NaNs
        if not vecs:
            return [np.nan] * 1  # Assuming vector size is 100
        return np.mean(vecs, axis=0)  # Average word vectors

    # Creating separate columns for each element of the vector
    vectors = df[column].apply(document_vector).apply(pd.Series)
    vectors.columns = [f"{column}_Vec_{i}" for i in range(1)]  # Assuming vector size is 100
    df = pd.concat([df, vectors], axis=1)

# Dropping the original text columns
df.drop(columns=text_columns, inplace=True)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6640 entries, 0 to 6639
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Price               1257 non-null   Int64  
 1   Genre               6640 non-null   object 
 2   Pages               271 non-null    Int64  
 3   Avg. Rating         2719 non-null   float64
 4   Number of Ratings   2719 non-null   Int64  
 5   Number of reviews   2719 non-null   Int64  
 6   Year of Publishing  332 non-null    Int64  
 7   Age group           329 non-null    Int64  
 8   Title_Vec_0         6640 non-null   float32
 9   Author_Vec_0        6640 non-null   float32
 10  Description_Vec_0   6640 non-null   float32
 11  Publisher_Vec_0     6640 non-null   float32
 12  Reader_Vec_0        6640 non-null   float32
 13  Language_Vec_0      6640 non-null   float32
dtypes: Int64(6), float32(6), float64(1), object(1)
memory usage: 609.7+ KB


In [17]:
df_copy = df.copy()

In [18]:
df.isnull().sum()

Price                 5383
Genre                    0
Pages                 6369
Avg. Rating           3921
Number of Ratings     3921
Number of reviews     3921
Year of Publishing    6308
Age group             6311
Title_Vec_0              0
Author_Vec_0             0
Description_Vec_0        0
Publisher_Vec_0          0
Reader_Vec_0             0
Language_Vec_0           0
dtype: int64

### Experimenting with different models

In [19]:
# Step 1: Handling missing values (Imputation)
# Dropping rows with missing values in the "Genre" column
df_cleaned = df.dropna(subset=['Genre'])

# Imputing missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(df_cleaned.drop(columns=['Genre']))

# Converting the numpy array back to a DataFrame
X_imputed = pd.DataFrame(X_imputed, columns=df_cleaned.drop(columns=['Genre']).columns)

# Step 2: Spliting the data into features (X) and target variable (y)
y = df_cleaned['Genre']

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Step 4: Choosing a model
model = RandomForestClassifier()  

# Step 5: Training the model
model.fit(X_train, y_train)

# Step 6: Evaluating the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5903614457831325


In [21]:
# Step 1: Handling missing values (Imputation)
# Dropping rows with missing values in the "Genre" column
# df_cleaned = df.dropna(subset=['Genre'])

# # Imputing missing values with the mean of each column
# imputer = SimpleImputer(strategy='median')
# X_imputed = imputer.fit_transform(df_cleaned.drop(columns=['Genre']))

# # Converting the numpy array back to a DataFrame
# X_imputed = pd.DataFrame(X_imputed, columns=df_cleaned.drop(columns=['Genre']).columns)

# # Step 2: Spliting the data into features (X) and target variable (y)
# y = df_cleaned['Genre']

# # Step 3: Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Initializing models
models = {
    "BaggingClassifier": BaggingClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    # StackingClassifier and VotingClassifier need estimators
    "StackingClassifier": StackingClassifier(estimators=[
        ('bagging', BaggingClassifier()),
        ('rf', RandomForestClassifier())
    ]),
    "VotingClassifier": VotingClassifier(estimators=[
        # ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', tree_method='gpu_hist', gpu_id=0)),
        ('bagging', BaggingClassifier()),
        ('rf', RandomForestClassifier())
    ])
}

# Training and evaluating each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy}")

BaggingClassifier Accuracy: 0.5609939759036144
RandomForestClassifier Accuracy: 0.5881024096385542


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


StackingClassifier Accuracy: 0.6129518072289156
VotingClassifier Accuracy: 0.5572289156626506


In [23]:
# # Step 1: Handling missing values (Imputation)
# df_cleaned = df.dropna(subset=['Genre'])

# # Imputing missing values with the median of each column
# imputer = SimpleImputer(strategy='median')
# X_imputed = imputer.fit_transform(df_cleaned.drop(columns=['Genre']))

# # Converting the numpy array back to a DataFrame
# X_imputed = pd.DataFrame(X_imputed, columns=df_cleaned.drop(columns=['Genre']).columns)

# # Step 2: Spliting the data into features (X) and target variable (y)
# y = df_cleaned['Genre']

# Encoding the target labels using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y_encoded, test_size=0.2, random_state=42)

# Building the FFNN model with additional layers, larger neurons, and different activation functions
model = Sequential([
    Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dense(512, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.2),
    BatchNormalization(),
    Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.2),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compiling the model with a lower learning rate and Adam optimizer
optimizer = Adam(learning_rate=0.01) 
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training the model for more epochs with early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping])

# Evaluating the model
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=-1)
accuracy = accuracy_score(y_test, y_pred_classes)
print("FFNN Accuracy:", accuracy)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
FFNN Accuracy: 0.4457831325301205


In [27]:
# Defining the FFNN model class
class FFNN(nn.Module):
    def __init__(self, input_size, output_size):
        super(FFNN, self).__init__()
        self.linear1 = nn.Linear(input_size, 512)
        self.linear2 = nn.Linear(512, 256)
        self.linear3 = nn.Linear(256, 128)
        self.linear4 = nn.Linear(128, output_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = self.dropout(x)
        x = F.relu(self.linear2(x))
        x = self.dropout(x)
        x = F.relu(self.linear3(x))
        x = self.dropout(x)
        x = self.linear4(x)
        return F.softmax(x, dim=1)

# # Step 1: Handling missing values (Imputation)
# df_cleaned = df.dropna(subset=['Genre'])

# # Imputing missing values with the median of each column
# imputer = SimpleImputer(strategy='median')
# X_imputed = imputer.fit_transform(df_cleaned.drop(columns=['Genre']))

# # Converting the numpy array back to a DataFrame
# X_imputed = pd.DataFrame(X_imputed, columns=df_cleaned.drop(columns=['Genre']).columns)

# # Step 2: Spliting the data into features (X) and target variable (y)
# y = df_cleaned['Genre']

# # Encoding the target labels using LabelEncoder
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# # Step 3: Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X_imputed, y_encoded, test_size=0.2, random_state=42)

# Converting data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values.astype('float32'))
y_train_tensor = torch.tensor(y_train)
X_test_tensor = torch.tensor(X_test.values.astype('float32'))
y_test_tensor = torch.tensor(y_test)

# Initializing the model
model = FFNN(X_train.shape[1], len(label_encoder.classes_))

# Defining loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.02)  

# Training the model with early stopping
epochs = 100
batch_size = 32
best_accuracy = 0.0
patience = 5
counter = 0

for epoch in range(epochs):
    for i in range(0, len(X_train), batch_size):
        inputs = X_train_tensor[i:i+batch_size]
        targets = y_train_tensor[i:i+batch_size]

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # Evaluating the model
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test_tensor.numpy(), predicted.numpy())
        print(f"Epoch [{epoch+1}/{epochs}], Validation Accuracy: {accuracy:.4f}")

        # Checking for improvement in accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            counter = 0
        else:
            counter += 1

        # Early stopping if the accuracy doesn't improve for a certain number of epochs
        if counter >= patience:
            print("Early stopping!")
            break

print("Best Validation Accuracy:", best_accuracy)

Epoch [1/100], Validation Accuracy: 0.1596
Epoch [2/100], Validation Accuracy: 0.1604
Epoch [3/100], Validation Accuracy: 0.1596
Epoch [4/100], Validation Accuracy: 0.1596
Epoch [5/100], Validation Accuracy: 0.1589
Epoch [6/100], Validation Accuracy: 0.1596
Epoch [7/100], Validation Accuracy: 0.1596
Early stopping!
Best Validation Accuracy: 0.16039156626506024


In [None]:
# Defining a custom classifier class compatible with scikit-learn
class KerasClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, input_shape, output_size, learning_rate=0.001, dropout_rate=0.2, l2_penalty=0.001, epochs=50, batch_size=32):
        self.input_shape = input_shape
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.dropout_rate = dropout_rate
        self.l2_penalty = l2_penalty
        self.epochs = epochs
        self.batch_size = batch_size

    def fit(self, X, y):
        self.model = self._create_model()
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        return self.model.predict(X).argmax(axis=-1)

    def _create_model(self):
        model = Sequential([
            Dense(1024, activation='relu', input_shape=self.input_shape),
            BatchNormalization(),
            Dropout(self.dropout_rate),
            Dense(512, activation='relu', kernel_regularizer=l2(self.l2_penalty)),
            BatchNormalization(),
            Dropout(self.dropout_rate),
            Dense(256, activation='relu', kernel_regularizer=l2(self.l2_penalty)),
            Dense(self.output_size, activation='softmax')
        ])

        optimizer = Adam(learning_rate=self.learning_rate)
        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model

    def score(self, X, y):
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

# Step 1: Handling missing values (Imputation)
# df_cleaned = df.dropna(subset=['Genre'])

# # Imputing missing values with the median of each column
# imputer = SimpleImputer(strategy='median')
# X_imputed = imputer.fit_transform(df_cleaned.drop(columns=['Genre']))

# # Converting the numpy array back to a DataFrame
# X_imputed = pd.DataFrame(X_imputed, columns=df_cleaned.drop(columns=['Genre']).columns)

# # Step 2: Spliting the data into features (X) and target variable (y)
# y = df_cleaned['Genre']

# # Encoding the target labels using LabelEncoder
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# # Step 3: Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X_imputed, y_encoded, test_size=0.2, random_state=42)

# Defining input shape and output size
input_shape = (X_train.shape[1],)
output_size = len(label_encoder.classes_)

# Defining hyperparameters to tune
param_grid = {
    'learning_rate': [0.001, 0.0001],
    'dropout_rate': [0.2, 0.3],
    'l2_penalty': [0.001, 0.0001],
    'epochs': [50, 100],
    'batch_size': [16, 32]
}

# Performing random search for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=KerasClassifierWrapper(input_shape, output_size), param_distributions=param_grid, n_iter=10, cv=3, verbose=2)
random_search_result = random_search.fit(X_train, y_train)

# Printing best parameters and accuracy
print("Best Parameters:", random_search_result.best_params_)
print("Best Accuracy:", random_search_result.best_score_)

# Evaluating the best model
best_model = random_search_result.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Best Model Accuracy:", accuracy)

Fitting 3 folds for each of 10 candidates, totalling 30 fits




[CV] END batch_size=32, dropout_rate=0.3, epochs=100, l2_penalty=0.001, learning_rate=0.001; total time= 2.4min
[CV] END batch_size=32, dropout_rate=0.3, epochs=100, l2_penalty=0.001, learning_rate=0.001; total time= 2.3min
[CV] END batch_size=32, dropout_rate=0.3, epochs=100, l2_penalty=0.001, learning_rate=0.001; total time= 2.4min
[CV] END batch_size=32, dropout_rate=0.2, epochs=50, l2_penalty=0.0001, learning_rate=0.0001; total time= 1.1min
[CV] END batch_size=32, dropout_rate=0.2, epochs=50, l2_penalty=0.0001, learning_rate=0.0001; total time= 1.4min
[CV] END batch_size=32, dropout_rate=0.2, epochs=50, l2_penalty=0.0001, learning_rate=0.0001; total time= 1.4min
[CV] END batch_size=16, dropout_rate=0.2, epochs=50, l2_penalty=0.0001, learning_rate=0.0001; total time= 2.4min
[CV] END batch_size=16, dropout_rate=0.2, epochs=50, l2_penalty=0.0001, learning_rate=0.0001; total time= 1.8min
[CV] END batch_size=16, dropout_rate=0.2, epochs=50, l2_penalty=0.0001, learning_rate=0.0001; total

In [28]:
# Step 1: Handling missing values (Imputation)
# Dropping rows with missing values in the "Genre" column
# df_cleaned = df.dropna(subset=['Genre'])

# # Imputing missing values with the mean of each column
# imputer = SimpleImputer(strategy='mean')
# X_imputed = imputer.fit_transform(df_cleaned.drop(columns=['Genre']))

# # Converting the numpy array back to a DataFrame
# X_imputed = pd.DataFrame(X_imputed, columns=df_cleaned.drop(columns=['Genre']).columns)

# # Step 2: Spliting the data into features (X) and target variable (y)
# y = df_cleaned['Genre']

# # Step 3: Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.15, random_state=42)

# Step 4: Choosing a model
model = LogisticRegression(max_iter=1000)  

# Step 5: Training the model
model.fit(X_train, y_train)

# Step 6: Evaluating the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.34839357429718876


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# Step 1: Handling missing values (Imputation)
print("Handling missing values...")
df_cleaned = df.dropna(subset=['Genre'])

# Imputing missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(df_cleaned.drop(columns=['Genre']))

# Converting the numpy array back to a DataFrame
X_imputed = pd.DataFrame(X_imputed, columns=df_cleaned.drop(columns=['Genre']).columns)

# Step 2: Spliting the data into features (X) and target variable (y)
print("Splitting data into features and target variable...")
X = X_imputed  # Features
y = df_cleaned['Genre']  # Target variable

# Encoding the target labels using LabelEncoder
print("Encoding target labels...")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step 3: Spliting the data into training and testing sets
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 4: Handling data imbalance using oversampling
print("Handling data imbalance using oversampling...")
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Step 5: Training the Logistic Regression model
print("Training the Logistic Regression model...")
model = LogisticRegression(max_iter=200)
model.fit(X_train_resampled, y_train_resampled)

# Step 6: Evaluating the model
print("Evaluating the model...")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Handling missing values...
Splitting data into features and target variable...
Encoding target labels...
Splitting data into training and testing sets...
Handling data imbalance using oversampling...
Training the Logistic Regression model...
Evaluating the model...
Accuracy: 0.08885542168674698


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Armenian word2vec transformation part

In [None]:
!pip install gitpython
!pip install catboost

In [None]:
import git
import os
import joblib
import pickle
import pandas as pd
import numpy as np

from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
df = pd.read_excel("merged-armenian-books-dataset.xlsx")

In [None]:
# URL of the GitHub repository
repo_url = "https://github.com/YerevaNN/word2vec-armenian-wiki.git"

# Directory where it will clone the repository
clone_dir = "./word2vec-armenian-wiki"

# Cloning the repository
git.Repo.clone_from(repo_url, clone_dir)

print("Repository cloned successfully!")

In [2]:
%cd /content/word2vec-armenian-wiki

In [None]:
package_dir = "/content/word2vec-armenian-wiki"
contents = os.listdir(package_dir)
print(contents)

In [None]:
analogies_dir = os.path.join(package_dir, 'analogies')
analogies_contents = os.listdir(analogies_dir)
print(analogies_contents)

In [None]:
directories = ['plotfiles', 'analogies', package_dir]
for directory in directories:
    print(f"Contents of {directory}:")
    contents = os.listdir(os.path.join(package_dir, directory))
    print(contents)
    print()

In [None]:
# Loading the pretrained Word2Vec model
model_path = "/content/word2vec-armenian-wiki/plotfiles/s100w8n5a0.025i7skipgram.bin"
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [None]:
# Defining function to convert tokens to word vectors using the loaded model
def tokens_to_vectors(tokens):
    vectors = []
    for token in tokens:
        if token in model:
            vectors.append(model[token])
    return vectors

# Defining the text columns to transform
text_columns = ['Title', 'Author', 'Description', 'Publisher', 'Reader', 'Language']

# Iterating over text columns, tokenize, converting tokens to word vectors, and aggregating word vectors
for column in text_columns:
    # Tokenizing text data
    df[column + '_tokens'] = df[column].apply(lambda x: word_tokenize(str(x)))
    # Converting tokens to word vectors
    df[column + '_vectors'] = df[column + '_tokens'].apply(tokens_to_vectors)
    # Aggregating word vectors
    df[column + '_vector'] = df[column + '_vectors'].apply(lambda vectors: np.mean(vectors, axis=0) if vectors else np.nan)
    # Dropping original text columns
    df.drop([column, column + '_tokens', column + '_vectors'], axis=1, inplace=True)

df.head(100)

In [4]:
df.info()

In [None]:
genre_counts = df['Genre'].value_counts()
print(genre_counts)

In [None]:
df_new = df.copy()

In [None]:
# Dropping rows with null target values
df.dropna(subset=['Genre'], inplace=True)

# Encoding the target variable (Genre)
label_encoder = LabelEncoder()
df['Genre_encoded'] = label_encoder.fit_transform(df['Genre'])

# Defining features and target variable
vector_columns = ['Title_vector', 'Author_vector', 'Description_vector', 'Publisher_vector', 'Reader_vector', 'Language_vector']
features = df[vector_columns]
target = df['Genre_encoded']

# Spliting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Defining the CatBoost model
model = CatBoostClassifier()

# Training the model
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculating multiclass AUC score
multiclass_auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovo')
print("Multiclass AUC Score:", multiclass_auc)

### The same code as above except the data is based on top 15 genres

In [None]:
# Determining top 15 genres
top15_genres = df['Genre'].value_counts().nlargest(15).index.tolist()

# Filtering dataset to include only top 15 genres
df_top15 = df[df['Genre'].isin(top15_genres)].copy()

# Dropping rows with null target values (Genre)
df.dropna(subset=['Genre'], inplace=True)

# Spliting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Converting aggregated word vectors to strings
vector_columns = ['Title_vector', 'Author_vector', 'Description_vector', 'Publisher_vector', 'Reader_vector', 'Language_vector']

# Encoding the target variable (Genre)
label_encoder = LabelEncoder()
df_top15['Genre_encoded'] = label_encoder.fit_transform(df_top15['Genre'])

# Defining features and target variable
features = df_top15[vector_columns]
target = df_top15['Genre_encoded']

# Spliting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Defining the CatBoost model
model = CatBoostClassifier()

# Training the model
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculating multiclass AUC score
multiclass_auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovo')
print("Multiclass AUC Score:", multiclass_auc)

In [None]:
# Determining top 15 genres
top15_genres = df_new['Genre'].value_counts().nlargest(15).index.tolist()

# Filtering dataset to include only top 15 genres
df_top15 = df_new[df_new['Genre'].isin(top15_genres)].copy()

# Converting text columns to strings
text_columns = ['Title_vector', 'Description_vector', 'Author_vector', 'Publisher_vector']
for col in text_columns:
    df_top15[col] = df_top15[col].fillna('').astype(str)

# Combining all relevant text columns into a single text column
df_top15['Text'] = df_top15[text_columns].apply(lambda x: ' '.join(x), axis=1)

# Applying Tf-idf encoding
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_top15['Text'])

# Saving the TfidfVectorizer
with open("tfidf_vectorizer-arm.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)

# Spliting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df_top15['Genre'], test_size=0.2, random_state=25)

# Defining CatBoostClassifier with GPU acceleration
catboost_classifier = CatBoostClassifier(
    iterations=2000,  # Decreased iterations for faster training
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    random_strength=1,
    border_count=32,
    eval_metric='AUC',
    task_type='GPU',  # Enable GPU acceleration
    verbose=0
)

# Training the model with validation set
catboost_classifier.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose_eval=False
)

# Saving the trained model
joblib.dump(catboost_classifier, "catboost-model-armword2vec.pkl")

# Making predictions
catboost_pred = catboost_classifier.predict(X_test)

# Evaluating accuracy
accuracy = accuracy_score(y_test, catboost_pred)
print("CatBoostClassifier Accuracy:", accuracy)

# Determining top 5 classes
top5_classes = df_top15['Genre'].value_counts().nlargest(5).index.tolist()
print("Top 5 Classes:", top5_classes)

# Evaluating AUC
auc = roc_auc_score(y_test, catboost_classifier.predict_proba(X_test), multi_class='ovo')
print("CatBoostClassifier AUC:", auc)