In [None]:
# !pip install pandas openpyxl langdetect googletrans==4.0.0-rc1
# !pip install langdetect
# !pip install googletrans
# !pip install langid
# !pip install deep_translator
# !pip install translate
# !python -m spacy download en_core_web_md

In [3]:
import pandas as pd
import re
import ast
import spacy
import time
import langid
from langdetect import detect
from googletrans import Translator
from deep_translator import GoogleTranslator, MyMemoryTranslator
from sklearn.metrics.pairwise import cosine_similarity

### English-Russian books in overall dataset

In [None]:
# Function to preprocess text
def preprocess_text(text):
    # Remove special characters and symbols except ':', and remove '#'
    cleaned_text = re.sub(r'[^\w\s:]', '', text)
    return cleaned_text.strip()

# Function to translate text to Armenian with retry logic
def translate_to_armenian(text, max_retries=3):
    translator = Translator()
    num_retries = 0
    while num_retries < max_retries:
        try:
            if pd.notnull(text) and text.strip():
                # Preprocessing the text
                cleaned_text = preprocess_text(text)
                # Detecting language
                lang = detect(cleaned_text)
                # Translating to Armenian if the detected language is English or Russian
                if lang in ['en', 'ru']:
                    translation = translator.translate(cleaned_text, src=lang, dest='hy')
                    return translation.text
                else:
                    return text
            else:
                return text
        except Exception as e:
            print(f"Translation failed for '{text}': {e}")
            num_retries += 1
            time.sleep(2)  # Waiting for 2 seconds before retrying
    return text  # Returning original text if max retries exceeded

# Loading the Excel file
excel_path = 'armenian-books-data-translate.xlsx'
data = pd.read_excel(excel_path)

# Translating the 'Title', 'Author', 'Description' columns to Armenian
data['Title'] = data['Title'].apply(lambda x: translate_to_armenian(x, max_retries=3))
data['Author'] = data['Author'].apply(lambda x: translate_to_armenian(x, max_retries=3))
data['Description'] = data['Description'].apply(lambda x: translate_to_armenian(x, max_retries=3))

# Saving the translated data to a new Excel file
output_excel_path = 'translated-books.xlsx'
data.to_excel(output_excel_path, index=False)

Translation failed for '(Չ)Վեպ': No features in text.
Translation failed for '(Չ)Վեպ': No features in text.
Translation failed for '(Չ)Վեպ': No features in text.
Translation failed for '1984': No features in text.
Translation failed for '1984': No features in text.
Translation failed for '1984': No features in text.
Translation failed for '987': No features in text.
Translation failed for '987': No features in text.
Translation failed for '987': No features in text.
Translation failed for '987': No features in text.
Translation failed for '987': No features in text.
Translation failed for '987': No features in text.
Translation failed for 'Армянское счастье, или Павликянк. Том 1': The read operation timed out
Translation failed for 'Ջորջ Օրուել': No features in text.
Translation failed for 'Ջորջ Օրուել': No features in text.
Translation failed for 'Ջորջ Օրուել': No features in text.
Translation failed for 'Օրուել Ջ.': No features in text.
Translation failed for 'Օրուել Ջ.': No features

### Defining paths to goodreads scraped datasets from genres : horror, romance, fantasy and mystery

In [None]:
excel_path_mystery = 'goodreads-mystery.xlsx'
excel_path_romance = 'goodreads-romance.xlsx'
excel_path_horror = 'goodreads-horror.xlsx'
excel_path_fantasy = 'goodreads-fantasy.xlsx'

### Goodreads mystery books data translation

In [None]:
# Loading the Excel file and converting specific columns to string
data_cleaned = pd.read_excel(excel_path_mystery, dtype={'Title': str, 'Author': str, 'Description': str})

# Translating the 'Title', 'Author', 'Description' columns to Armenian
data_cleaned['Title'] = data_cleaned['Title'].apply(lambda x: translate_to_armenian(x, max_retries=2))
data_cleaned['Author'] = data_cleaned['Author'].apply(lambda x: translate_to_armenian(x, max_retries=2))
data_cleaned['Description'] = data_cleaned['Description'].apply(lambda x: translate_to_armenian(x, max_retries=2))

# Saving the translated data to a new Excel file
output_excel_path = 'translated-mystery-books.xlsx'
data_cleaned.to_excel(output_excel_path, index=False)

### Goodreads romance books data translation in two steps

In [None]:
# Loading the Excel file and converting specific columns to string
data_cleaned = pd.read_excel(excel_path_romance, dtype={'Title': str, 'Author': str, 'Description': str})

# Removing content within brackets in the 'Title' column
data_cleaned['Title'] = data_cleaned['Title'].str.replace(r'\([^()]*\)', '', regex=True)

# Translating the 'Title', 'Author', 'Description' columns to Armenian
data_cleaned['Title'] = data_cleaned['Title'].apply(lambda x: translate_to_armenian(x, max_retries=2))
data_cleaned['Author'] = data_cleaned['Author'].apply(lambda x: translate_to_armenian(x, max_retries=2))
data_cleaned['Description'] = data_cleaned['Description'].apply(lambda x: translate_to_armenian(x, max_retries=2))

# Saving the translated data to a new Excel file
output_excel_path = 'translated-romance-books.xlsx'
data_cleaned.to_excel(output_excel_path, index=False)

In [None]:
# Function to preprocess text and remove numbers and symbols
def preprocess_text_new(text):
    # Removing numbers
    text_without_numbers = re.sub(r'\d+', '', text)
    # Removing special characters and symbols except ':', and remove '#'
    cleaned_text = re.sub(r'[^\w\s:]', '', text_without_numbers)
    return cleaned_text.strip()

# Function to translate text to Armenian with retry logic
def translate_to_armenian_batch(text_list, max_retries=2, batch_size=10):
    num_retries = 0
    translations = []
    while num_retries < max_retries:
        try:
            if text_list:
                # If Google Translate fails, using MyMemoryTranslator as a backup
                translator = MyMemoryTranslator(source='english', target='armenian')
                # Batch translation requests to avoid making too many requests
                for i in range(0, len(text_list), batch_size):
                    batch_text = text_list[i:i+batch_size]
                    batch_translations = translator.translate_batch(batch_text)
                    translations.extend(batch_translations)
                return translations, True  # Returning translations and a flag indicating success
            else:
                return text_list, False  # Returning original text and failure flag
        except Exception as e:
            print(f"MyMemory Translate batch failed: {e}")
            num_retries += 1
            time.sleep(2)  # Waiting for 2 seconds before retrying
    return text_list, False  # Returning original text and failure flag if max retries exceeded

# Loading the Excel file and convert specific columns to string
excel_path = 'translated-romance-books.xlsx'
data_cleaned = pd.read_excel(excel_path, dtype={'Title': str, 'Author': str, 'Description': str})

# Preprocessing and removing numbers and symbols from the 'Title' column
data_cleaned['Title'] = data_cleaned['Title'].apply(lambda x: preprocess_text_new(x))

# Tracking the count of successful translations
success_count = 0

# Batch size for translations
batch_size = 100

# Translating the 'Title' column to Armenian using MyMemoryTranslator
titles_to_translate = data_cleaned['Title'].tolist()
num_titles = len(titles_to_translate)
start_index = 0
while start_index < num_titles:
    end_index = min(start_index + batch_size, num_titles)
    batch_titles = titles_to_translate[start_index:end_index]
    translations, success = translate_to_armenian_batch(batch_titles, max_retries=2, batch_size=batch_size)
    if success:
        for i, translation in enumerate(translations):
            data_cleaned.at[start_index + i, 'Title'] = translation
        success_count += len(translations)
    print(f"Translated {success_count} out of {num_titles} titles successfully.")
    print(f"Waiting for 1 second before next translation...")
    time.sleep(1)
    start_index = end_index

# Saving the translated data to a new Excel file
output_excel_path = 'translated-romance-books-new.xlsx.xlsx'
data_cleaned.to_excel(output_excel_path, index=False)

print("All translations completed.")

Translated 100 out of 644 titles successfully.
Waiting for 1 second before next translation...
Translated 200 out of 644 titles successfully.
Waiting for 1 second before next translation...
Translated 300 out of 644 titles successfully.
Waiting for 1 second before next translation...
Translated 400 out of 644 titles successfully.
Waiting for 1 second before next translation...
Translated 500 out of 644 titles successfully.
Waiting for 1 second before next translation...
Translated 600 out of 644 titles successfully.
Waiting for 1 second before next translation...
Translated 644 out of 644 titles successfully.
Waiting for 1 second before next translation...
All translations completed.


### Goodreads horror books translation

In [None]:
# Loading the Excel file and convert specific columns to string
data_cleaned = pd.read_excel(excel_path_horror, dtype={'Title': str, 'Author': str, 'Description': str})

# Removing content within brackets in the 'Title' column
data_cleaned['Title'] = data_cleaned['Title'].str.replace(r'\([^()]*\)', '', regex=True)

# Translating the 'Title', 'Author', 'Description' columns to Armenian
data_cleaned['Title'] = data_cleaned['Title'].apply(lambda x: translate_to_armenian(x, max_retries=2))
data_cleaned['Author'] = data_cleaned['Author'].apply(lambda x: translate_to_armenian(x, max_retries=2))
data_cleaned['Description'] = data_cleaned['Description'].apply(lambda x: translate_to_armenian(x, max_retries=2))

# Saving the translated data to a new Excel file
output_excel_path = 'translated-horror-books.xlsx'
data_cleaned.to_excel(output_excel_path, index=False)

### Goodreads fantasy books translation in two steps

In [None]:
# Loading the Excel file and convert specific columns to string
excel_path = 'translated-fantasy-books.xlsx'
data_cleaned = pd.read_excel(excel_path, dtype={'Title': str, 'Author': str, 'Description': str})

# Preprocessing and removing numbers and symbols from the 'Title' column
data_cleaned['Title'] = data_cleaned['Title'].apply(lambda x: preprocess_text_new(x))

# Tracking the count of successful translations
success_count = 0

# Batch size for translations
batch_size = 100

# Translating the 'Title' column to Armenian using MyMemoryTranslator
titles_to_translate = data_cleaned['Title'].tolist()
num_titles = len(titles_to_translate)
start_index = 0
while start_index < num_titles:
    end_index = min(start_index + batch_size, num_titles)
    batch_titles = titles_to_translate[start_index:end_index]
    translations, success = translate_to_armenian_batch(batch_titles, max_retries=2, batch_size=batch_size)
    if success:
        for i, translation in enumerate(translations):
            data_cleaned.at[start_index + i, 'Title'] = translation
        success_count += len(translations)
    print(f"Translated {success_count} out of {num_titles} titles successfully.")
    print(f"Waiting for 1 second before next translation...")
    time.sleep(1)
    start_index = end_index

# Saving the translated data to a new Excel file
output_excel_path = 'translated-fantasy-books-new.xlsx'
data_cleaned.to_excel(output_excel_path, index=False)

print("All translations completed.")

## Cosine Similarity

#### With 0.9 threshold

In [None]:
# Loading the data from the Excel file
excel_path = 'goodreads_data_with_additional_info.xlsx'
data = pd.read_excel(excel_path)

# Function to preprocess and clean text using SpaCy
nlp = spacy.load("en_core_web_md")

def clean_text_with_spacy(text):
    doc = nlp(text.lower())
    return " ".join([token.lemma_ for token in doc if not token.is_stop and token.is_alpha])

# List of popular book genres
popular_genres = ['Mystery', 'Thriller', 'Romance', 'Science Fiction', 'Fantasy', 'Historical Fiction',
                  'Horror', 'Crime', 'Biography', 'Memoir', 'Self-Help', 'Young Adult', "Children's",
                  'Adventure', 'Contemporary', 'Dystopian', 'Suspense', 'Poetry', 'Comedy', 'Drama', 'Action',
                  'Paranormal', 'Urban Fantasy', 'Science Fantasy', 'Humor', 'Travel', 'Philosophy', 'Classic']

# Preprocess and clean popular genres
cleaned_popular_genres = [clean_text_with_spacy(genre) for genre in popular_genres]

# Creating an empty list to store top labels
top_labels = []

# Iterating over each row in the DataFrame
for index, row in data.iterrows():
    # Extracting labels from the 'Labels' column
    labels = ast.literal_eval(row['Labels'])

    # Removing empty strings from the list
    labels = [label for label in labels if label]

    # Preprocessing and cleaning labels
    cleaned_labels = [clean_text_with_spacy(label) for label in labels]

    # Combining both lists for vectorization
    combined_list = cleaned_labels + cleaned_popular_genres

    # Calculating word vectors for combined list
    vectors = [nlp(text).vector for text in combined_list]

    # Calculating cosine similarity
    cosine_sim = cosine_similarity(vectors)

    # Threshold for similarity
    threshold = 0.9

    # Filtering labels based on cosine similarity
    filtered_indices = [i for i, label in enumerate(labels) if any(cosine_sim[i, j] > threshold for j in range(len(cleaned_labels), len(combined_list)))]

    # Calculating maximum similarity score for each label with popular genres
    max_similarities = [max(cosine_sim[i, len(cleaned_labels):]) for i in filtered_indices]

    # Checking if any labels passed the similarity threshold
    if filtered_indices:
        # Getting the index of the label with the highest similarity score
        top_label_index = filtered_indices[max_similarities.index(max(max_similarities))]

        # Retrieving the top label
        top_label = labels[top_label_index]

        # Appending the top label to the list
        top_labels.append(top_label)
    else:
        # If no label met the similarity threshold, appending None
        top_labels.append(None)

    # Printing progress update
    print(f"Processed {index + 1} books out of {len(data)}")

# Adding the top labels to the DataFrame
data['Top Label'] = top_labels

# Saving the updated DataFrame to a new Excel file
data.to_excel('goodreads_dataset-final-version.xlsx', index=False)

Processed 1 books out of 882
Processed 2 books out of 882
Processed 3 books out of 882
Processed 4 books out of 882
Processed 5 books out of 882
Processed 6 books out of 882
Processed 7 books out of 882
Processed 8 books out of 882
Processed 9 books out of 882
Processed 10 books out of 882
Processed 11 books out of 882
Processed 12 books out of 882
Processed 13 books out of 882
Processed 14 books out of 882
Processed 15 books out of 882
Processed 16 books out of 882
Processed 17 books out of 882
Processed 18 books out of 882
Processed 19 books out of 882
Processed 20 books out of 882
Processed 21 books out of 882
Processed 22 books out of 882
Processed 23 books out of 882
Processed 24 books out of 882
Processed 25 books out of 882
Processed 26 books out of 882
Processed 27 books out of 882
Processed 28 books out of 882
Processed 29 books out of 882
Processed 30 books out of 882
Processed 31 books out of 882
Processed 32 books out of 882
Processed 33 books out of 882
Processed 34 books 

#### Same with 0.75 threshold

In [None]:
# Loading the data from the Excel file
excel_path = 'goodreads_data_with_additional_info.xlsx'
data = pd.read_excel(excel_path)

# Function to preprocess and clean text using SpaCy
nlp = spacy.load("en_core_web_md")

def clean_text_with_spacy(text):
    doc = nlp(text.lower())
    return " ".join([token.lemma_ for token in doc if not token.is_stop and token.is_alpha])

# List of popular book genres
popular_genres = ['Mystery', 'Thriller', 'Romance', 'Science Fiction', 'Fantasy', 'Historical Fiction',
                  'Horror', 'Crime', 'Biography', 'Memoir', 'Self-Help', 'Young Adult', "Children's",
                  'Adventure', 'Contemporary', 'Dystopian', 'Suspense', 'Poetry', 'Comedy', 'Drama', 'Action',
                  'Paranormal', 'Urban Fantasy', 'Science Fantasy', 'Humor', 'Travel', 'Philosophy', 'Classic']

# Preprocessing and cleaning popular genres
cleaned_popular_genres = [clean_text_with_spacy(genre) for genre in popular_genres]

# Creating an empty list to store top labels
top_labels = []

# Iterating over each row in the DataFrame
for index, row in data.iterrows():
    # Extracting labels from the 'Labels' column
    labels = ast.literal_eval(row['Labels'])

    # Removing empty strings from the list
    labels = [label for label in labels if label]

    # Preprocessing and clean labels
    cleaned_labels = [clean_text_with_spacy(label) for label in labels]

    # Combining both lists for vectorization
    combined_list = cleaned_labels + cleaned_popular_genres

    # Calculating word vectors for combined list
    vectors = [nlp(text).vector for text in combined_list]

    # Calculating cosine similarity
    cosine_sim = cosine_similarity(vectors)

    # Threshold for similarity
    threshold = 0.75

    # Filtering labels based on cosine similarity
    filtered_indices = [i for i, label in enumerate(labels) if any(cosine_sim[i, j] > threshold for j in range(len(cleaned_labels), len(combined_list)))]

    # Calculating maximum similarity score for each label with popular genres
    max_similarities = [max(cosine_sim[i, len(cleaned_labels):]) for i in filtered_indices]

    # Checking if any labels passed the similarity threshold
    if filtered_indices:
        # Getting the index of the label with the highest similarity score
        top_label_index = filtered_indices[max_similarities.index(max(max_similarities))]

        # Retrieving the top label
        top_label = labels[top_label_index]

        # Appending the top label to the list
        top_labels.append(top_label)
    else:
        # If no label met the similarity threshold, append None
        top_labels.append(None)

    # Printing progress update
    print(f"Processed {index + 1} books out of {len(data)}")

# Adding the top labels to the DataFrame
data['Top Label'] = top_labels

# Saving the updated DataFrame to a new Excel file
data.to_excel('goodreads_dataset-final-version-lower-sim.xlsx', index=False)



Processed 1 books out of 882
Processed 2 books out of 882
Processed 3 books out of 882
Processed 4 books out of 882
Processed 5 books out of 882
Processed 6 books out of 882
Processed 7 books out of 882
Processed 8 books out of 882
Processed 9 books out of 882
Processed 10 books out of 882
Processed 11 books out of 882
Processed 12 books out of 882
Processed 13 books out of 882
Processed 14 books out of 882
Processed 15 books out of 882
Processed 16 books out of 882
Processed 17 books out of 882
Processed 18 books out of 882
Processed 19 books out of 882
Processed 20 books out of 882
Processed 21 books out of 882
Processed 22 books out of 882
Processed 23 books out of 882
Processed 24 books out of 882
Processed 25 books out of 882
Processed 26 books out of 882
Processed 27 books out of 882
Processed 28 books out of 882
Processed 29 books out of 882
Processed 30 books out of 882
Processed 31 books out of 882
Processed 32 books out of 882
Processed 33 books out of 882
Processed 34 books 