# Preprocessing

In [1]:
import os
import bestreads.text as text
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw Goodreads export. The index pandas assigns on read is what the
# split below uses to tell the two partitions apart.
data = pd.read_csv('./data/goodreads_books.csv')

# Reproducible 90/10 split: draw 90% of the rows with a fixed seed for
# training, then keep every row that was not drawn as the held-out test set.
data_train = data.sample(random_state=111, frac=0.9)
data_test = data.drop(index=data_train.index)

## Train-Test Set Splits

In [2]:
# Persist the raw train/test partitions so the exact same split can be
# reloaded later without re-sampling.
save_dir = './data/processed/'
os.makedirs(save_dir, exist_ok=True)
data_train.to_csv(save_dir + 'goodreads_books_train.csv', index = False)
data_test.to_csv(save_dir + 'goodreads_books_test.csv', index = False)

# Renumber rows 0..n-1 so that index labels equal the row positions of the
# CSVs written above (which were saved without an index column).
# The frames are rebound rather than mutated in place, and drop=True discards
# the old labels instead of inserting them as an 'index' column, so re-running
# this cell neither changes the saved CSV contents nor fails on a
# column-name collision.
data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

## Text Cleaning

In [3]:
def _select_and_clean_english(frame):
    """Flag English descriptions in `frame` and clean the flagged ones.

    Returns a 3-tuple:
      * the `text.is_english` result, renamed to 'is_english_description';
      * the entries of `frame['description']` selected by that flag;
      * the `text.clean_text` output for that selection, renamed to
        'cleaned_descriptions'.
    """
    english_flag = text.is_english(frame['description'])
    english_only = frame['description'][english_flag]
    cleaned = text.clean_text(english_only)
    return (english_flag.rename('is_english_description'),
            english_only,
            cleaned.rename('cleaned_descriptions'))


# Same pipeline for both partitions; the names bound here are the ones the
# later cells read.
(is_english_description_train,
 english_descriptions_train,
 cleaned_descriptions_train) = _select_and_clean_english(data_train)

(is_english_description_test,
 english_descriptions_test,
 cleaned_descriptions_test) = _select_and_clean_english(data_test)

## Grouping Descriptions by Genre

In [4]:
# Keep only the genre/vote entries of books whose description was flagged as
# English, then hand them to text.get_genres.
english_votes_train = data_train['genre_and_votes'][is_english_description_train]
genre_and_votes_train = text.get_genres(english_votes_train)

english_votes_test = data_test['genre_and_votes'][is_english_description_test]
genre_and_votes_test = text.get_genres(english_votes_test)

# Group the cleaned training descriptions by each book's `genre_1` value
# (presumably its top-voted genre -- confirm in bestreads.text).
first_genre_train = genre_and_votes_train.genre_1
combined = text.combine_genres(first_genre_train, cleaned_descriptions_train)

100%|██████████| 39524/39524 [00:00<00:00, 41267.51it/s]
100%|██████████| 4407/4407 [00:00<00:00, 38436.40it/s]


## Calculating TF-IDF

In [5]:
# Build the TF-IDF table from the genre-grouped training descriptions.
tf_idf_table_train = text.tf_idf(combined)

# The index holds the vocabulary (one row per word), so it is written out as a
# labelled 'word' column instead of being dropped.
tf_idf_csv_path = save_dir + 'tf_idf_table_train.csv'
tf_idf_table_train.to_csv(tf_idf_csv_path, index_label='word')

# Preview the first rows as the cell's output.
tf_idf_table_train.head()

100%|██████████| 186/186 [00:52<00:00,  3.52it/s]


Unnamed: 0,Historical,Textbooks,Alcohol,Pop Culture,Menage,History,Feminism,Marriage,Self Help,Academic,...,Humanities,Parenting,Fantasy,Cultural,Sports,Fairies,Harlequin,Buisness,Biography,African Literature
brilliant,0.000468,0.0,0.0,0.0,0.0,0.00069,0.000421,0.0,0.000194,0.0,...,0.0,0.0,0.000206,0.000451,0.000174,0.0,0.0,0.0,0.000654,0.0
render,0.000194,0.0,0.0,0.0,0.0,0.00017,0.0,0.0,0.0,0.0,...,0.0,0.0,5.6e-05,6.6e-05,9.4e-05,0.0,0.0,0.0,0.00023,0.0
scandal,0.000556,0.0,0.0,0.0,0.0,0.000228,0.000294,0.0,0.0,0.0,...,0.0,0.0,2.9e-05,0.000114,0.000565,0.0,0.0,0.0,0.000357,0.0
histor,0.001691,0.0,0.0,0.0,0.0,0.001918,0.00091,0.0,0.0,0.005156,...,0.0,0.000127,7.9e-05,0.000814,0.0002,0.0,0.0,0.0,0.000393,0.0
figur,0.000382,0.0,0.0,0.0,0.0,0.000806,0.000179,0.0,0.000276,0.0,...,0.0,0.0,0.000324,0.000419,0.000443,0.0,0.0,0.0,0.001645,0.0


In [6]:
def _merge_processed(is_english, cleaned, genres):
    """Index-align the three per-book results into a single frame.

    Both merges use pandas' default inner join on the index and are validated
    as one-to-one, so duplicate index labels on either side raise instead of
    silently multiplying rows.
    """
    on_index = dict(left_index=True, right_index=True, validate='1:1')
    return (is_english.to_frame()
            .merge(cleaned, **on_index)
            .merge(genres, **on_index))


# For both partitions the index is written out (as an 'index' column): rows
# dropped by the English-language selection are absent, so row position alone
# no longer identifies a book.

# Training data
processed_data_train = _merge_processed(is_english_description_train,
                                        cleaned_descriptions_train,
                                        genre_and_votes_train)
train_processed_path = save_dir + 'goodreads_books_train_processed.csv'
processed_data_train.to_csv(train_processed_path, index_label='index')

# Test data
processed_data_test = _merge_processed(is_english_description_test,
                                       cleaned_descriptions_test,
                                       genre_and_votes_test)
test_processed_path = save_dir + 'goodreads_books_test_processed.csv'
processed_data_test.to_csv(test_processed_path, index_label='index')