# Preprocessing

In [1]:
import os
import bestreads.text as text
import pandas as pd

# Load the raw Goodreads table and split it into a 90% training sample and
# a held-out remainder. The fixed seed makes the split reproducible.
TRAIN_FRACTION = 0.9
SPLIT_SEED = 111

data = pd.read_csv('./data/goodreads_books.csv')
data_train = data.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
# Every row that was not drawn into the training sample becomes test data.
data_test = data.drop(index=data_train.index)

## Train-Test Set Splits

In [2]:
# Directory that receives every file written by this notebook.
save_dir = './data/processed/'
os.makedirs(save_dir, exist_ok=True)

# Persist the raw splits. The index is omitted, so a row's identity in the
# CSV is simply its position in the file.
data_train.to_csv(save_dir + 'goodreads_books_train.csv', index=False)
data_test.to_csv(save_dir + 'goodreads_books_test.csv', index=False)

# Renumber the rows 0..n-1 so the in-memory index agrees with the row
# positions in the files written above.
# drop=True discards the old labels instead of inserting them as an 'index'
# column: with the previous in-place reset, re-running this cell wrote that
# stray column into the CSVs and eventually raised once 'level_0' existed
# too. Rebinding with drop=True makes the cell safe to re-run.
data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

## Text Cleaning

In [3]:
def _flag_and_clean_descriptions(frame):
    """Flag English descriptions in ``frame`` and clean the selected ones.

    Returns a tuple ``(mask, english, cleaned)``:
      * ``mask``    - result of ``text.is_english`` on the 'description'
                      column, renamed to 'is_english_description';
      * ``english`` - the descriptions selected by that mask;
      * ``cleaned`` - result of ``text.clean_text`` on ``english``,
                      renamed to 'cleaned_descriptions'.
    """
    mask = text.is_english(frame['description'])
    english = frame.description[mask]
    cleaned = text.clean_text(english)
    return (mask.rename('is_english_description'),
            english,
            cleaned.rename('cleaned_descriptions'))


# Same processing for both splits; training first, then test.
(is_english_description_train,
 english_descriptions_train,
 cleaned_descriptions_train) = _flag_and_clean_descriptions(data_train)

(is_english_description_test,
 english_descriptions_test,
 cleaned_descriptions_test) = _flag_and_clean_descriptions(data_test)

## Grouping Descriptions by Genre

In [4]:
# Restrict the raw genre/vote strings to rows flagged as English, then hand
# them to text.get_genres (presumably parses them into per-genre columns
# such as 'genre_1' - confirm against bestreads.text).
english_genres_train = data_train.genre_and_votes[is_english_description_train]
genre_and_votes_train = text.get_genres(english_genres_train)

english_genres_test = data_test.genre_and_votes[is_english_description_test]
genre_and_votes_test = text.get_genres(english_genres_test)

# Only the training split feeds the genre-level grouping used for TF-IDF.
genre_1_train = genre_and_votes_train.genre_1
combined = text.combine_genres(genre_1_train, cleaned_descriptions_train)

100%|████████████████████████████████████████████████| 39528/39528 [00:02<00:00, 16213.76it/s]
100%|██████████████████████████████████████████████████| 4409/4409 [00:00<00:00, 15330.36it/s]


## Calculating TF-IDF

In [5]:
# Compute the TF-IDF table from the genre-grouped training descriptions.
tf_idf_table_train = text.tf_idf(combined)

# The index of this table holds the words themselves, so it is written out
# as a column labelled 'word'.
tf_idf_path = save_dir + 'tf_idf_table_train.csv'
tf_idf_table_train.to_csv(tf_idf_path, index_label='word')

# Preview the first rows of the table.
tf_idf_table_train.head()

100%|█████████████████████████████████████████████████████████| 60/60 [00:14<00:00,  4.13it/s]


Unnamed: 0,Religion,Christian,Westerns,Historical,Animals,Spirituality,Psychology,Science Fiction,Plays,Realistic Fiction,...,Adult Fiction,Travel,Philosophy,Poetry,History,Adventure,Paranormal,Thriller,Mystery,Sequential Art
much,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
public,3.6e-05,1.8e-05,1.5e-05,1.3e-05,1.2e-05,1.4e-05,7e-05,1e-05,3.7e-05,5.1e-05,...,3.3e-05,2.5e-05,7.4e-05,3.3e-05,4.2e-05,1.4e-05,0.0,1.4e-05,1.2e-05,1.7e-05
debat,0.000323,0.0,0.0,1.5e-05,0.0,0.0,0.000258,1.2e-05,0.0,0.0,...,0.0,6.2e-05,0.000566,7.4e-05,0.000199,0.0,0.0,0.0,7e-06,6e-06
unit,8.4e-05,2.9e-05,0.0,0.000105,5.6e-05,0.0,0.0001,0.000148,7e-05,0.0,...,0.0,0.000194,2.7e-05,6.3e-05,0.000365,0.0001,1.6e-05,0.000233,6.4e-05,6.9e-05
state,9.8e-05,2.5e-05,0.0,6.7e-05,3.3e-05,1.9e-05,0.000206,8.9e-05,0.000103,0.0,...,0.0,0.000142,0.000145,6.4e-05,0.000316,0.000118,9e-06,0.000151,5.6e-05,4.7e-05


In [6]:
# Settings shared by every merge below: join on the index of both sides and
# let pandas verify that the join keys are unique on each side (1:1).
index_merge = dict(left_index=True, right_index=True, validate='1:1')

# --- Training data ---
# Start from the English flag, then attach the cleaned descriptions and the
# genre columns one merge at a time.
processed_data_train = is_english_description_train.to_frame()
processed_data_train = processed_data_train.merge(cleaned_descriptions_train,
                                                  **index_merge)
processed_data_train = processed_data_train.merge(genre_and_votes_train,
                                                  **index_merge)

# The index is written out (as column 'index') because the English-language
# selection removed some rows, so row position alone no longer identifies
# the source row.
train_processed_path = save_dir + 'goodreads_books_train_processed.csv'
processed_data_train.to_csv(train_processed_path, index_label='index')

# --- Test data ---
processed_data_test = is_english_description_test.to_frame()
processed_data_test = processed_data_test.merge(cleaned_descriptions_test,
                                                **index_merge)
processed_data_test = processed_data_test.merge(genre_and_votes_test,
                                                **index_merge)

# Same reasoning as for the training split: keep the index in the file.
test_processed_path = save_dir + 'goodreads_books_test_processed.csv'
processed_data_test.to_csv(test_processed_path, index_label='index')