# Preprocessing

In [1]:
import os
import bestreads.text as text
import pandas as pd

# Share of rows held out at each stage, and the seeds that make both draws
# reproducible.
HOLDOUT_FRACTION = 0.2
TEST_SEED = 111
VAL_SEED = 112

data = pd.read_csv('./data/goodreads_books.csv')

# Test set: a random 20% of the full table; the remaining rows form the
# training pool.
data_test = data.sample(frac=HOLDOUT_FRACTION, random_state=TEST_SEED)
data_train = data.drop(index=data_test.index)

# Validation set: a random 20% of the training pool, which is then removed
# from the pool so the three splits share no index labels.
data_val = data_train.sample(frac=HOLDOUT_FRACTION, random_state=VAL_SEED)
data_train = data_train.drop(index=data_val.index)

## Train-Validation-Test Set Splits

In [2]:
# Output directory for the files written by this notebook; created on demand.
save_dir = './data/processed/'
os.makedirs(save_dir, exist_ok=True)

# Write each raw split to disk without its index.
for split_frame, split_filename in (
    (data_train, 'goodreads_books_train.csv'),
    (data_val, 'goodreads_books_val.csv'),
    (data_test, 'goodreads_books_test.csv'),
):
    split_frame.to_csv(save_dir + split_filename, index=False)

# Renumber every split from 0. Because `drop` is not set, the previous index
# labels are kept as a regular column.
# NOTE(review): presumably this makes the in-memory row labels line up with
# the row positions in the CSVs written above -- confirm.
data_train = data_train.reset_index()
data_val = data_val.reset_index()
data_test = data_test.reset_index()

## Text Cleaning

In [3]:
def _select_and_clean_english(frame):
    """Flag the English descriptions of one split and clean the flagged ones.

    Parameters
    ----------
    frame : pandas.DataFrame
        A data split with a ``description`` column.

    Returns
    -------
    tuple
        ``(is_english, english_descriptions, cleaned_descriptions)``:

        * ``is_english`` -- result of ``text.is_english`` on the description
          column, renamed to ``'is_english_description'``;
        * ``english_descriptions`` -- the descriptions selected by that flag;
        * ``cleaned_descriptions`` -- result of ``text.clean_text`` on the
          selected descriptions, renamed to ``'cleaned_descriptions'``.
    """
    is_english = text.is_english(frame['description'])
    english_descriptions = frame.description[is_english]
    cleaned = text.clean_text(english_descriptions)
    # The new names become the column names when these are merged and saved.
    return (is_english.rename('is_english_description'),
            english_descriptions,
            cleaned.rename('cleaned_descriptions'))


# The same pipeline is applied to every split, in train / val / test order.
(is_english_description_train,
 english_descriptions_train,
 cleaned_descriptions_train) = _select_and_clean_english(data_train)

(is_english_description_val,
 english_descriptions_val,
 cleaned_descriptions_val) = _select_and_clean_english(data_val)

(is_english_description_test,
 english_descriptions_test,
 cleaned_descriptions_test) = _select_and_clean_english(data_test)

## Grouping Descriptions by Genre

In [4]:
# Keep the genre/vote strings only for rows whose description was flagged as
# English, so they stay aligned with the cleaned descriptions.
english_genres_train = data_train['genre_and_votes'][is_english_description_train]
english_genres_val = data_val['genre_and_votes'][is_english_description_val]
english_genres_test = data_test['genre_and_votes'][is_english_description_test]

# NOTE(review): the training call relies on the default `n` of
# `text.get_genres`, while validation and test pass n=20 explicitly --
# confirm this asymmetry is intended.
genre_and_votes_train = text.get_genres(english_genres_train)
genre_and_votes_val = text.get_genres(english_genres_val, n=20)
genre_and_votes_test = text.get_genres(english_genres_test, n=20)

# Combine the cleaned training descriptions by each book's `genre_1` value.
combined = text.combine_genres(
    genre_and_votes_train.genre_1,
    cleaned_descriptions_train,
)

100%|██████████████████████████████████| 28153/28153 [00:01<00:00, 19167.41it/s]
100%|████████████████████████████████████| 7003/7003 [00:00<00:00, 15899.66it/s]
100%|████████████████████████████████████| 8776/8776 [00:00<00:00, 16042.33it/s]


## Calculating TF-IDF

In [5]:
# TF-IDF is computed from the training-set descriptions grouped by genre.
tf_idf_table_train = text.tf_idf(combined)

# The index is written out (labelled 'word') because each index entry is a
# word.
tf_idf_path = save_dir + 'tf_idf_table_train.csv'
tf_idf_table_train.to_csv(tf_idf_path, index_label='word')

# Preview the first rows of the table.
tf_idf_table_train.head()

100%|███████████████████████████████████████████| 52/52 [00:10<00:00,  4.82it/s]


Unnamed: 0,Historical,War,Young Adult,Science Fiction Fantasy,Mystery,Spirituality,Science Fiction,Science,Thriller,Magical Realism,...,Womens,Novels,The United States Of America,Race,Childrens,Sociology,Humor,Writing,Religion,Womens Fiction
ivi,0.000351,1.4e-05,0.000163,0.000104,9.3e-05,0.0,3.2e-05,0.0,5.7e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3e-05
row,2.3e-05,1e-05,4.7e-05,0.0,2.6e-05,0.0,3.7e-05,1.5e-05,0.000106,0.0,...,0.0,0.0,0.0,0.000225,0.0,4.6e-05,0.0,0.0,0.0,1.8e-05
birth,0.000128,5.1e-05,3.6e-05,8.1e-05,3.6e-05,5.2e-05,5.9e-05,8.4e-05,4.6e-05,3.4e-05,...,0.000132,3.6e-05,0.0,0.0,0.0,4.9e-05,2.9e-05,4.6e-05,8.8e-05,4.8e-05
blue,3.6e-05,3e-05,8.3e-05,7.3e-05,6.3e-05,2.1e-05,3.6e-05,5.2e-05,4.9e-05,0.000103,...,0.0,5.6e-05,0.000161,0.000242,0.000566,4.2e-05,0.000101,3.9e-05,2.2e-05,8.9e-05
star,4.7e-05,3.5e-05,7.3e-05,5.6e-05,6.8e-05,2e-06,0.00012,3.4e-05,5.3e-05,2.9e-05,...,4.7e-05,7.4e-05,0.000206,1.5e-05,3.2e-05,1.5e-05,7.4e-05,4.7e-05,1.4e-05,9.3e-05


In [6]:
def _assemble_processed(is_english, cleaned_descriptions, genre_and_votes):
    """Join the per-split outputs into one frame, aligned on the index.

    Parameters
    ----------
    is_english : pandas.Series
        Named English-language flag for the split.
    cleaned_descriptions : pandas.Series
        Named cleaned descriptions for the split.
    genre_and_votes : pandas.DataFrame
        Output of ``text.get_genres`` for the split.

    Returns
    -------
    pandas.DataFrame
        The flag, the cleaned descriptions and the genre columns, merged on
        the index with pandas' default join type.

    Raises
    ------
    pandas.errors.MergeError
        If either merge is not one-to-one on the index (``validate='1:1'``).
    """
    return (is_english.to_frame()
            .merge(cleaned_descriptions,
                   left_index=True, right_index=True,
                   validate='1:1')
            .merge(genre_and_votes,
                   left_index=True, right_index=True,
                   validate='1:1'))


# For every split below, the index is saved because some rows are now missing
# due to english language selection.

# Save the training data
processed_data_train = _assemble_processed(is_english_description_train,
                                           cleaned_descriptions_train,
                                           genre_and_votes_train)
processed_data_train.to_csv(save_dir + 'goodreads_books_train_processed.csv',
                            index_label='index')

# Save the validation data
processed_data_val = _assemble_processed(is_english_description_val,
                                         cleaned_descriptions_val,
                                         genre_and_votes_val)
processed_data_val.to_csv(save_dir + 'goodreads_books_val_processed.csv',
                          index_label='index')

# Save the test data
processed_data_test = _assemble_processed(is_english_description_test,
                                          cleaned_descriptions_test,
                                          genre_and_votes_test)
processed_data_test.to_csv(save_dir + 'goodreads_books_test_processed.csv',
                           index_label='index')