# Preprocessing

In [1]:
import os
import bestreads.text as text
import pandas as pd

# Load the full Goodreads book table.
data = pd.read_csv('./data/goodreads_books.csv')

# Hold out 20% of all rows as the test set (seeded so the split is repeatable);
# everything else is the pool from which train and validation are drawn.
data_test = data.sample(frac=0.2, random_state=111)
data_train_val = data.drop(index=data_test.index)

# Take 20% of the remaining pool as the validation set; what is left over
# becomes the training set.
data_val = data_train_val.sample(frac=0.2, random_state=112)
data_train = data_train_val.drop(index=data_val.index)

## Saving the Train, Validation, and Test Splits

In [2]:
# Directory that receives every file written by this notebook.
save_dir = './data/processed/'
os.makedirs(save_dir, exist_ok=True)

# Write the three raw splits without their row labels (index=False).
data_train.to_csv(save_dir + 'goodreads_books_train.csv', index=False)
data_val.to_csv(save_dir + 'goodreads_books_val.csv', index=False)
data_test.to_csv(save_dir + 'goodreads_books_test.csv', index=False)

# Renumber the rows of each split 0..n-1 so that a row's label equals its
# position in the CSV written above.
# drop=True discards the old labels instead of inserting them as a new
# 'index' column, and the result is reassigned rather than mutated in place.
# Together this keeps the cell safe to re-run: with the in-place, non-dropping
# form a second execution would write the extra 'index' column into the CSVs.
data_train = data_train.reset_index(drop=True)
data_val = data_val.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

## Text Cleaning

In [3]:
def _clean_english_descriptions(frame):
    """Flag the English descriptions of one data split and clean them.

    Parameters
    ----------
    frame : pandas.DataFrame
        A data split that has a ``description`` column.

    Returns
    -------
    tuple
        ``(is_english, english_descriptions, cleaned)`` where

        * ``is_english`` is the result of ``text.is_english`` on the
          description column, renamed to ``'is_english_description'``;
        * ``english_descriptions`` are the descriptions selected by that
          result;
        * ``cleaned`` is the result of ``text.clean_text`` on the selected
          descriptions, renamed to ``'cleaned_descriptions'``.
    """
    is_english = text.is_english(frame['description'])
    english_descriptions = frame.description[is_english]
    cleaned = text.clean_text(english_descriptions)
    # The new names become the column headers when these Series are merged
    # into a frame and written to disk.
    return (is_english.rename('is_english_description'),
            english_descriptions,
            cleaned.rename('cleaned_descriptions'))


# Apply the identical cleaning steps to every split.
(is_english_description_train,
 english_descriptions_train,
 cleaned_descriptions_train) = _clean_english_descriptions(data_train)

(is_english_description_val,
 english_descriptions_val,
 cleaned_descriptions_val) = _clean_english_descriptions(data_val)

(is_english_description_test,
 english_descriptions_test,
 cleaned_descriptions_test) = _clean_english_descriptions(data_test)

## Grouping Descriptions by Genre

In [4]:
# Keep only the genre/vote entries of the rows flagged as having an English
# description, one selection per split.
english_genres_train = data_train['genre_and_votes'][is_english_description_train]
english_genres_val = data_val['genre_and_votes'][is_english_description_val]
english_genres_test = data_test['genre_and_votes'][is_english_description_test]

# Run the genre extraction on each selection (train, then val, then test).
genre_and_votes_train = text.get_genres(english_genres_train)
genre_and_votes_val = text.get_genres(english_genres_val)
genre_and_votes_test = text.get_genres(english_genres_test)

# Combine the training descriptions using the `genre_1` column of the
# training genres. NOTE(review): presumably this groups descriptions by
# their first genre -- confirm against text.combine_genres.
combined = text.combine_genres(
    genre_and_votes_train.genre_1,
    cleaned_descriptions_train,
)

100%|██████████████████████████████████| 28154/28154 [00:01<00:00, 18072.01it/s]
100%|████████████████████████████████████| 7005/7005 [00:00<00:00, 18206.23it/s]
100%|████████████████████████████████████| 8777/8777 [00:00<00:00, 18143.92it/s]


## Calculating TF-IDF

In [5]:
# Build the TF-IDF table from the combined training descriptions.
tf_idf_table_train = text.tf_idf(combined)

# Each row label of the table is a word, so the index is written out too,
# under the column header 'word'.
tf_idf_path = save_dir + 'tf_idf_table_train.csv'
tf_idf_table_train.to_csv(tf_idf_path, index_label='word')

# Show the first rows as the cell output.
tf_idf_table_train.head()

100%|███████████████████████████████████████████| 52/52 [00:11<00:00,  4.53it/s]


Unnamed: 0,Speculative Fiction,Historical,Writing,Reference,War,Shapeshifters,Race,Plays,Horror,Holiday,...,Politics,Poetry,LGBT,Literature,Paranormal,Science Fiction,Romance,Fantasy,Space,Literary Fiction
thi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
futur,0.000277,4.5e-05,2.9e-05,5e-06,3.8e-05,5.2e-05,1.7e-05,4.3e-05,1.4e-05,0.0,...,5.4e-05,1e-05,2.8e-05,1.4e-05,6e-06,0.000125,6.3e-05,6.5e-05,0.000108,2.2e-05
peopl,6.4e-05,4e-05,3e-05,2e-05,2.9e-05,1.8e-05,5.9e-05,2.2e-05,4.6e-05,0.0,...,4e-05,8e-06,2.3e-05,2.7e-05,1.2e-05,3.4e-05,2.2e-05,2.6e-05,2.8e-05,2.7e-05
need,5.9e-05,3.7e-05,4.4e-05,0.000131,7.5e-05,0.000195,1.2e-05,2.6e-05,8.5e-05,0.0,...,7.3e-05,2.3e-05,0.000131,1.7e-05,7.9e-05,8e-05,0.000156,9.7e-05,4.7e-05,4e-05
sleep,0.000193,3.5e-05,3.6e-05,0.000123,5e-05,5.3e-05,0.0,2.1e-05,0.000124,0.000222,...,0.0,1.7e-05,0.0,2.4e-05,5.4e-05,2.3e-05,4e-05,0.0,8.5e-05,0.0


In [6]:
# Save the training data
processed_data_train = (is_english_description_train.to_frame()
                        .merge(cleaned_descriptions_train,
                               left_index=True, right_index=True,
                               validate='1:1')
                        .merge(genre_and_votes_train,
                               left_index=True, right_index=True,
                               validate='1:1'))

# Here, we save the index because some rows are now missing due to
# english language selection
processed_data_train.to_csv(save_dir + 'goodreads_books_train_processed.csv', 
                            index_label='index')

# Save the validation
processed_data_val = (is_english_description_val.to_frame()
                        .merge(cleaned_descriptions_val,
                               left_index=True, right_index=True,
                               validate='1:1')
                        .merge(genre_and_votes_val,
                               left_index=True, right_index=True,
                               validate='1:1'))

# Here, we save the index because some rows are now missing due to
# english language selection
processed_data_val.to_csv(save_dir + 'goodreads_books_val_processed.csv', 
                          index_label='index')

# Save the test data
processed_data_test = (is_english_description_test.to_frame()
                        .merge(cleaned_descriptions_test,
                               left_index=True, right_index=True,
                               validate='1:1')
                        .merge(genre_and_votes_test,
                               left_index=True, right_index=True,
                               validate='1:1'))

# Here, we save the index because some rows are now missing due to
# english language selection
processed_data_test.to_csv(save_dir + 'goodreads_books_test_processed.csv', 
                           index_label='index')