# Preprocessing

In [1]:
import os
import bestreads.text as text
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw Goodreads table from disk.
data = pd.read_csv('../data/goodreads_books.csv')

# Draw 90% of the rows for training; the fixed random_state makes the draw
# reproducible. Every row whose index label was not sampled goes to the test set.
data_train = data.sample(random_state=111, frac=0.9)
data_test = data.drop(index=data_train.index)

## Train-Test Set Splits

In [2]:
# Directory that receives every file this notebook writes.
savedir = '../data/processed/'
os.makedirs(savedir, exist_ok=True)

# Persist the raw splits. index=False drops the original row labels, so rows in
# these files are identified by position only.
data_train.to_csv(savedir + 'goodreads_books_train.csv', index=False)
data_test.to_csv(savedir + 'goodreads_books_test.csv', index=False)

# Renumber the rows 0..n-1 so index labels line up with row positions in the
# CSV files written above.
# drop=True and rebinding (rather than an in-place reset that keeps the old
# labels as a column) make this cell safe to re-run: the in-place form inserted
# an 'index' column, which leaked into the CSVs on a second run and raised on a
# third run once 'level_0' already existed.
data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

## Text Cleaning

In [3]:
# --- Training split ---
# text.is_english yields a per-row mask that is used below to select
# descriptions; it is named here so it becomes a labelled column when saved.
is_english_description_train = (
    text.is_english(data_train['description'])
    .rename('is_english_description')
)
# Only the rows selected by the mask are passed on to text.clean_text.
english_descriptions_train = data_train['description'][is_english_description_train]
cleaned_descriptions_train = (
    text.clean_text(english_descriptions_train)
    .rename('cleaned_descriptions')
)

# --- Test split (same steps as above) ---
is_english_description_test = (
    text.is_english(data_test['description'])
    .rename('is_english_description')
)
english_descriptions_test = data_test['description'][is_english_description_test]
cleaned_descriptions_test = (
    text.clean_text(english_descriptions_test)
    .rename('cleaned_descriptions')
)

## Grouping Descriptions by Genre

In [4]:
# Run text.get_genres on the genre_and_votes column, restricted to the rows
# selected by each split's English-description mask.
genre_and_votes_train = text.get_genres(
    data_train['genre_and_votes'][is_english_description_train]
)
genre_and_votes_test = text.get_genres(
    data_test['genre_and_votes'][is_english_description_test]
)

# Only the training split is combined: its genre_1 column is paired with the
# cleaned training descriptions.
combined = text.combine_genres(
    genre_and_votes_train['genre_1'],
    cleaned_descriptions_train,
)

100%|██████████| 39529/39529 [00:00<00:00, 42109.55it/s]
100%|██████████| 4408/4408 [00:00<00:00, 41277.34it/s]


## Calculating TF-IDF

In [10]:
# Build the TF-IDF table from the combined training descriptions.
tf_idf_table_train = text.tf_idf(combined)

# Each index entry is a word, so the index is written out too, under the
# column name 'word'.
tf_idf_table_train.to_csv(
    os.path.join(savedir, 'tf_idf_table_train.csv'),
    index_label='word',
)

# Preview the first rows of the table.
tf_idf_table_train.head()

100%|██████████| 186/186 [00:53<00:00,  3.49it/s]


Unnamed: 0,Social Issues,Textbooks,Philosophy,Suspense,World War II,Adult Fiction,Paranormal,Epic,Womens Fiction,Love,...,New York,Realistic Fiction,Linguistics,Teaching,Fan Fiction,Inspirational,Combat,Biography Memoir,Historical,Nurses
jimmi,0.029893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000211,0.0,...,0.0,0.000599,0.0,0.0,0.0,0.0,0.0,0.0,0.000105,0.0
reev,0.066318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hi,0.009979,0.001744,0.003034,0.004364,0.001421,0.003667,0.002613,0.003647,0.000906,0.000769,...,0.001648,0.002439,0.0,0.000898,0.002154,0.0,0.01031,0.000422,0.002976,0.001988
luck,0.02138,0.0,0.0,0.0,0.000666,0.000102,8.8e-05,0.0,0.000252,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000144,0.0
wildlif,0.032555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1e-05,0.0


In [6]:
def _assemble_processed(is_english, cleaned_descriptions, genre_and_votes):
    """Join one split's processed pieces on their shared row index.

    Parameters
    ----------
    is_english : pandas.Series
        Named per-row language mask; converted to a frame to start the join.
    cleaned_descriptions : pandas.Series
        Named series of cleaned descriptions.
    genre_and_votes : pandas.DataFrame
        Output of ``text.get_genres`` for the same split.

    Returns
    -------
    pandas.DataFrame
        Index-aligned combination of the three inputs. ``merge`` defaults to
        an inner join, so only index labels present in every input survive.

    Raises
    ------
    pandas.errors.MergeError
        If index labels are not unique on both sides of either merge
        (enforced by ``validate='1:1'``).
    """
    return (is_english.to_frame()
            .merge(cleaned_descriptions,
                   left_index=True, right_index=True,
                   validate='1:1')
            .merge(genre_and_votes,
                   left_index=True, right_index=True,
                   validate='1:1'))


# Save the training data
processed_data_train = _assemble_processed(is_english_description_train,
                                           cleaned_descriptions_train,
                                           genre_and_votes_train)

# Here, we save the index because some rows are now missing due to
# English-language selection
processed_data_train.to_csv(savedir + 'goodreads_books_train_processed.csv',
                            index_label='index')

# Save the test data
processed_data_test = _assemble_processed(is_english_description_test,
                                          cleaned_descriptions_test,
                                          genre_and_votes_test)

# Here, we save the index because some rows are now missing due to
# English-language selection
processed_data_test.to_csv(savedir + 'goodreads_books_test_processed.csv',
                           index_label='index')