# Preprocessing

In [1]:
import os
import bestreads.text as text
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the full Goodreads export, then carve out a reproducible 90/10
# train/test partition (the fixed seed keeps the sample stable across runs).
data = pd.read_csv('./data/goodreads_books.csv')
data_train = data.sample(frac=0.9, random_state=111)
# Every row that was not sampled into the training set becomes test data.
data_test = data.loc[~data.index.isin(data_train.index)]

## Train-Test Set Splits

In [2]:
# Directory that receives every processed artefact written by this notebook.
savedir = './data/processed/'
os.makedirs(savedir, exist_ok=True)

# Persist the raw splits. The row labels are deliberately not written
# (index=False), so the files get a fresh 0..n-1 index when read back.
data_train.to_csv(savedir + 'goodreads_books_train.csv', index = False)
data_test.to_csv(savedir + 'goodreads_books_test.csv', index = False)

# Renumber the in-memory frames 0..n-1 so their index matches the row
# positions of the CSVs written above. `drop=True` discards the old labels
# instead of inserting them as an extra `index` column, and reassigning
# (rather than `inplace=True`) keeps this cell safe to re-run: the previous
# form accumulated `index` / `level_0` columns and eventually raised.
data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

## Text Cleaning

In [3]:
# Add the `english_description` column via the project helper, then use it
# as a row selector so only the selected descriptions are cleaned.
data_train = text.add_english_column(data_train)
english_descriptions = data_train['description'][data_train['english_description']]
cleaned_descriptions = text.clean_text(english_descriptions)

## Grouping Descriptions by Genre

In [4]:
# Extract genre information for the same rows that were kept for cleaning,
# then group the cleaned descriptions under each book's `genre_1` value.
genre_and_votes = text.get_genres(
    data_train['genre_and_votes'][data_train['english_description']]
)
combined = text.combine_genres(genre_and_votes['genre_1'], cleaned_descriptions)

100%|████████████████████████████████████████████████| 39530/39530 [00:01<00:00, 30413.75it/s]


## Calculating TF-IDF

In [5]:
# Build the TF-IDF table from the genre-grouped descriptions.
tf_idf_table_train = text.tf_idf(combined)

# The index is written out as well, since each index label is a word.
tf_idf_table_train.to_csv(os.path.join(savedir, 'tf_idf_table_train.csv'))

# Preview the first rows as the cell output.
tf_idf_table_train.head()

100%|███████████████████████████████████████████████████████| 186/186 [00:55<00:00,  3.32it/s]


Unnamed: 0,Adult,Realistic Fiction,Dungeons and Dragons,Music,Philosophy,Anthropology,Family,Polyamorous,Historical,Parenting,...,Science Fiction,Central Africa,Military History,Currency,Law,Manga,History,Academic,Military,Audiobook
karma,0.004611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bitch,0.003954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
jgail,0.009057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ruin,0.00202,0.000227,0.0,0.000162,5.1e-05,0.0,0.001812,0.0,0.000316,0.0,...,0.000328,0.0,0.0,0.0,0.0,0.0,0.000115,0.0,0.0,0.0
5th,0.002182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.4e-05,0.0,...,3.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029276,0.0


In [6]:
# Both merges align on the row index and must be strictly one-to-one;
# `validate` makes pandas raise if either side has duplicate index labels.
index_merge_options = dict(left_index=True, right_index=True, validate='1:1')

# Start from the language flag and attach the cleaned text and the genre
# information row by row.
processed_data_train = (
    data_train[['english_description']]
    .merge(cleaned_descriptions, **index_merge_options)
    .merge(genre_and_votes, **index_merge_options)
)

# The index is kept in the output file: after the English-language
# selection some rows are missing, so the labels are no longer contiguous.
processed_data_train.to_csv(
    os.path.join(savedir, 'goodreads_books_train_processed.csv')
)