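# Preprocessing

This notebook splits the Goodreads books data into a 90% training set and a 10% test set, keeps the training books whose descriptions are detected as English, cleans those descriptions, groups them by genre, and saves a per-genre TF-IDF table together with the processed training data.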

In [3]:
import os
import bestreads.text as text
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw Goodreads books table.
data = pd.read_csv('../data/goodreads_books.csv')

# Reproducible 90/10 split: sample 90% of the rows with a fixed seed for
# training; every row that was not sampled forms the test set.
data_train = data.sample(frac=0.9, random_state=111)
data_test = data.drop(index=data_train.index)

## Saving the Train-Test Set Splits

In [4]:
# Output directory for every processed artifact. The trailing slash is kept
# because other cells build file names with `savedir + '<name>.csv'`.
savedir = '../data/processed/'
os.makedirs(savedir, exist_ok=True)

# Persist both splits without the index, so reading a file back yields a
# fresh 0..n-1 index.
data_train.to_csv(os.path.join(savedir, 'goodreads_books_train.csv'), index=False)
data_test.to_csv(os.path.join(savedir, 'goodreads_books_test.csv'), index=False)

# Renumber the in-memory frames 0..n-1 so their row labels match what loading
# the saved CSVs would produce.
# `drop=True` discards the old labels instead of inserting them as an 'index'
# column, and rebinding (rather than `inplace=True`) keeps this cell
# idempotent: with the in-place form, re-running the cell wrote the stray
# 'index' column into the CSVs above and eventually raised a ValueError once
# both 'index' and 'level_0' already existed.
data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

## Text Cleaning

In [5]:
# Flag, per training row, whether text.is_english considers the description
# English (used below as a row mask).
is_english_description = text.is_english(data_train['description'])

# Select only the flagged descriptions and pass them through text.clean_text.
english_descriptions = data_train.loc[is_english_description, 'description']
cleaned_descriptions = text.clean_text(english_descriptions)

## Grouping Descriptions by Genre

In [6]:
# Extract genre information for the same English-description rows that were
# cleaned above, so both results share one set of row labels.
english_genre_votes = data_train.loc[is_english_description, 'genre_and_votes']
genre_and_votes = text.get_genres(english_genre_votes)

# Combine the cleaned descriptions according to each row's `genre_1` value.
# NOTE(review): `genre_1` is presumably the book's primary genre -- confirm
# against text.get_genres.
combined = text.combine_genres(genre_and_votes['genre_1'], cleaned_descriptions)

100%|██████████| 39528/39528 [00:00<00:00, 39783.17it/s]


## Calculating TF-IDF

In [7]:
# Compute the TF-IDF table from the genre-grouped training descriptions.
tf_idf_table_train = text.tf_idf(combined)

# The row index holds the words themselves, so it is written out with the
# table instead of being dropped.
tf_idf_path = savedir + 'tf_idf_table_train.csv'
tf_idf_table_train.to_csv(tf_idf_path)

# Preview the first rows (one row per word, one column per genre).
tf_idf_table_train.head()

100%|██████████| 186/186 [00:49<00:00,  3.77it/s]


Unnamed: 0,Religion,Horror,Womens,Computer Science,Football,Drama,Contemporary,Environment,Roman,History,...,Marriage,Politics,Family Law,Linguistics,Christianity,Comics,Productivity,Asian Literature,Adult Fiction,Currency
much,0.00066,0.000458,0.0,0.000255,0.000622,0.001743,0.001144,0.000498,0.0,0.000642,...,0.0,0.000721,0.0,0.0,0.002286,0.001339,0.0,0.0,0.000977,0.0
public,0.000501,0.000151,0.0,0.000653,0.000797,0.000248,0.000342,0.001278,0.0,0.000686,...,0.0,0.00148,0.0,0.0,0.000733,0.0,0.0,0.0,0.000327,0.0
debat,0.000414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000565,...,0.0,0.000922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unit,0.000323,0.000344,0.0,0.0,0.00199,0.0,0.000488,0.000399,0.0,0.001392,...,0.0,0.002366,0.0,0.0,0.000457,0.0,0.0,0.0,0.000204,0.0
state,0.000614,0.000271,0.0,0.0,0.003238,0.0,0.000497,0.001622,0.0,0.001628,...,0.0,0.003849,0.0,0.0,0.000372,0.0,0.0,0.0,0.000166,0.0


In [8]:
# Assemble the processed training table by joining, on the row index, the
# English flag, the cleaned description and the extracted genres.
# Each join is the default inner join, so only rows present in every piece
# survive, and `validate='1:1'` makes pandas raise if any index label is
# duplicated on either side.
index_join = dict(how='inner', left_index=True, right_index=True, validate='1:1')

english_flags = is_english_description.to_frame()
flags_and_descriptions = pd.merge(english_flags, cleaned_descriptions, **index_join)
processed_data_train = pd.merge(flags_and_descriptions, genre_and_votes, **index_join)

# The index is saved as well: rows dropped by the English-language selection
# leave gaps in it, so it is needed to tell which training rows remain.
processed_path = savedir + 'goodreads_books_train_processed.csv'
processed_data_train.to_csv(processed_path)