# Pre-processing and Training Data Development

In [28]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
from category_encoders import TargetEncoder

In [2]:
path = r"C:\Users\adame\OneDrive\Documents\GitHub\springboard_repository\Capstone_books\Capstone_books\data\processed\cleaned_books.csv"
df = pd.read_csv(path)

In [3]:
print(df.shape)
df.head()

(11123, 15)


Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,average_rating_mean,ratings_count_sum,text_reviews_count_sum
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,2006-09-16,Scholastic Inc.,4.57,2096903,27669
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,2004-09-01,Scholastic Inc.,4.49,2153167,29221
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,2003-11-01,Scholastic,4.42,2300296,34936
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,2004-05-01,Scholastic Inc.,4.56,2342726,36465
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,2004-09-13,Scholastic,4.78,41428,164


### Encode categorical columns

In [5]:
le = LabelEncoder()
enc = OneHotEncoder(sparse=False, sparse_output=False, drop='first')
cat_enc = TargetEncoder()

In [6]:
# encode book title column
df['title_encoded'] = le.fit_transform(df['title'])

In [7]:
# encode author column
df['authors_encoded'] = le.fit_transform(df['authors'])

In [8]:
# one hot encode `language_code`
language_code_encoded = enc.fit_transform(df[['language_code']])

language_code_encoded_df = pd.DataFrame(language_code_encoded, columns=enc.get_feature_names_out(['language_code']))
df = pd.concat([df, language_code_encoded_df], axis=1)



In [9]:
# Target encode `publisher` column
df['publisher_encoded'] = cat_enc.fit_transform(df['publisher'], df['average_rating'])

### Split data into train and test datasets

In [22]:
X = df.drop(columns=['title', 'authors', 'language_code', 'average_rating', 'average_rating_mean', 'bookID', 'isbn13', 'isbn', 'publication_date', 'publisher'])
y = df.average_rating

In [25]:
X.shape

(11123, 34)

In [26]:
y.shape

(11123,)

In [27]:
# split dataframe into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Scale the data

In [29]:
# create a scaler object, fit scaler to data and then transform the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)