In [None]:
import bigframes.pandas as bf
import pandas as pd

# BigQuery DataFrames session options: dataset location and project.
bf.options.bigquery.location = "EU"
bf.options.bigquery.project = "book-project-479914"

# Columns kept in the local frame, in order, mapped to the dtype each is cast to.
COLUMN_DTYPES = {
    'title': object,
    'author': object,
    'review': float,
    'reviews_count': int,
    'published_year': int,
    'price_eur': float,
    'new_length': int,
    'awards': int,
    'bestseller': int,
    'book_series': int,
    'years_on_bestsellers_list': int,
    'classics': int,
}

# Read the source table through BigQuery DataFrames.
books_bf = bf.read_gbq("book-project-479914.trial_and_error.m_no_nulls")

# Build the local pandas frame with the columns above and cast each one.
# NOTE(review): this goes through the pd.DataFrame constructor rather than
# books_bf.to_pandas() -- confirm the columns are matched as intended.
df = pd.DataFrame(books_bf, columns=list(COLUMN_DTYPES)).astype(COLUMN_DTYPES)

In [None]:
## ATTEMPT 1 - basic model (logistic regression with default settings)

In [None]:
## imports used by this cell
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## target and features definition
# Seed for the train/test split so the reported scores are reproducible.
RANDOM_STATE = 42
# Numeric predictors: every column except 'title', 'author' and the target 'classics'.
FEATURE_COLUMNS = [
    'review', 'reviews_count', 'published_year', 'price_eur', 'new_length',
    'awards', 'bestseller', 'book_series', 'years_on_bestsellers_list',
]
X = df[FEATURE_COLUMNS]
# Index the target instead of df.pop(): pop deletes the column from df in place,
# which makes any later df.drop(columns=[..., 'classics']) raise KeyError and
# makes this cell fail when it is re-run.
y = df['classics']

## train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

## normalization
# Fit the scaler on the training rows only, then apply the same transform to test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[FEATURE_COLUMNS])
X_test_scaled = scaler.transform(X_test[FEATURE_COLUMNS])

## modelling and prediction
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

## model evaluation
# scikit-learn metrics take (y_true, y_pred); passing them the other way round
# swaps precision and recall. The single y_pred above is reused for all metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'accuracy score: {accuracy}')
print(f'precision score: {precision}')
print(f'recall score: {recall}')

In [None]:
## ATTEMPT 2 - logistic regression with balanced class weights

In [None]:
## imports used by this cell
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## target and features definition
# Seed for the train/test split so the reported scores are reproducible.
RANDOM_STATE = 42
# Numeric predictors: every column except 'title', 'author' and the target 'classics'.
FEATURE_COLUMNS = [
    'review', 'reviews_count', 'published_year', 'price_eur', 'new_length',
    'awards', 'bestseller', 'book_series', 'years_on_bestsellers_list',
]
X = df[FEATURE_COLUMNS]
# Index the target instead of df.pop(): pop deletes the column from df in place,
# which makes any later df.drop(columns=[..., 'classics']) raise KeyError and
# makes this cell fail when it is re-run.
y = df['classics']

## train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

## normalization
# Fit the scaler on the training rows only, then apply the same transform to test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[FEATURE_COLUMNS])
X_test_scaled = scaler.transform(X_test[FEATURE_COLUMNS])

## modelling and prediction
# class_weight='balanced' reweights classes inversely to their frequency in y_train.
model = LogisticRegression(class_weight='balanced')
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

## model evaluation
# scikit-learn metrics take (y_true, y_pred); passing them the other way round
# swaps precision and recall. The single y_pred above is reused for all metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'accuracy score: {accuracy}')
print(f'precision score: {precision}')
print(f'recall score: {recall}')

In [None]:
## ATTEMPT 3 - use a different model (LightGBM with balanced class weights) that handles imbalance better

In [None]:
## imports used by this cell
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## target and features definition
# Seed shared by the train/test split and the model so scores are reproducible.
RANDOM_STATE = 42
# Numeric predictors: every column except 'title', 'author' and the target 'classics'.
FEATURE_COLUMNS = [
    'review', 'reviews_count', 'published_year', 'price_eur', 'new_length',
    'awards', 'bestseller', 'book_series', 'years_on_bestsellers_list',
]
X = df[FEATURE_COLUMNS]
# Index the target instead of df.pop(): pop deletes the column from df in place,
# which makes any later df.drop(columns=[..., 'classics']) raise KeyError and
# makes this cell fail when it is re-run.
y = df['classics']

## train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

## normalization
# Fit the scaler on the training rows only, then apply the same transform to test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[FEATURE_COLUMNS])
X_test_scaled = scaler.transform(X_test[FEATURE_COLUMNS])

## modelling and prediction
# random_state is pinned so repeated runs of this cell give the same model.
model = LGBMClassifier(class_weight='balanced', random_state=RANDOM_STATE)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

## model evaluation
# scikit-learn metrics take (y_true, y_pred); passing them the other way round
# swaps precision and recall. The single y_pred above is reused for all metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'accuracy score: {accuracy}')
print(f'precision score: {precision}')
print(f'recall score: {recall}')

In [None]:
## ATTEMPT 4 - use Random Forest to handle imbalance

In [None]:
## imports used by this cell
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## target and features definition
# Seed shared by the train/test split and the model so scores are reproducible.
RANDOM_STATE = 42
# Numeric predictors: every column except 'title', 'author' and the target 'classics'.
FEATURE_COLUMNS = [
    'review', 'reviews_count', 'published_year', 'price_eur', 'new_length',
    'awards', 'bestseller', 'book_series', 'years_on_bestsellers_list',
]
X = df[FEATURE_COLUMNS]
# Index the target instead of df.pop(): pop deletes the column from df in place,
# which makes any later df.drop(columns=[..., 'classics']) raise KeyError and
# makes this cell fail when it is re-run.
y = df['classics']

## train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

## normalization
# Fit the scaler on the training rows only, then apply the same transform to test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[FEATURE_COLUMNS])
X_test_scaled = scaler.transform(X_test[FEATURE_COLUMNS])

## modelling and prediction
# Without random_state the forest is rebuilt from a different bootstrap/feature
# sample on every run, so the printed scores would not be reproducible.
model = RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

## model evaluation
# scikit-learn metrics take (y_true, y_pred); passing them the other way round
# swaps precision and recall. The single y_pred above is reused for all metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'accuracy score: {accuracy}')
print(f'precision score: {precision}')
print(f'recall score: {recall}')