In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn import linear_model

In [7]:
# Load the cleaned dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    '../data/clean_dataset.csv',
    index_col=0,
)

In [8]:
# Summarise the frame: entry count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1143017 entries, 0 to 34758
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Name          1143017 non-null  object 
 1   Authors       1143017 non-null  object 
 2   Rating        1143017 non-null  float64
 3   PublishYear   1143017 non-null  int64  
 4   Publisher     1143017 non-null  object 
 5   PagesNumber   1143017 non-null  int64  
 6   Description   1143017 non-null  object 
 7   TotalReviews  1143017 non-null  int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 78.5+ MB


## Define the target (`y`) and split the dataset into train/test sets

In [12]:
# Every column except the target ('TotalReviews') is kept as a candidate input.
input_columns = [name for name in df.columns if name != 'TotalReviews']
X = df[input_columns]
y = df['TotalReviews']

# Hold out 5% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

In [46]:
def train_evaluate_model(model, X_train=None, y_train=None, X_test=None, y_test=None):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller can inspect it afterwards.
    X_train, y_train
        Training inputs and target, passed to ``model.fit``.
    X_test, y_test
        Held-out inputs passed to ``model.predict`` and the true target the
        predictions are scored against.

    Returns
    -------
    dict
        ``{'r2_score': ..., 'mean_squared_error': ...}`` computed on the
        test split.

    Raises
    ------
    ValueError
        If any of the four data arguments is left as ``None``.
    """
    # The data arguments default to None only so they can be passed by keyword;
    # all four are required. Fail early with a clear message instead of letting
    # the estimator or metric raise an unrelated error further down.
    data_args = {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
    }
    missing = [arg_name for arg_name, value in data_args.items() if value is None]
    if missing:
        raise ValueError(
            'train_evaluate_model requires ' + ', '.join(missing)
        )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = {
        'r2_score': r2_score(y_test, y_pred),
        'mean_squared_error': mean_squared_error(y_test, y_pred),
    }
    return report

## Model training and evaluation
### Dummy model (median baseline)

In [48]:
# Baseline to beat: a regressor configured with the 'median' strategy,
# evaluated on the same train/test split as the real models.
dummy_regressor = DummyRegressor(strategy='median')
train_evaluate_model(
    dummy_regressor,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)

array([[5.]])

### Ordinary Least Squares

In [49]:
# Ordinary least squares on PublishYear and PagesNumber only; the same column
# selection is applied to both the training and the test inputs.
ols_feature_columns = ['PublishYear', 'PagesNumber']
lm = linear_model.LinearRegression()
train_evaluate_model(
    lm,
    X_train=X_train[ols_feature_columns],
    y_train=y_train,
    X_test=X_test[ols_feature_columns],
    y_test=y_test,
)

{'r2_score': 4.310562601905765e-05, 'mean_squared_error': 3424175037.595814}

In [50]:
# Fitted OLS coefficients, one per input column in the order passed to fit
# (PublishYear, PagesNumber).
lm.coef_

array([-6.47391167e+01,  4.96558144e-03])