In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('../data/clean_dataset.csv', index_col=0)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1065137 entries, 0 to 1065136
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Name          1065137 non-null  object 
 1   Authors       1065137 non-null  object 
 2   PublishYear   1065137 non-null  int64  
 3   Publisher     1065137 non-null  object 
 4   Rating        1065137 non-null  float64
 5   PagesNumber   1065137 non-null  float64
 6   TotalReviews  1065137 non-null  int64  
 7   Description   1065137 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 73.1+ MB


## Define y_target and split train/test dataset

In [17]:
REVIEWS_THRESHOLD = 500
print(f'Proportion of popular books in the dataset: {round((df.TotalReviews > REVIEWS_THRESHOLD).sum() / df.index.size, 2)}')
(df.TotalReviews > REVIEWS_THRESHOLD).sum()

Proportion of popular books in the dataset: 0.07


75122

In [18]:
input_columns = [column for column in df.columns if column != 'TotalReviews']
X, y = df[input_columns], df.TotalReviews > REVIEWS_THRESHOLD
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [6]:
def train_evaluate_model(model, X_train=None, y_train=None, X_test=None, y_test=None):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = {
        'r2_score' : r2_score(y_test, y_pred),
        'mean_squared_error': mean_squared_error(y_test, y_pred)
    }
    return report

## Model training and evaluation
### DummyModel

In [7]:
dummy_regressor = DummyRegressor(strategy='median')
train_evaluate_model(dummy_regressor,
                     X_train=X_train,
                     y_train=y_train,
                     X_test=X_test,
                     y_test=y_test)

{'r2_score': -0.002780119422400862, 'mean_squared_error': 3433842671.061521}

### Ordinary Least Squares

In [8]:
lm = linear_model.LinearRegression()
train_evaluate_model(lm,
                     X_train=X_train[['PublishYear', 'PagesNumber']],
                     y_train=y_train,
                     X_test=X_test[['PublishYear', 'PagesNumber']],
                     y_test=y_test)

{'r2_score': 4.310562601905765e-05, 'mean_squared_error': 3424175037.595814}

In [9]:
lm.coef_

array([-6.47391167e+01,  4.96558144e-03])

## CountVectorizer

In [33]:
vectorizer = CountVectorizer(stop_words={'english'},
                             max_df=0.05,
                             min_df=50)

In [None]:
X_train['Des']

In [34]:
vectorizer.fit(X_train['Description'])

In [35]:
len(vectorizer.vocabulary_)

47422

In [36]:
X_train_vec = vectorizer.transform(X_train['Description'])

In [37]:
X_test_vec = vectorizer.transform(X_test['Description'])

In [38]:
X_train_vec

<1085866x47422 sparse matrix of type '<class 'numpy.int64'>'
	with 54669218 stored elements in Compressed Sparse Row format>

In [39]:
lm = linear_model.LinearRegression()
train_evaluate_model(lm,
    X_train=X_train_vec,
    y_train=y_train,
    X_test=X_test_vec,
    y_test=y_test)

{'r2_score': 0.18742755105302322, 'mean_squared_error': 2782510237.7680407}

In [53]:
title = "aklsjdaklsjsdlkasjd"

In [54]:
x_vec = vectorizer.transform([title])

In [55]:
lm.predict(x_vec)

array([2465.3786121])