## Imports

In [1]:
import pandas as pd
import numpy as np
import joblib
import os
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import randint, uniform
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

## Variables

In [2]:
# Columns stripped from the training frame before fitting.
# `_orig_order` is still used as a sort key when the data is loaded, and is
# only dropped afterwards.
cols_to_drop = [
    'Unnamed: 0',
    'averageRating',
    'numVotes',
    '_orig_order',
]

### Retrieve data

In [3]:
# Read the semicolon-separated training set, order the rows by `startYear`
# (ties broken by `_orig_order`), then strip the columns in `cols_to_drop`.
trainingDatasetPath = os.path.join('.', 'data', 'training_dataset.csv')
df = (
    pd.read_csv(trainingDatasetPath, sep=";")
    .sort_values(by=['startYear', '_orig_order'])
    .drop(columns=cols_to_drop)
)

# Feature matrix: every remaining column except the target, as a NumPy array.
X = df.drop(columns=["movie_score"]).values
# Target vector: expm1 applied to the stored score.
# NOTE(review): presumably `movie_score` is stored as log1p(score) and this
# undoes that transform - confirm against the dataset-building step.
y = np.expm1(df["movie_score"].values)

### Tune the Lasso hyperparameters and save the best model

In [None]:
# Tune Lasso's regularisation strength `alpha` with an exhaustive grid search,
# scored by R^2 and run on all available cores (n_jobs=-1).
# max_iter=1000 equals scikit-learn's default; it is only stated explicitly here.
lasso = Lasso(max_iter=1000)
param_grid_lasso = {
    # 19 log-spaced candidates from 1e-6 to 1e3 (half-decade steps).
    'alpha': np.logspace(-6, 3, 19)
}
# The rows of X/y were sorted by `startYear` when the data was loaded, so use
# forward-chaining splits: each fold trains on earlier rows and validates on
# later ones. A plain `cv=5` would use unshuffled KFold, which fits on later
# years to score earlier ones and leaks future information into the score.
time_series_cv = TimeSeriesSplit(n_splits=5)
lasso_grid_search = GridSearchCV(
    lasso,
    param_grid_lasso,
    cv=time_series_cv,
    scoring='r2',
    n_jobs=-1,
)
lasso_grid_search.fit(X, y)



In [None]:
# Report, one per line: the winning estimator, its best cross-validation
# score (R^2, per the `scoring` used in the search), and the chosen parameters.
print(
    lasso_grid_search.best_estimator_,
    lasso_grid_search.best_score_,
    lasso_grid_search.best_params_,
    sep="\n",
)



In [None]:
# Persist the best estimator found by the grid search; the relative filename
# means the file is written to the current working directory.
joblib.dump(
    value=lasso_grid_search.best_estimator_,
    filename="Linear_Regression_Lasso.joblib",
)