In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv("rental_info.csv")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15861 entries, 0 to 15860
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rental_date       15861 non-null  object 
 1   return_date       15861 non-null  object 
 2   amount            15861 non-null  float64
 3   release_year      15861 non-null  float64
 4   rental_rate       15861 non-null  float64
 5   length            15861 non-null  float64
 6   replacement_cost  15861 non-null  float64
 7   special_features  15861 non-null  object 
 8   NC-17             15861 non-null  int64  
 9   PG                15861 non-null  int64  
 10  PG-13             15861 non-null  int64  
 11  R                 15861 non-null  int64  
 12  amount_2          15861 non-null  float64
 13  length_2          15861 non-null  float64
 14  rental_rate_2     15861 non-null  float64
dtypes: float64(8), int64(4), object(3)
memory usage: 1.8+ MB
None


In [3]:
df["rental_date"] = pd.to_datetime(df["rental_date"])
df["return_date"] = pd.to_datetime(df["return_date"])
df["rent_length"] = (df["return_date"] - df["rental_date"]).dt.days
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15861 entries, 0 to 15860
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   rental_date       15861 non-null  datetime64[ns, UTC]
 1   return_date       15861 non-null  datetime64[ns, UTC]
 2   amount            15861 non-null  float64            
 3   release_year      15861 non-null  float64            
 4   rental_rate       15861 non-null  float64            
 5   length            15861 non-null  float64            
 6   replacement_cost  15861 non-null  float64            
 7   special_features  15861 non-null  object             
 8   NC-17             15861 non-null  int64              
 9   PG                15861 non-null  int64              
 10  PG-13             15861 non-null  int64              
 11  R                 15861 non-null  int64              
 12  amount_2          15861 non-null  float64            
 13  l

In [4]:
df["deleted_scenes"] = np.where(df["special_features"].str.contains("Deleted Scenes"), 1, 0)
df["behind_the_scenes"] = np.where(df["special_features"].str.contains("Behind the Scenes"), 1, 0)
df.head(1)

Unnamed: 0,rental_date,return_date,amount,release_year,rental_rate,length,replacement_cost,special_features,NC-17,PG,PG-13,R,amount_2,length_2,rental_rate_2,rent_length,deleted_scenes,behind_the_scenes
0,2005-05-25 02:54:33+00:00,2005-05-28 23:40:33+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401,3,0,1


In [5]:
cols_to_drop = ["special_features", "rent_length", "rental_date", "return_date"]
X = df.drop(cols_to_drop, axis = 1)
Y = df["rent_length"]

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 9)

In [7]:
lasso = Lasso(alpha=0.3, random_state=9)
lasso.fit(X_train, Y_train)
print(lasso.coef_)
X_lasso_train, X_lasso_test = X_train.iloc[:, lasso.coef_ > 0], X_test.iloc[:, lasso.coef_ > 0]

[ 5.84104424e-01  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  4.36220109e-02  3.01167812e-06 -1.52983561e-01
 -0.00000000e+00  0.00000000e+00]


In [8]:
ols = LinearRegression()
ols.fit(X_lasso_train, Y_train)
Y_pred_ols = ols.predict(X_lasso_test)
rmse_ols = MSE(Y_test,Y_pred_ols)
rmse_ols

4.812297241276236

In [9]:
rf = RandomForestRegressor()
print(rf.get_params())
param_dist = {'n_estimators': np.arange(1, 101, 1),
              'max_depth':np.arange(1, 11, 1)}
random_search = RandomizedSearchCV(rf, cv = 5, random_state=9, param_distributions=param_dist)
random_search.fit(X_train, Y_train)

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [10]:
rand_params = random_search.best_params_
rand_model = random_search.best_estimator_
print(rand_params)
print(rand_model)

{'n_estimators': np.int64(51), 'max_depth': np.int64(10)}
RandomForestRegressor(max_depth=np.int64(10), n_estimators=np.int64(51))


In [11]:
rand_model.fit(X_train, Y_train)
Y_pred_rand = rand_model.predict(X_test)
rmse_rand = MSE(Y_test,Y_pred_rand)
rmse_rand

2.2253869080571724

In [13]:
#hence best model comes from the RandomForestRegressor
best_model = rand_model
best_mse = rmse_rand

print(best_model)
print(best_mse)

RandomForestRegressor(max_depth=np.int64(10), n_estimators=np.int64(51))
2.2253869080571724
