In [13]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import PoissonRegressor
from sklearn.svm import LinearSVR,NuSVR,SVR
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from random import uniform,randint
from sklearn.metrics import make_scorer
from scipy.stats import truncnorm

In [14]:
import importlib

# Names of the scikit-learn linear regressors to expose in the notebook
# namespace. 'ElasticNet' and 'Lars' used to be fused into the single entry
# 'ElasticNet,Lars', which only imported by accident through exec() and would
# have produced a misleading name in the error message.
linear_models = ['LinearRegression', 'Ridge', 'SGDRegressor', 'ElasticNet', 'Lars',
        'Lasso', 'LassoLars','OrthogonalMatchingPursuit','ARDRegression','BayesianRidge','HuberRegressor',
        'RANSACRegressor', 'TheilSenRegressor', 'PoissonRegressor','TweedieRegressor','GammaRegressor',
        'PassiveAggressiveRegressor']
models = []

# Resolve the module once and look each class up by attribute instead of
# exec()-ing a generated import statement per name.
try:
    _linear_model_module = importlib.import_module('sklearn.linear_model')
except ImportError:
    _linear_model_module = None

for model in linear_models:
    _estimator_class = getattr(_linear_model_module, model, None)
    if _estimator_class is None:
        # Either scikit-learn is unavailable or this version lacks the class.
        print(f"Error importing {model}")
    else:
        # Bind the class under its own name, as `from ... import name` would.
        globals()[model] = _estimator_class

In [15]:
# Load the dataset stored in 'finaldata.pkl' (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only load this pickle if it comes from a trusted source.
df = pd.read_pickle('finaldata.pkl')

In [16]:
# Encode boolean columns as integers with a fixed mapping (False -> 0,
# True -> 1). pd.factorize numbers values by order of first appearance, so the
# same boolean could end up as 0 in one column and 1 in another.
for col in df.columns:
    if df[col].dtype == 'bool':
        df[col] = df[col].astype(int)
df = df.reset_index(drop=True)
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns='listingInfo.buyItNowAvailable', errors='ignore')

In [17]:
# Renumber the rows 0..n-1 and discard the previous index.
# NOTE(review): the preceding cell already performs the same reset; this one
# looks redundant — confirm before removing.
df = df.reset_index(drop=True)

In [18]:
def create_training_and_test_data(df):
    """Split ``df`` into standardized train/test feature matrices and targets.

    The 'Price' column is the target; every other column is used as a feature.
    Rows are shuffled and split 80/20 with a fixed seed (42). The scaler is
    fitted on the training rows only and then applied to the test rows, so no
    statistics from the held-out data leak into training.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Price' column plus the feature columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as arrays.
    """
    Y_targets = np.array(df['Price'].values)
    X = np.array(df.drop(columns='Price').values)
    # Split first: the row assignment depends only on the row count and seed,
    # so it is the same partition as before the scaling was moved.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y_targets, test_size=0.2, random_state=42, shuffle=True)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    # Reuse the training mean/scale for the held-out rows.
    X_test = scaler.transform(X_test)
    return X_train,X_test,y_train,y_test

In [19]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences, in percent.

    Both inputs are converted to float arrays first, so plain lists are
    accepted and pandas objects are compared by position rather than by index
    label.

    Parameters
    ----------
    y_true : array-like
        Reference values; used as the denominator, so a zero entry yields an
        infinite or NaN result.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [20]:
# Build the scaled 80/20 train/test split from df ('Price' is the target).
X_train, X_test, y_train, y_test = create_training_and_test_data(df)

In [21]:
# Report the (rows, columns) shape of each feature matrix, train first.
print(X_train.shape, X_test.shape, sep='\n')

(14394, 81)
(3599, 81)


In [42]:
# Tuned hyperparameters for each estimator. Only values that differ from the
# scikit-learn defaults are listed. The previous dictionaries also pinned
# library defaults, several of which were later deprecated or removed
# (criterion='mse', max_features='auto', min_impurity_split, presort,
# normalize); spelling them out broke construction on newer releases without
# changing the fitted models.
best_forest_params = {'min_samples_split': 7, 'n_estimators': 709}
best_svr_params = {'C': 1.7429383440550505, 'degree': 4,
                   'epsilon': 0.7128563457476645, 'gamma': 'auto'}
best_Nusvr_params = {'C': 2.4196539131928545, 'degree': 2,
                     'nu': 0.7766835423023327}
# NOTE(review): newer scikit-learn releases appear to rename ARDRegression's
# 'n_iter' to 'max_iter' — confirm against the installed version before use.
best_arg_params = {'alpha_1': 4.3378736853599286e-05, 'alpha_2': 8.19823665091751e-06,
                   'lambda_1': 1.3636729392523562e-05,
                   'lambda_2': 9.760664648063037e-05, 'n_iter': 886}
best_huber_params = {'alpha': 6.1516451862359435e-06, 'epsilon': 2.076940479476908,
                     'max_iter': 272}
best_gbr_params = {'learning_rate': 0.19104322054197975, 'loss': 'huber',
                   'max_features': 52, 'min_samples_split': 3,
                   'n_estimators': 626}

In [43]:
# Candidate regressors, each built from its tuned hyperparameters, in the
# order they are evaluated: random forest, gradient boosting, SVR, NuSVR.
models = [
    estimator_class(**tuned_params)
    for estimator_class, tuned_params in (
        (RandomForestRegressor, best_forest_params),
        (GradientBoostingRegressor, best_gbr_params),
        (SVR, best_svr_params),
        (NuSVR, best_Nusvr_params),
    )
]

In [44]:
def run_models(models,X_train,y_train,X_test,y_test):
    """Fit every model and tabulate its train, cross-validation and test errors.

    Each model is fitted in place on ``(X_train, y_train)``. RMSE and MAPE are
    computed after squaring both the targets and the predictions.
    NOTE(review): presumably the target is stored as the square root of the
    price, which is why errors are reported on the squared scale — confirm.

    Parameters
    ----------
    models : sequence of estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : array-like
        Training features and targets.
    X_test, y_test : array-like
        Held-out features and targets.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'Model', 'Train MAPE', 'Train RMSE',
        'Train CV Mean', 'Train CV STD', 'Test MAPE', 'Test RMSE' and
        'Time Taken' (seconds for fit, predictions and cross-validation).

        'Train CV Mean' is the square root of the negated mean 10-fold
        neg-MSE score and 'Train CV STD' is the standard deviation of those
        raw fold scores. Both are computed on the un-squared targets, so they
        are not on the same scale as the other error columns.
    """
    columns = ['Model','Train MAPE','Train RMSE','Train CV Mean','Train CV STD',
               'Test MAPE','Test RMSE','Time Taken']
    y_test_squared = np.square(y_test)
    y_train_squared = np.square(y_train)

    total_time_start = time.time()
    # Collect one record per model and build the frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    records = []
    for index,model in enumerate(models):
        print(f'Now doing : {model} : This is model : {index + 1} out of {len(models)} ')
        start = time.time()
        current_model = model.fit(X_train,y_train)

        # Training-set errors on the squared scale.
        model_train_predictions_squared = np.square(current_model.predict(X_train))
        train_rmse = np.sqrt(mean_squared_error(y_train_squared,model_train_predictions_squared))

        # 10-fold cross-validation on the training data (un-squared targets).
        model_train_cv_score = cross_val_score(current_model, X_train, y_train,
                        scoring='neg_mean_squared_error',cv=10)
        train_scores_rmse = np.sqrt(-model_train_cv_score.mean())
        model_train_scores_std = model_train_cv_score.std()
        model_train_mape = mean_absolute_percentage_error(y_train_squared, model_train_predictions_squared)

        # Held-out errors on the squared scale.
        model_test_predictions_squared = np.square(current_model.predict(X_test))
        test_rmse = np.sqrt(mean_squared_error(y_test_squared, model_test_predictions_squared))
        model_test_mape = mean_absolute_percentage_error(y_test_squared, model_test_predictions_squared)

        time_taken = time.time() - start
        records.append(dict(zip(columns, [model,model_train_mape,train_rmse,train_scores_rmse,
                                          model_train_scores_std,model_test_mape,test_rmse,time_taken])))
    model_evaluation = pd.DataFrame(records, columns=columns)
    total_time_end = time.time()
    print(f'Time taken : {total_time_end - total_time_start}')
    return model_evaluation

In [45]:
# Fit and score every candidate model. The recorded run printed a total of
# about 2135 seconds, so expect this cell to be slow.
df_models = run_models(models,X_train,y_train,X_test,y_test)

Now doing : RandomForestRegressor(min_samples_split=7, n_estimators=709) : This is model : 1 out of 4 
Now doing : GradientBoostingRegressor(learning_rate=0.19104322054197975, loss='huber',
                          max_features=52, min_samples_split=3,
                          n_estimators=626) : This is model : 2 out of 4 
Now doing : SVR(C=1.7429383440550505, degree=4, epsilon=0.7128563457476645, gamma='auto') : This is model : 3 out of 4 
Now doing : NuSVR(C=2.4196539131928545, degree=2, nu=0.7766835423023327) : This is model : 4 out of 4 
Time taken : 2134.8998198509216


In [46]:
# Display the evaluation table returned by run_models (one row per model).
df_models

Unnamed: 0,Model,Train MAPE,Train RMSE,Train CV Mean,Train CV STD,Test MAPE,Test RMSE,Time Taken
0,"(DecisionTreeRegressor(max_features='auto', mi...",9.327739,13.686296,1.230752,0.108687,19.068479,25.296075,1021.853305
1,([DecisionTreeRegressor(criterion='friedman_ms...,16.465519,22.953973,1.216065,0.084793,18.706582,24.735779,299.280925
2,"SVR(C=1.7429383440550505, degree=4, epsilon=0....",19.555793,24.660098,1.361314,0.104074,23.2805,27.716683,192.863178
3,"NuSVR(C=2.4196539131928545, degree=2, nu=0.776...",16.67887,22.865564,1.343026,0.106029,22.471016,27.309376,620.843489


In [52]:
# Show the models ordered by ascending 'Test RMSE'; the result is only
# displayed, df_models itself is not reassigned.
df_models.sort_values(by='Test RMSE')

Unnamed: 0,Model,Train MAPE,Train RMSE,Train CV Mean,Train CV STD,Test MAPE,Test RMSE,Time Taken
1,([DecisionTreeRegressor(criterion='friedman_ms...,16.465519,22.953973,1.216065,0.084793,18.706582,24.735779,299.280925
0,"(DecisionTreeRegressor(max_features='auto', mi...",9.327739,13.686296,1.230752,0.108687,19.068479,25.296075,1021.853305
3,"NuSVR(C=2.4196539131928545, degree=2, nu=0.776...",16.67887,22.865564,1.343026,0.106029,22.471016,27.309376,620.843489
2,"SVR(C=1.7429383440550505, degree=4, epsilon=0....",19.555793,24.660098,1.361314,0.104074,23.2805,27.716683,192.863178


In [47]:
from sklearn.ensemble import VotingRegressor

In [48]:
# Combine fresh, unfitted copies of the random forest and gradient boosting
# models (built from the same tuned hyperparameters) in a VotingRegressor and
# fit the ensemble on the training split.
rfg = RandomForestRegressor(**best_forest_params)
gbr = GradientBoostingRegressor(**best_gbr_params)
Voting_model_2 = VotingRegressor([('RFR',rfg),('GBR',gbr)])
model2_fit = Voting_model_2.fit(X_train,y_train)

In [49]:
# Square the held-out targets and the ensemble's predictions so the metrics
# below are on the same squared scale that run_models reports.
y_test_squared = np.square(y_test)
predictions = model2_fit.predict(X_test)
predictions_squared = np.square(predictions)

In [50]:
# Test RMSE of the voting ensemble on the squared scale.
np.sqrt(mean_squared_error(y_test_squared,predictions_squared))

24.380719686374213

In [51]:
# Test MAPE (in percent) of the voting ensemble on the squared scale.
mean_absolute_percentage_error(y_test_squared,predictions_squared)

18.35047126747842