In [1]:
import importlib
import random
import time
from random import uniform,randint

import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
from scipy.stats import truncnorm
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer, median_absolute_error
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR,NuSVR,SVR
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor

In [2]:
# Names of the scikit-learn linear-model estimators to make available in the
# notebook namespace. Every entry is exactly one class name, so a failed import
# is reported for the specific class that is missing.
linear_models = ['LinearRegression', 'Ridge', 'SGDRegressor', 'ElasticNet', 'Lars',
        'Lasso', 'LassoLars','OrthogonalMatchingPursuit','ARDRegression','BayesianRidge','HuberRegressor',
        'RANSACRegressor', 'TheilSenRegressor', 'PoissonRegressor','TweedieRegressor','GammaRegressor',
        'PassiveAggressiveRegressor']
models = []
for model in linear_models:
    try:
        # Bind the class under its own name without building source code for exec().
        # import_module is served from sys.modules after the first successful call.
        globals()[model] = getattr(importlib.import_module('sklearn.linear_model'), model)
    except (ImportError, AttributeError):
        # Best effort: a class missing from the installed scikit-learn is reported, not fatal.
        print(f"Error importing {model}")

In [None]:
# Load the prepared dataset from a pickle file into `df`.
# NOTE(review): unpickling can execute arbitrary code -- only load 'finaldata.pkl'
# when it comes from a trusted source.
df = pd.read_pickle('finaldata.pkl')

In [None]:
# Encode every boolean column as integers with a fixed mapping (False -> 0, True -> 1).
# pd.factorize numbers values by order of first appearance, so the same boolean value
# could end up with a different code from one column to the next.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({col: 'int64' for col in bool_columns})
df = df.reset_index(drop=True)
# errors='ignore' keeps this cell re-runnable once the column has already been dropped.
df = df.drop(columns='listingInfo.buyItNowAvailable', errors='ignore')

In [None]:
def create_training_and_test_data(df):
    """Split ``df`` into standardized train/test features and their targets.

    The ``'Price'`` column is the target; every other column is a feature.
    Rows are shuffled and split 80/20 with a fixed seed (``random_state=42``).

    The scaler is fitted on the training rows only and then applied to the
    test rows, so no test-set statistics leak into the features the models
    are trained on.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Price'`` column plus the feature columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.
    """
    Y_targets = df['Price'].to_numpy()
    X = df.drop(columns='Price').to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(X,Y_targets,test_size=0.2,random_state=42,
                                                   shuffle=True)
    # Fit on the training split only; the test split is transformed with the
    # training mean/variance.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train,X_test,y_train,y_test

In [None]:
# Build the scaled train/test feature matrices and their 'Price' targets from `df`.
X_train, X_test, y_train, y_test = create_training_and_test_data(df)

In [None]:
# Show the shape of the training split, then of the test split.
for split in (X_train, X_test):
    print(split.shape)

In [None]:
# Seed both generators so the sampled candidate values are identical on every run.
random.seed(42)
numpy.random.seed(42)

# Candidate values for GradientBoostingRegressor's max_features; None is appended
# as one more candidate.
# NOTE(review): the 50-82 range presumably tracks the number of feature columns -- confirm.
max_features = [randint(50,82) for _ in range(0, 4)]
max_features.append(None)

# Each estimator is written next to its own search space so the two lists that
# fine_tune_models consumes (matched by position) cannot drift out of step.
model_param_pairs = [
    (RandomForestRegressor(), {
        'n_estimators' : numpy.random.randint(10,1000,5),
        'min_samples_split' : numpy.random.randint(2,10,5),
    }),
    (GradientBoostingRegressor(), {
        # NOTE(review): 'ls'/'lad' are the older scikit-learn spellings; newer releases
        # expect 'squared_error'/'absolute_error' -- confirm against the installed version.
        'loss' : ['ls','lad','huber'],
        'learning_rate' : numpy.random.uniform(0.001,0.3,5),
        'n_estimators' : numpy.random.randint(10,1000,5),
        'min_samples_split' : numpy.random.randint(2,10,5),
        'max_features' : max_features,
    }),
    (SVR(), {
        'C' : numpy.random.uniform(0.1,10,5),
        'kernel' : ['linear','poly','rbf','sigmoid'],
        'degree' : numpy.random.randint(1,5,5),
        'gamma' : ['scale','auto'],
        'epsilon' : numpy.random.uniform(0.1,3,5),
    }),
    (NuSVR(), {
        'C' : numpy.random.uniform(0.1,10,5),
        'kernel' : ['linear','poly','rbf','sigmoid'],
        'degree' : numpy.random.randint(1,5,5),
        'gamma' : ['scale','auto'],
        'nu' : numpy.random.uniform(0.1,1,5),
    }),
    # Ridge only takes the single regularisation strength below; it was previously
    # lined up with the BayesianRidge grid (and vice versa).
    (Ridge(), {
        'alpha' : numpy.random.uniform(1,1000,5),
    }),
    (BayesianRidge(), {
        'n_iter' : numpy.random.randint(300,1000,5),
        'alpha_1' : numpy.random.uniform(1e-8,1e-4,5),
        'alpha_2' : numpy.random.uniform(1e-8,1e-4,5),
        'lambda_1' : numpy.random.uniform(1e-8,1e-4,5),
        'lambda_2' : numpy.random.uniform(1e-8,1e-4,5),
    }),
    (ARDRegression(), {
        'n_iter' : numpy.random.randint(300,1000,5),
        'alpha_1' : numpy.random.uniform(1e-8,1e-4,5),
        'alpha_2' : numpy.random.uniform(1e-8,1e-4,5),
        'lambda_1' : numpy.random.uniform(1e-8,1e-4,5),
        'lambda_2' : numpy.random.uniform(1e-8,1e-4,5),
    }),
    (HuberRegressor(), {
        'epsilon' : numpy.random.uniform(1.01,2.7,5),
        'max_iter' : numpy.random.randint(100,300,5),
        # Same interval as before, written low-to-high: uniform() is not defined
        # for low > high.
        'alpha' : numpy.random.uniform(0.0,0.00001,5),
    }),
    (LinearSVR(), {
        'C' : numpy.random.uniform(0.1,10,5),
        'epsilon' : numpy.random.uniform(0,1,5),
        # NOTE(review): dual=False may be rejected with LinearSVR's default loss --
        # confirm; such candidates would fail to fit during the search.
        'dual' : [True,False],
    }),
]

models = [estimator for estimator, _ in model_param_pairs]
list_of_param_grids = [grid for _, grid in model_param_pairs]

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between targets and predictions.

    Computed as ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. A zero entry makes the corresponding term
        infinite or NaN, because each error is divided by its true value.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        The error expressed as a percentage.
    """
    # Coerce first so plain lists/tuples work; inputs are compared by position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
def fine_tune_models(models,params,X_train,y_train,X_test,y_test):
    """Run a randomized hyper-parameter search for each model and save its result.

    For every ``(model, grid)`` pair, matched by position, a
    ``RandomizedSearchCV`` (10-fold, ``neg_mean_squared_error``, up to 100
    candidates, ``random_state=42``) is fitted on the training data. The refit
    best estimator is then scored on the test data, and a one-row DataFrame
    with the results is pickled to ``f'{model}.pkl'``.

    Test metrics are computed on squared targets and squared predictions.
    NOTE(review): this presumably undoes a square-root transform applied to
    the price target upstream -- confirm. The CV score is *not* squared, so it
    is an RMSE on the scale of ``y_train``.

    Parameters
    ----------
    models : sequence of estimators
        Unfitted scikit-learn regressors.
    params : sequence of dict
        Search space for each model, in the same order as ``models``.
    X_train, y_train : array-like
        Training features and targets.
    X_test, y_test : array-like
        Held-out features and targets.

    Returns
    -------
    pandas.DataFrame
        One row per model, with the same columns as the pickled per-model frames.

    Raises
    ------
    ValueError
        If ``params`` has fewer entries than ``models``.
    """
    # Fail before any search starts rather than part-way through a long run.
    if len(params) < len(models):
        raise ValueError(
            f'Expected a parameter grid for each of the {len(models)} models, got {len(params)}'
        )

    result_columns = ['Model','CV Score','Test RMSE (£)','Test MAPE (%)','Best Params','Time Taken']
    y_test_squared = np.square(y_test)
    all_rows = []
    for i, (model, param) in enumerate(zip(models, params)):
        start = time.time()
        filename = f'{model}.pkl'
        print(f'Doing Model : {model} : Number {i+1} : Out of {len(models)}')
        reg = RandomizedSearchCV(model,param,scoring='neg_mean_squared_error',
                        cv=10,random_state=42, return_train_score=True,
                        verbose=1,n_iter=100)
        param_search = reg.fit(X_train,y_train)
        best_params = param_search.best_estimator_.get_params()

        # The scorer is negated MSE, so flip the sign before taking the root.
        best_cv_score = np.sqrt(-param_search.best_score_)

        predictions_squared = np.square(param_search.predict(X_test))
        test_rmse = np.sqrt(mean_squared_error(y_test_squared,predictions_squared))
        test_mape = mean_absolute_percentage_error(y_test_squared, predictions_squared)

        time_taken = time.time() - start

        row = [model,best_cv_score,test_rmse,test_mape,best_params,time_taken]
        # Build the one-row frame directly: DataFrame.append no longer exists in pandas 2.x.
        df = pd.DataFrame([row], columns=result_columns)
        df.to_pickle(filename)
        all_rows.append(row)

    return pd.DataFrame(all_rows, columns=result_columns)

In [None]:
# Run the hyper-parameter search for every entry in `models`; fine_tune_models
# pickles each model's result row to a file named after the model.
fine_tune_models(models,list_of_param_grids,X_train,y_train,X_test,y_test)