In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

%matplotlib inline

In [57]:
df = pd.read_csv("AmesHousingWithAddress.csv")

df = df[["SalePrice", "Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
        "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
        "Year Built", "Year Remod/Add"]]

le = LabelEncoder()
df["Neighborhood"] = le.fit_transform(df["Neighborhood"])

df["Bsmt Full Bath"] = df["Bsmt Full Bath"].fillna(0)

df.head()

Unnamed: 0,SalePrice,Lot Area,Gr Liv Area,Total Bsmt SF,Open Porch SF,Wood Deck SF,Garage Area,Bedroom AbvGr,Full Bath,Bsmt Full Bath,Kitchen AbvGr,Fireplaces,Neighborhood,Year Built,Year Remod/Add
0,215000,31770,1656,1080,62,210,528,3,1,1.0,1,2,13,1960,1960
1,105000,11622,896,882,0,140,730,2,1,0.0,1,0,13,1961,1961
2,172000,14267,1329,1329,36,393,312,3,1,0.0,1,0,13,1958,1958
3,244000,11160,2110,2110,0,0,522,3,2,1.0,1,2,13,1968,1968
4,189900,13830,1629,928,34,212,482,3,2,0.0,1,1,8,1997,1998


In [54]:
# New data split testing

def conformal_prediction(df, regressor, gamma, confidence):
    
    data_0 = df.sample(frac = 0.2, axis = "index")

    data_1 = df[~df.isin(data_0)].dropna(how = "any").sample(frac = 0.25, axis = "index")

    data_2 = df[~df.isin(data_0)].dropna(how = "any")
    data_2 = data_2[~data_2.isin(data_1)].dropna(how = "any").sample(frac = 0.333, axis = "index")

    data_3 = df[~df.isin(data_0)].dropna(how = "any")
    data_3 = data_3[~data_3.isin(data_1)].dropna(how = "any")
    data_3 = data_3[~data_3.isin(data_2)].dropna(how = "any").sample(frac = 0.5, axis = "index")

    data_4 = df[~df.isin(data_0)].dropna(how = "any")
    data_4 = data_4[~data_4.isin(data_1)].dropna(how = "any")
    data_4 = data_4[~data_4.isin(data_2)].dropna(how = "any")
    data_4 = data_4[~data_4.isin(data_3)].dropna(how = "any")

    cross_val_list = [data_0, data_1, data_2, data_3, data_4]

    for i in range(len(cross_val_list)):
        
        train_data = cross_val_list[0:3]
        train_data = pd.concat([train_data[0], train_data[1], train_data[2]])
        cal_data = cross_val_list[3]
        test_data = cross_val_list[4]       
        
        X_train = train_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_train = train_data["SalePrice"]

        X_cal = cal_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_cal = cal_data["SalePrice"]

        X_test = test_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_test = test_data["SalePrice"]

        reg = regressor

        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_cal)
        new_pred = reg.predict(X_test)

        # Nonconformity Measure
        ncm = abs((y_cal - y_pred) / (gamma + y_pred))
        ncm = list(ncm)
        
        q = np.quantile(ncm, confidence)
        
        lower = new_pred - (q * (gamma + new_pred))
        upper = new_pred + (q * (gamma + new_pred))
        global pred_intervals
        pred_intervals = list(zip(lower, upper))

        covered_count = 0
        not_covered_count = 0

        y_test.reset_index(drop = True, inplace = True)

        for i in range(len(y_test)):
            if (y_test[i] >= pred_intervals[i][0]) & (y_test[i] <= pred_intervals[i][1]):
                covered_count += 1
            else:
                not_covered_count += 1

        global coverage
        coverage = covered_count / (covered_count + not_covered_count)

        global int_size
        int_size = pred_intervals[0][1] - pred_intervals[0][0]

        cross_val_list.append(cross_val_list.pop(0))

In [55]:
regressors = [LinearRegression(), RandomForestRegressor(), GradientBoostingRegressor()]
confidences = [0.9, 0.95, 0.99]
gammas = [0, 25000, 50000, 75000, 100000, 125000]

In [56]:
n_trials = 100

for reg in regressors:
    for conf in confidences:
        for gamma in gammas:
            total_cvg = 0
            int_size_list = []
            for trial in range(n_trials):
                conformal_prediction(df, reg, gamma, conf)
                total_cvg += coverage
                int_size_list.append(int_size)
                #total_int_size += int_size
            print("Model: " + str(reg))
            print("Confidence: " + str(conf*100) + "%")
            print("Gamma Value: " + str(gamma))
            print("Average Coverage: " + str(total_cvg / n_trials))
            print("Median Interval Width: " + str(np.median(int_size_list)))
            print("--------------------------------------------------------------------")
            #print("Average Interval Width: " + str(total_int_size / n_trials))

Model: LinearRegression()
Confidence: 90.0%
Gamma Value: 0
Average Coverage: 0.901605206073753
Median Interval Width: 84312.87800410838
--------------------------------------------------------------------
Model: LinearRegression()
Confidence: 90.0%
Gamma Value: 25000
Average Coverage: 0.8965943600867677
Median Interval Width: 86972.12167100928
--------------------------------------------------------------------
Model: LinearRegression()
Confidence: 90.0%
Gamma Value: 50000
Average Coverage: 0.8985466377440351
Median Interval Width: 87016.91649733376
--------------------------------------------------------------------
Model: LinearRegression()
Confidence: 90.0%
Gamma Value: 75000
Average Coverage: 0.8956832971800435
Median Interval Width: 85804.46321601086
--------------------------------------------------------------------
Model: LinearRegression()
Confidence: 90.0%
Gamma Value: 100000
Average Coverage: 0.8993275488069418
Median Interval Width: 81241.54691444666
-----------------------

Model: GradientBoostingRegressor()
Confidence: 90.0%
Gamma Value: 75000
Average Coverage: 0.8975704989154012
Median Interval Width: 59351.01240684223
--------------------------------------------------------------------
Model: GradientBoostingRegressor()
Confidence: 90.0%
Gamma Value: 100000
Average Coverage: 0.8965075921908893
Median Interval Width: 62372.85192333127
--------------------------------------------------------------------
Model: GradientBoostingRegressor()
Confidence: 90.0%
Gamma Value: 125000
Average Coverage: 0.9017570498915402
Median Interval Width: 62959.8084424946
--------------------------------------------------------------------
Model: GradientBoostingRegressor()
Confidence: 95.0%
Gamma Value: 0
Average Coverage: 0.9452277657266805
Median Interval Width: 75001.35390375463
--------------------------------------------------------------------
Model: GradientBoostingRegressor()
Confidence: 95.0%
Gamma Value: 25000
Average Coverage: 0.9459436008676786
Median Interval Wi

In [None]:
denom_list = [(y_test - y_pred), (gamma + y_pred), (np.exp(gamma * y_pred)), ]

In [None]:
for denom in denom_list:
    ncm = abs((y_cal - y_pred) / (denom))

In [158]:
# Original Mixed Model

def mixed_models_conf_pred(df, gamma, confidence):

    data_0 = df.iloc[0:460]
    data_1 = df.iloc[460:920]
    data_2 = df.iloc[920:1380]
    data_3 = df.iloc[1380:1840]
    data_4 = df.iloc[1840:]

    cross_val_list = [data_0, data_1, data_2, data_3, data_4]

    for i in range(len(cross_val_list)):
        train_data = cross_val_list[0:3]
        train_data = pd.concat([train_data[0], train_data[1], train_data[2]])
        cal_data = cross_val_list[3]
        test_data = cross_val_list[4]       
        
        X_train = train_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_train = train_data["SalePrice"]

        X_cal = cal_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_cal = cal_data["SalePrice"]

        X_test = test_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_test = test_data["SalePrice"]
        
        models = [LinearRegression(), RandomForestRegressor(), GradientBoostingRegressor()]
    
        cal_lr_list = []
        cal_rf_list = []
        cal_gb_list = []
        
        for model in models:
            model.fit(X_cal, y_cal)
            cal_pred = model.predict(X_cal)
            cal_pred = (1/3) * cal_pred
            if model == models[0]:
                cal_lr_list = cal_pred
            elif model == models[1]:
                cal_rf_list = cal_pred
            elif model == models[2]:
                cal_gb_list = cal_pred
        
        zipped_cal_list = list(zip(cal_lr_list, cal_rf_list, cal_gb_list))
        #zipped_list = list(zipped)
        
        combined_cal_list = []
        for i in range(len(zipped_cal_list)):
            combined_cal_list.append(sum(zipped_cal_list[i]))

        new_lr_list = []
        new_rf_list = []
        new_gb_list = []
            
        for model in models:
            model.fit(X_train, y_train)
            new_pred = model.predict(X_test)
            new_pred = (1/3) * new_pred
            if model == models[0]:
                new_lr_list = new_pred
            elif model == models[1]:
                new_rf_list = new_pred
            elif model == models[2]:
                new_gb_list = new_pred
        
        print(new_lr_list)
        print("-----------------------------------------")
        print(new_rf_list)
        print("-----------------------------------------")
        print(new_gb_list)
        print("-----------------------------------------")
        print(len(new_lr_list))
        print(len(new_rf_list))
        print(len(new_gb_list))
        mod
        zipped_new_list = list(zip(new_lr_list, new_rf_list, new_gb_list))
        
        combined_new_list = []
        for i in range(len(zipped_new_list)):
            combined_new_list.append(sum(zipped_new_list[i]))

        #print(combined_new_list)
        #print("---------------------------------------------------------------------------------------------------------")
        #print(combined_cal_list)
        
        cal_pred_array = np.array(combined_cal_list)
        test_pred_array = np.array(combined_new_list)

        # Nonconformity Measure
        ncm = abs((y_cal - cal_pred_array) / (gamma + cal_pred_array))
        ncm = list(ncm)
        
        q = np.quantile(ncm, confidence)
        
        lower = test_pred_array - (q * (gamma + test_pred_array))
        upper = test_pred_array + (q * (gamma + test_pred_array))
        global pred_intervals
        pred_intervals = list(zip(lower, upper))

        covered_count = 0
        not_covered_count = 0

        y_test.reset_index(drop = True, inplace = True)

        for i in range(len(y_test)):
            if (y_test[i] >= pred_intervals[i][0]) & (y_test[i] <= pred_intervals[i][1]):
                covered_count += 1
            else:
                not_covered_count += 1

        global coverage
        coverage = covered_count / (covered_count + not_covered_count)

        global int_size
        int_size = pred_intervals[0][1] - pred_intervals[0][0]

        cross_val_list.append(cross_val_list.pop(0))

In [159]:
# Original Mixed Model Parameter List

#regressors = [LinearRegression(), RandomForestRegressor(), GradientBoostingRegressor()]
confidences = [0.9, 0.95, 0.99]
gammas = [0, 25000, 50000, 75000, 100000, 125000]

In [160]:
# Original Mixed Model Testing


n_trials = 100

#for reg in regressors:
for conf in confidences:
    for gamma in gammas:
        total_cvg = 0
        int_size_list = []
        for trial in range(n_trials):
            mixed_models_conf_pred(df, gamma, conf)
            total_cvg += coverage
            int_size_list.append(int_size)
            #total_int_size += int_size
        print("Model: Mixed Model")
        print("Confidence: " + str(conf*100) + "%")
        print("Gamma Value: " + str(gamma))
        print("Average Coverage: " + str(total_cvg / n_trials))
        print("Median Interval Width: " + str(np.median(int_size_list)))
        print("--------------------------------------------------------------------")
        #print("Average Interval Width: " + str(total_int_size / n_trials))

[ 56504.25011706  32218.29908553  37715.8840314   28436.90835645
  32196.33271754  27679.06090916  54206.35061506  58764.16431149
  70495.81352824  83826.07875173  57414.3066677   29034.91758214
  40337.91770273  46717.30884712  57297.838949    37357.43905356
  42392.00400852  37610.7007694   52748.51196796  46761.62962563
  76701.87998679  77957.61861011  64382.81403768  80267.72237021
  63088.08152619  57992.15741143  67796.44652534  63201.33874802
  78992.47459608  76453.6141204  107131.78908211  53413.48345459
  56552.73937781  64674.54327943  56677.90119153  77313.27146766
  85859.10502435  63605.70216688  48124.14711266  90523.35797458
 105459.08646364  69685.17510791  80488.63041995  84361.38409289
  63921.34425544  58130.07410184  85296.035498    64745.28865596
  80832.59768942  97317.77919007  59119.48452414  58372.10649852
  64473.1743511   49829.31547886  55942.97286878  61917.25532164
  46325.17757023  27938.18087148  36967.99770032  75901.91402664
  34166.87424835  47352.1

NameError: name 'mod' is not defined

In [161]:
# New Edited Mixed Model

def mixed_models_conf_pred(df, gamma, confidence):

    data_0 = df.iloc[0:460]
    data_1 = df.iloc[460:920]
    data_2 = df.iloc[920:1380]
    data_3 = df.iloc[1380:1840]
    data_4 = df.iloc[1840:]

    cross_val_list = [data_0, data_1, data_2, data_3, data_4]

    for i in range(len(cross_val_list)):
        train_data = cross_val_list[0:3]
        train_data = pd.concat([train_data[0], train_data[1], train_data[2]])
        cal_data = cross_val_list[3]
        test_data = cross_val_list[4]       
        
        X_train = train_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_train = train_data["SalePrice"]

        X_cal = cal_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_cal = cal_data["SalePrice"]

        X_test = test_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_test = test_data["SalePrice"]
        
        models = [LinearRegression(), RandomForestRegressor(), GradientBoostingRegressor()]
    
        cal_lr_list = []
        cal_rf_list = []
        cal_gb_list = []
        
        for model in models:
            model.fit(X_train, y_train)
            cal_pred = model.predict(X_cal)
            cal_pred = (1/3) * cal_pred
            if model == models[0]:
                cal_lr_list = cal_pred
            elif model == models[1]:
                cal_rf_list = cal_pred
            elif model == models[2]:
                cal_gb_list = cal_pred
        
        zipped_cal_list = list(zip(cal_lr_list, cal_rf_list, cal_gb_list))
        #zipped_list = list(zipped)
        
        combined_cal_list = []
        for i in range(len(zipped_cal_list)):
            combined_cal_list.append(sum(zipped_cal_list[i]))

        new_lr_list = []
        new_rf_list = []
        new_gb_list = []
            
        for model in models:
            model.fit(X_train, y_train)
            new_pred = model.predict(X_test)
            new_pred = (1/3) * new_pred
            if model == models[0]:
                new_lr_list = new_pred
            elif model == models[1]:
                new_rf_list = new_pred
            elif model == models[2]:
                new_gb_list = new_pred
        
        #print(new_lr_list)
        #print("-----------------------------------------")
        #print(new_rf_list)
        #print("-----------------------------------------")
        #print(new_gb_list)
        #print("-----------------------------------------")
        #print(len(new_lr_list))
        #print(len(new_rf_list))
        #print(len(new_gb_list))
        #mod
        zipped_new_list = list(zip(new_lr_list, new_rf_list, new_gb_list))
        
        combined_new_list = []
        for i in range(len(zipped_new_list)):
            combined_new_list.append(sum(zipped_new_list[i]))

        #print(combined_new_list)
        #print("---------------------------------------------------------------------------------------------------------")
        #print(combined_cal_list)
        
        cal_pred_array = np.array(combined_cal_list)
        test_pred_array = np.array(combined_new_list)

        # Nonconformity Measure
        ncm = abs((y_cal - cal_pred_array) / (gamma + cal_pred_array))
        ncm = list(ncm)
        
        q = np.quantile(ncm, confidence)
        
        lower = test_pred_array - (q * (gamma + test_pred_array))
        upper = test_pred_array + (q * (gamma + test_pred_array))
        global pred_intervals
        pred_intervals = list(zip(lower, upper))

        covered_count = 0
        not_covered_count = 0

        y_test.reset_index(drop = True, inplace = True)

        for i in range(len(y_test)):
            if (y_test[i] >= pred_intervals[i][0]) & (y_test[i] <= pred_intervals[i][1]):
                covered_count += 1
            else:
                not_covered_count += 1

        global coverage
        coverage = covered_count / (covered_count + not_covered_count)

        global int_size
        int_size = pred_intervals[0][1] - pred_intervals[0][0]

        cross_val_list.append(cross_val_list.pop(0))

In [162]:
# New Edited Mixed Model Parameter List

#regressors = [LinearRegression(), RandomForestRegressor(), GradientBoostingRegressor()]
confidences = [0.9, 0.95, 0.99]
gammas = [0, 25000, 50000, 75000, 100000, 125000]

In [163]:
# New Edited Mixed Model Testing

n_trials = 100

#for reg in regressors:
for conf in confidences:
    for gamma in gammas:
        total_cvg = 0
        int_size_list = []
        for trial in range(n_trials):
            mixed_models_conf_pred(df, gamma, conf)
            total_cvg += coverage
            int_size_list.append(int_size)
            #total_int_size += int_size
        print("Model: Mixed Model")
        print("Confidence: " + str(conf*100) + "%")
        print("Gamma Value: " + str(gamma))
        print("Average Coverage: " + str(total_cvg / n_trials))
        print("Median Interval Width: " + str(np.median(int_size_list)))
        print("--------------------------------------------------------------------")
        #print("Average Interval Width: " + str(total_int_size / n_trials))

Model: Mixed Model
Confidence: 90.0%
Gamma Value: 0
Average Coverage: 0.8960652173913054
Median Interval Width: 44220.907075366944
--------------------------------------------------------------------
Model: Mixed Model
Confidence: 90.0%
Gamma Value: 25000
Average Coverage: 0.901195652173914
Median Interval Width: 45768.34316580181
--------------------------------------------------------------------
Model: Mixed Model
Confidence: 90.0%
Gamma Value: 50000
Average Coverage: 0.9011086956521747
Median Interval Width: 47048.95796568077
--------------------------------------------------------------------
Model: Mixed Model
Confidence: 90.0%
Gamma Value: 75000
Average Coverage: 0.8983478260869576
Median Interval Width: 47796.26103653578
--------------------------------------------------------------------
Model: Mixed Model
Confidence: 90.0%
Gamma Value: 100000
Average Coverage: 0.8965217391304359
Median Interval Width: 48540.669250090505
--------------------------------------------------------

KeyboardInterrupt: 

In [1]:
from sklearn.neighbors import NearestNeighbors

In [None]:
def conformal_prediction(df, regressor, gamma, confidence):

    data_0 = df.iloc[0:460]
    data_1 = df.iloc[460:920]
    data_2 = df.iloc[920:1380]
    data_3 = df.iloc[1380:1840]
    data_4 = df.iloc[1840:]

    cross_val_list = [data_0, data_1, data_2, data_3, data_4]

    for i in range(len(cross_val_list)):
        train_data = cross_val_list[0:3]
        train_data = pd.concat([train_data[0], train_data[1], train_data[2]])
        cal_data = cross_val_list[3]
        test_data = cross_val_list[4]       
        
        X_train = train_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_train = train_data["SalePrice"]

        X_cal = cal_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_cal = cal_data["SalePrice"]

        X_test = test_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_test = test_data["SalePrice"]

        reg = regressor

        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_cal)
        new_pred = reg.predict(X_test)

In [None]:
new_train_data = X_train.merge(y_train)
new_test_data = X_test.merge(y_test)

neigh = NearestNeighbors(n_neighbors = 2)
neigh.fit(new_train_data)

neigh.kneighbors(new_test_data)

In [None]:
def ncm_29(df, regressor, gamma, confidence):

    data_0 = df.iloc[0:460]
    data_1 = df.iloc[460:920]
    data_2 = df.iloc[920:1380]
    data_3 = df.iloc[1380:1840]
    data_4 = df.iloc[1840:]

    cross_val_list = [data_0, data_1, data_2, data_3, data_4]

    for i in range(len(cross_val_list)):
        train_data = cross_val_list[0:3]
        train_data = pd.concat([train_data[0], train_data[1], train_data[2]])
        cal_data = cross_val_list[3]
        test_data = cross_val_list[4]       
        
        X_train = train_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_train = train_data["SalePrice"]

        X_cal = cal_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_cal = cal_data["SalePrice"]

        X_test = test_data[["Lot Area", "Gr Liv Area", "Total Bsmt SF", "Open Porch SF", "Wood Deck SF", "Garage Area", 
                            "Bedroom AbvGr", "Full Bath", "Bsmt Full Bath", "Kitchen AbvGr", "Fireplaces", "Neighborhood", 
                            "Year Built", "Year Remod/Add"]]
        y_test = test_data["SalePrice"]

        reg = regressor

        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_cal)
        new_pred = reg.predict(X_test)
        
        