# Data Modelling

### Imports

In [1]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate

### Model Load

In [2]:
expected_model_version = '1.0'
model_path = r'..\Model\my_model.pkl'
if os.path.exists(model_path):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
   # if model.version != expected_model_version:
    #    print("Expected model version doesn't match version loaded")
    #if model.sklearn_version != sklearn_version:
     #   print("Warning: model created under different sklearn version")
else:
    print("Expected model not found")

### Data Load

In [3]:
test_data = pd.read_csv(r'..\Data\test_data.csv', index_col = 0)
test_data = test_data.reset_index()
X_test = test_data.drop(columns = ['index'], axis = 1)
X_test

Unnamed: 0,Id,Price,Bedroom_Count,Bathroom_Count,Sqft_Living,Sqft_Plot,Floor_Count,Waterfront,View,Condition,...,Change_In_Sqft_Plot,Listing_per_Zip_Code,Zipcode_Median_housePrice,sqft_living_zipcode_ratio,sqft_lot_zipcode_ratio,sqft_above_zipcode_ratio,sqft_basement_zipcode_ratio,sqft_living15_zipcode_ratio,sqft_lot15_zipcode_ratio,sqft_median_price_zipcode_ratio
0,1445500100,900000.0,5,2.25,2510,35691,1.0,0,0,3,...,-654,168,765475.0,0.005623,0.010660,0.006916,0.000000,0.007325,0.011354,1.175741
1,5515600163,420000.0,5,2.25,3070,64033,1.0,0,0,3,...,-35773,358,260000.0,0.004494,0.011951,0.004416,0.005232,0.002378,0.007048,1.615385
2,7300400320,340000.0,4,2.50,2810,6481,2.0,0,0,3,...,477,351,309780.0,0.003646,0.000568,0.003877,0.000000,0.003558,0.000656,1.097553
3,251200240,491500.0,4,2.75,2100,7236,1.0,0,0,3,...,283,543,446000.0,0.001956,0.001364,0.001551,0.004084,0.001829,0.001631,1.102018
4,1025039086,1875000.0,3,2.50,3280,29111,2.0,1,3,3,...,-8037,316,689900.0,0.004800,0.016935,0.006522,0.000000,0.005607,0.012626,2.717785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,4302200625,335000.0,3,1.75,1790,5120,1.0,0,0,4,...,0,330,315000.0,0.003637,0.002803,0.002391,0.008595,0.002579,0.003043,1.063492
211,8562780290,329950.0,2,2.25,1260,1032,2.0,0,0,3,...,-223,411,571000.0,0.001218,0.000076,0.001346,0.000545,0.001269,0.000069,0.577846
212,1762600090,1211000.0,4,2.50,3430,35120,2.0,0,0,3,...,110,431,678000.0,0.003345,0.007754,0.003867,0.000000,0.004121,0.008232,1.786136
213,2867300190,363000.0,4,2.50,3753,7204,2.0,0,0,3,...,2171,492,269000.0,0.003817,0.001395,0.003958,0.002969,0.003603,0.001954,1.349442


#### Data Preparation

In [4]:
y_test = X_test['Price']
X_test = X_test.drop(columns=['Price'], index= 1)

In [5]:
train_data = pd.read_csv(r'..\data\train_data.csv', index_col = 0)
train_data = train_data.reset_index()
y_train = train_data['Price']
X_train = train_data.drop(columns = ['index', 'Price'], axis = 1)

#### Fitting Final Model

In [6]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selectkbest',
                 SelectKBest(k=10,
                             score_func=<function f_regression at 0x0000012F0189E168>)),
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=16, n_jobs=None,
                                       oo

In [7]:
cv_results = cross_validate(model, X_train, y_train, scoring='r2', cv=5, n_jobs=-1)
cv_scores = cv_results['test_score']
cv_scores

array([0.99471513, 0.98130837, 0.98541493, 0.97103883, 0.99440846])

In [8]:
print("Mean Cross Validation Scores: ", np.mean(cv_scores))
print("Std of Cross Validation Scores: ", np.std(cv_scores))

Mean Cross Validation Scores:  0.9853771463849175
Std of Cross Validation Scores:  0.008842016663800138


#### Applying on Test Data

In [9]:
y_te_pred = model.predict(X_test)

In [10]:
results = X_test
results['original price'] = y_test
results['model price'] = y_te_pred

In [11]:
results

Unnamed: 0,Id,Bedroom_Count,Bathroom_Count,Sqft_Living,Sqft_Plot,Floor_Count,Waterfront,View,Condition,Grade,...,Zipcode_Median_housePrice,sqft_living_zipcode_ratio,sqft_lot_zipcode_ratio,sqft_above_zipcode_ratio,sqft_basement_zipcode_ratio,sqft_living15_zipcode_ratio,sqft_lot15_zipcode_ratio,sqft_median_price_zipcode_ratio,original price,model price
0,1445500100,5,2.25,2510,35691,1.0,0,0,3,9,...,765475.0,0.005623,0.010660,0.006916,0.000000,0.007325,0.011354,1.175741,900000.0,8.928675e+05
2,7300400320,4,2.50,2810,6481,2.0,0,0,3,9,...,309780.0,0.003646,0.000568,0.003877,0.000000,0.003558,0.000656,1.097553,340000.0,3.367450e+05
3,251200240,4,2.75,2100,7236,1.0,0,0,3,8,...,446000.0,0.001956,0.001364,0.001551,0.004084,0.001829,0.001631,1.102018,491500.0,4.919438e+05
4,1025039086,3,2.50,3280,29111,2.0,1,3,3,11,...,689900.0,0.004800,0.016935,0.006522,0.000000,0.005607,0.012626,2.717785,1875000.0,1.886656e+06
5,1328300990,3,1.75,1530,7650,1.0,0,0,4,8,...,335000.0,0.001651,0.000867,0.001875,0.000000,0.002072,0.001094,0.946269,317000.0,3.173125e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,4302200625,3,1.75,1790,5120,1.0,0,0,4,6,...,315000.0,0.003637,0.002803,0.002391,0.008595,0.002579,0.003043,1.063492,335000.0,3.352305e+05
211,8562780290,2,2.25,1260,1032,2.0,0,0,3,7,...,571000.0,0.001218,0.000076,0.001346,0.000545,0.001269,0.000069,0.577846,329950.0,3.302406e+05
212,1762600090,4,2.50,3430,35120,2.0,0,0,3,10,...,678000.0,0.003345,0.007754,0.003867,0.000000,0.004121,0.008232,1.786136,1211000.0,1.213750e+06
213,2867300190,4,2.50,3753,7204,2.0,0,0,3,10,...,269000.0,0.003817,0.001395,0.003958,0.002969,0.003603,0.001954,1.349442,363000.0,3.813383e+05


In [12]:
path = r'..\Reports\results.csv'
results.to_csv(path)