In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("train.csv")
x = data.drop(["Unnamed: 0","median_house_value"],axis=1)
y = pd.DataFrame(data["median_house_value"])

In [3]:
x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-117.97,34.15,33.0,2474.0,472.0,1268.0,437.0,6.4576,INLAND
1,-117.92,33.62,37.0,2038.0,379.0,837.0,381.0,5.2416,<1H OCEAN
2,-122.33,37.57,43.0,2543.0,621.0,1301.0,606.0,3.1111,NEAR OCEAN
3,-122.53,37.92,45.0,1530.0,324.0,608.0,328.0,3.8750,NEAR BAY
4,-117.37,34.01,15.0,1386.0,247.0,703.0,185.0,3.6415,INLAND
...,...,...,...,...,...,...,...,...,...
16507,-123.35,39.42,18.0,1619.0,346.0,904.0,295.0,2.1625,<1H OCEAN
16508,-121.43,38.56,46.0,1316.0,244.0,452.0,245.0,3.0938,INLAND
16509,-121.30,37.92,28.0,3308.0,766.0,3201.0,720.0,1.7694,INLAND
16510,-119.00,35.35,35.0,1164.0,277.0,992.0,284.0,1.4015,INLAND


In [4]:
y

Unnamed: 0,median_house_value
0,500001.0
1,471300.0
2,318400.0
3,390800.0
4,124200.0
...,...
16507,77200.0
16508,137800.0
16509,73900.0
16510,48700.0


In [5]:
ocean_proximity = { "<1H OCEAN": 0, "INLAND": 1, "ISLAND": 2, "NEAR BAY": 3, "NEAR OCEAN": 4}
x["ocean_proximity"] = x["ocean_proximity"].replace(ocean_proximity)
x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-117.97,34.15,33.0,2474.0,472.0,1268.0,437.0,6.4576,1
1,-117.92,33.62,37.0,2038.0,379.0,837.0,381.0,5.2416,0
2,-122.33,37.57,43.0,2543.0,621.0,1301.0,606.0,3.1111,4
3,-122.53,37.92,45.0,1530.0,324.0,608.0,328.0,3.8750,3
4,-117.37,34.01,15.0,1386.0,247.0,703.0,185.0,3.6415,1
...,...,...,...,...,...,...,...,...,...
16507,-123.35,39.42,18.0,1619.0,346.0,904.0,295.0,2.1625,0
16508,-121.43,38.56,46.0,1316.0,244.0,452.0,245.0,3.0938,1
16509,-121.30,37.92,28.0,3308.0,766.0,3201.0,720.0,1.7694,1
16510,-119.00,35.35,35.0,1164.0,277.0,992.0,284.0,1.4015,1


In [6]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
x = imputer.fit_transform(x)

In [7]:
x_train, x_valid , y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=43)

In [8]:
x_train

array([[-118.42  ,   34.    ,   14.    , ...,  978.    ,    3.1603,
           0.    ],
       [-118.44  ,   34.2   ,   35.    , ...,  495.    ,    2.5197,
           0.    ],
       [-118.3   ,   33.91  ,   34.    , ...,  500.    ,    2.6182,
           0.    ],
       ...,
       [-118.45  ,   35.58  ,   16.    , ...,  807.    ,    1.8819,
           1.    ],
       [-122.4   ,   37.6   ,   52.    , ...,  210.    ,    6.221 ,
           4.    ],
       [-117.09  ,   32.75  ,   30.    , ...,  493.    ,    1.6034,
           4.    ]])

In [9]:
scaler = preprocessing.MinMaxScaler()

# Gradient Boosting

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

In [11]:
gbr = GradientBoostingRegressor()

n_estimators = [15,50,250,1000]

learning_rate = [0.1,0.01,0.001]

max_depth = [3,5,11]

gbr_params = {
 'Regressor__n_estimators': n_estimators,
 'Regressor__learning_rate': learning_rate,
 'Regressor__max_depth':max_depth
 }

gbr_pipe = Pipeline([('scaler',scaler),('Regressor', gbr)])

boost_grid = GridSearchCV(estimator = gbr_pipe, param_grid = gbr_params, 
                                cv = 5, verbose=2, scoring='r2',n_jobs = -1)

boost_grid.fit(x_train, y_train.values.ravel())

print(boost_grid.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'Regressor__learning_rate': 0.1, 'Regressor__max_depth': 5, 'Regressor__n_estimators': 1000}


In [12]:
model_gbr = GradientBoostingRegressor(n_estimators=1000,max_depth=5,learning_rate=0.1)
gbr_pipe = Pipeline([('scaler',scaler),('Regressor', model_gbr)])

score_gbr = cross_val_score(gbr_pipe, x_train, y_train, cv = 5, scoring = "r2")
gbr_pipe.fit(x_train,y_train.values.ravel())

print(score_gbr)

[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__n_estimators=15; total time=   0.3s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__n_estimators=250; total time=   4.7s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__n_estimators=1000; total time=  22.5s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=11, Regressor__n_estimators=15; total time=   1.2s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=11, Regressor__n_estimators=15; total time=   1.1s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=11, Regressor__n_estimators=50; total time=   3.5s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=11, Regressor__n_estimators=250; total time=  23.0s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=11, Regressor__n_estimators=250; total time=  17.4s
[CV] END Regressor__learning_rate=0.01, Regressor__max_depth=3, Regressor__n_estimators=250; total time=   5.1

[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__n_estimators=15; total time=   0.3s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=3, Regressor__n_estimators=250; total time=   4.8s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=5, Regressor__n_estimators=15; total time=   0.5s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=5, Regressor__n_estimators=15; total time=   0.5s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=5, Regressor__n_estimators=15; total time=   0.5s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=5, Regressor__n_estimators=50; total time=   1.6s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=5, Regressor__n_estimators=250; total time=  10.7s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=5, Regressor__n_estimators=250; total time=   8.5s
[CV] END Regressor__learning_rate=0.1, Regressor__max_depth=5, Regressor__n_estimators=1000; total time=  38.4s
[CV] 

[0.83197523 0.82456374 0.8146382  0.81362387 0.83994015]


In [13]:
score_gbr.mean()

0.8249482399509862

# Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(x_train,y_train)
nb.score(x_valid,y_valid)

0.007266121707538601

In [16]:
nb_pipe = Pipeline([('scaler',scaler),('Regressor', nb)])
score_nb = cross_val_score(nb_pipe, x_train, y_train, cv = 5, scoring = "r2")
score_nb.mean()

0.3107359656652626

# KNN

In [17]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(0.1)

knn = KNeighborsRegressor()

knn_pipe =  Pipeline([("selector", selector),("scaler", scaler), ("knn", knn)])

In [18]:
#These is cross validation score for knn model
score_knn = cross_val_score(knn_pipe, x_train, y_train, cv = 5, scoring = 'r2')
knn_pipe.fit(x_train, y_train)
print(score_knn.mean())

0.6881183324487369


In [20]:
knn_param_grid = { 
    "knn__n_neighbors": np.arange(1,50,1),
    "knn__metric": ('minkowski', 'chebyshev'),
    'knn__p': (1,2),
}

In [21]:
#These model find best hyperparameter

knn_model = GridSearchCV(estimator = knn_pipe,
                         param_grid= knn_param_grid,
                         scoring = 'r2',
                         cv=5,
                         refit=True)

knn_model.fit(x_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('selector',
                                        VarianceThreshold(threshold=0.1)),
                                       ('scaler', MinMaxScaler()),
                                       ('knn', KNeighborsRegressor())]),
             param_grid={'knn__metric': ('minkowski', 'chebyshev'),
                         'knn__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                         'knn__p': (1, 2)},
             scoring='r2')

In [22]:
#we find the best score and best hyperparameters
#cv score increse 0.7 to 0.72

print(knn_model.best_score_)
print(knn_model.best_estimator_)
print(knn_model.best_params_)

0.7164062775409248
Pipeline(steps=[('selector', VarianceThreshold(threshold=0.1)),
                ('scaler', MinMaxScaler()),
                ('knn', KNeighborsRegressor(n_neighbors=9, p=1))])
{'knn__metric': 'minkowski', 'knn__n_neighbors': 9, 'knn__p': 1}


In [23]:
#Final model with best parameter
knn_tuned_model= KNeighborsRegressor(metric="minkowski", n_neighbors=9, p=1).fit(x_train,y_train)

# Random Forest Regressor

In [24]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

rf_pipe =  Pipeline([("selector", selector),("scaler", scaler), ("rf", rf)])

In [25]:
#These is cross validation score for random forest model
scores = cross_val_score(rf_pipe, x_train, y_train, cv = 5, scoring = 'r2')
rf_pipe.fit(x_train, y_train)

print(scores.mean())

0.8003702536635018


In [26]:
#These is r2 result 

r2_score=r2_score(y_valid,rf_pipe.predict(x_valid))

print('r2_score')
print(r2_score)

r2_score
0.8246240475309595


In [27]:
#These is grid for find the best possible hyperparameter 

rf_param_grid = { 
    'rf__n_estimators': [100, 200, 300, 400, 500],
    'rf__max_features': ['auto', 'sqrt'],
    'rf__max_depth' : [4,5,6,7,8],
    "rf__random_state":[1]
}


In [28]:
#These model find best hyperparameter

rf_model = GridSearchCV(estimator = rf_pipe,
                         param_grid= rf_param_grid,
                         scoring = 'r2',
                         return_train_score=True,
                         cv"=5,
                         refit=True)

rf_model.fit(x_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('selector',
                                        VarianceThreshold(threshold=0.1)),
                                       ('scaler', MinMaxScaler()),
                                       ('rf', RandomForestRegressor())]),
             param_grid={'rf__max_depth': [4, 5, 6, 7, 8],
                         'rf__max_features': ['auto', 'sqrt'],
                         'rf__n_estimators': [100, 200, 300, 400, 500],
                         'rf__random_state': [1]},
             return_train_score=True, scoring='r2')

In [29]:
print(rf_model.best_score_)
print(rf_model.best_estimator_)
print(rf_model.best_params_)

0.7482169910000159
Pipeline(steps=[('selector', VarianceThreshold(threshold=0.1)),
                ('scaler', MinMaxScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=8, n_estimators=500,
                                       random_state=1))])
{'rf__max_depth': 8, 'rf__max_features': 'auto', 'rf__n_estimators': 500, 'rf__random_state': 1}


In [30]:
#Final model with best parameter
rf_tuned_model= RandomForestRegressor(max_depth=8,max_features="auto", n_estimators=500,random_state=1).fit(x_train,y_train)

# Predicting Y-test

In [31]:
test = pd.read_csv("test.csv")
test

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,0,-118.08,33.83,30.0,2188.0,556.0,2727.0,525.0,2.7759,<1H OCEAN
1,1,-121.54,38.50,15.0,6093.0,1051.0,2415.0,997.0,4.2075,INLAND
2,2,-120.01,39.26,26.0,1930.0,391.0,307.0,138.0,2.6023,INLAND
3,3,-118.45,34.19,37.0,1073.0,254.0,739.0,253.0,2.4667,<1H OCEAN
4,4,-118.32,34.03,47.0,1082.0,198.0,455.0,193.0,3.0132,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
4123,4123,-119.33,36.19,27.0,418.0,163.0,332.0,141.0,1.0714,INLAND
4124,4124,-122.15,37.35,23.0,3814.0,485.0,1344.0,464.0,12.9792,NEAR BAY
4125,4125,-122.33,37.97,45.0,1982.0,376.0,1179.0,398.0,3.5463,NEAR BAY
4126,4126,-120.53,39.79,18.0,1234.0,266.0,543.0,201.0,2.5156,INLAND


In [42]:
x_test = test.drop(["Unnamed: 0"],axis=1)
x_test["ocean_proximity"] = x_test["ocean_proximity"].replace(ocean_proximity)
x_test = pd.DataFrame(imputer.fit_transform(x_test), columns=x_test.columns)
x_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-118.08,33.83,30.0,2188.0,556.0,2727.0,525.0,2.7759,0.0
1,-121.54,38.50,15.0,6093.0,1051.0,2415.0,997.0,4.2075,1.0
2,-120.01,39.26,26.0,1930.0,391.0,307.0,138.0,2.6023,1.0
3,-118.45,34.19,37.0,1073.0,254.0,739.0,253.0,2.4667,0.0
4,-118.32,34.03,47.0,1082.0,198.0,455.0,193.0,3.0132,0.0
...,...,...,...,...,...,...,...,...,...
4123,-119.33,36.19,27.0,418.0,163.0,332.0,141.0,1.0714,1.0
4124,-122.15,37.35,23.0,3814.0,485.0,1344.0,464.0,12.9792,3.0
4125,-122.33,37.97,45.0,1982.0,376.0,1179.0,398.0,3.5463,3.0
4126,-120.53,39.79,18.0,1234.0,266.0,543.0,201.0,2.5156,1.0


In [43]:
y_pred = gbr_pipe.predict(x_test)
y_pred

array([155066.64042832, 195818.86868194, 131744.37209652, ...,
       124290.74626267,  84260.28370214, 292774.88638277])

In [44]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission

Unnamed: 0,median_house_value


In [45]:
sample_submission["median_house_value"]=pd.Series(y_pred)
sample_submission

Unnamed: 0,median_house_value
0,155066.640428
1,195818.868682
2,131744.372097
3,185207.847221
4,181325.988470
...,...
4123,92543.981125
4124,525387.881976
4125,124290.746263
4126,84260.283702


In [46]:
sample_submission.to_csv('sample_submission1.csv')