In [311]:
import pandas as pd

In [312]:
train_df = pd.read_csv("train.csv")

In [313]:
train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [314]:
train_df.columns[train_df.isna().any()].to_list()

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [315]:
train_df["diff_yr_sold_built"] = abs(train_df["YrSold"] - train_df["YearBuilt"])

In [289]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [290]:
le = LabelEncoder()
oe = OneHotEncoder()

In [303]:
def label_encode(df=None, columns=None):
    for col in columns:
        df[col] = le.fit_transform(df[col])
        
def impute_categorical_data(df=None, columns=None, column_name=None, strategy="most_frequent", imputer=SimpleImputer):
    si_cat = imputer(strategy=strategy)
    if column_name:
        df[column_name] = si_cat.fit_transform(df[[column_name]])
    else:
        for col in columns:
            df[col] = si_cat.fit_transform(df[[col]])
            
def impute_numerical_data(df=None, columns=None, column_name=None, strategy="mean", imputer=SimpleImputer):
    si_num = imputer(strategy=strategy)
    print(si_num)
    if column_name:
        df[column_name] = si_num.fit_transform(df[[column_name]])
    else:
        for col in columns:
            df[col] = si_num.fit_transform(df[[col]])
    

def get_corr_matrix(df=None):
    return df.corr()

def get_cols_with_threshold_corr(df=None, column_for_corr_check="SalePrice", threshold=0.5):
    corr_matrix = get_corr_matrix(df=df)
    col_list = corr_matrix.keys().drop([column_for_corr_check]).to_list()
    result_col = [col for col in col_list if abs(train_df[column_for_corr_check].corr(train_df[col])) >= threshold]
    return result_col

def scale_data(df=None, columns=None, scaler=StandardScaler, **kwargs):
    if kwargs is not None:
        scl = scaler(**kwargs)
    else:
        scl = scaler()
    for col in columns:
        df[col] = scl.fit_transform(df[[col]])
    

In [292]:
categorical_columns = None
numerical_columns = None

In [293]:
categorical_columns = train_df.columns[train_df.dtypes=="O"].to_list()
numerical_columns = train_df.columns[train_df.dtypes!="O"].to_list()

In [294]:
impute_categorical_data(train_df, columns=categorical_columns, strategy="most_frequent")
impute_numerical_data(train_df, columns=numerical_columns, strategy="median")

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)


In [278]:
corr_matrix = get_corr_matrix(train_df)

In [279]:
x_train = train_df[get_cols_with_threshold_corr(train_df)]
y_train = train_df[["SalePrice"]]

In [178]:
x_train

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageCars,GarageArea,diff_yr_sold_built
0,7.0,2003.0,2003.0,856.0,856.0,1710.0,2.0,8.0,2.0,548.0,5.0
1,6.0,1976.0,1976.0,1262.0,1262.0,1262.0,2.0,6.0,2.0,460.0,31.0
2,7.0,2001.0,2002.0,920.0,920.0,1786.0,2.0,6.0,2.0,608.0,7.0
3,7.0,1915.0,1970.0,756.0,961.0,1717.0,1.0,7.0,3.0,642.0,91.0
4,8.0,2000.0,2000.0,1145.0,1145.0,2198.0,2.0,9.0,3.0,836.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...
1455,6.0,1999.0,2000.0,953.0,953.0,1647.0,2.0,7.0,2.0,460.0,8.0
1456,6.0,1978.0,1988.0,1542.0,2073.0,2073.0,2.0,7.0,2.0,500.0,32.0
1457,7.0,1941.0,2006.0,1152.0,1188.0,2340.0,2.0,9.0,1.0,252.0,69.0
1458,5.0,1950.0,1996.0,1078.0,1078.0,1078.0,1.0,5.0,1.0,240.0,60.0


In [179]:
y_train.values

array([[208500.],
       [181500.],
       [223500.],
       ...,
       [266500.],
       [142125.],
       [147500.]])

In [262]:
scale_data(x_train, columns=x_train.columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [181]:
x_train

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageCars,GarageArea,diff_yr_sold_built
0,0.651479,1.050994,0.878668,-0.459303,-0.793434,0.370333,0.789741,0.912210,0.311725,0.351000,-1.043259
1,-0.071836,0.156734,-0.429577,0.466465,0.257140,-0.482512,0.789741,-0.318683,0.311725,-0.060731,-0.183465
2,0.651479,0.984752,0.830215,-0.313369,-0.627826,0.515013,0.789741,-0.318683,0.311725,0.631726,-0.977121
3,0.651479,-1.863632,-0.720298,-0.687324,-0.521734,0.383659,-1.026041,0.296763,1.650307,0.790804,1.800676
4,1.374795,0.951632,0.733308,0.199680,-0.045611,1.299326,0.789741,1.527656,1.650307,1.698485,-0.944052
...,...,...,...,...,...,...,...,...,...,...,...
1455,-0.071836,0.918511,0.733308,-0.238122,-0.542435,0.250402,0.789741,0.296763,0.311725,-0.060731,-0.944052
1456,-0.071836,0.222975,0.151865,1.104925,2.355701,1.061367,0.789741,0.296763,0.311725,0.126420,-0.150396
1457,0.651479,-1.002492,1.024029,0.215641,0.065656,1.569647,0.789741,1.527656,-1.026858,-1.033914,1.073157
1458,-0.795151,-0.704406,0.539493,0.046905,-0.218982,-0.832788,-1.026041,-0.934130,-1.026858,-1.090059,0.775536


In [263]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [121]:
model = SVR()

In [122]:
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [112]:
categorical_columns = train_df.columns[train_df.dtypes=="O"].to_list()
numerical_columns = train_df.columns[train_df.dtypes!="O"].to_list()

In [113]:
impute_categorical_data(train_df, columns=categorical_columns, strategy="most_frequent")
impute_numerical_data(train_df, columns=numerical_columns, strategy="median")

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)


In [118]:
scale_data(x_train, columns=x_train.columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [264]:
test_df = pd.read_csv("test.csv")
test_df["diff_yr_sold_built"] = abs(test_df["YrSold"] - test_df["YearBuilt"])
categorical_columns_test = test_df.columns[test_df.dtypes=="O"].to_list()
numerical_columns_test = test_df.columns[test_df.dtypes!="O"].to_list()

impute_categorical_data(test_df, columns=categorical_columns_test, strategy="most_frequent")
impute_numerical_data(test_df, columns=numerical_columns_test, strategy="median")

x_test = test_df[get_cols_with_threshold_corr(train_df)]
scale_data(x_test, columns=x_test.columns)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [129]:
model = SVR()
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [130]:
model.predict(x_test)

array([162917.21170412, 162944.57330969, 163221.27061169, ...,
       163054.10030806, 162988.85282156, 163323.27840758])

In [131]:
sub = pd.read_csv("sample_submission.csv")

In [132]:
sub["SalePrice"] = model.predict(x_test)

In [134]:
sub.to_csv("result_svr_default_param.csv", index=False)

In [243]:
model = RandomForestRegressor(n_estimators=15, bootstrap=False, max_depth=7)

In [244]:
model.fit(x_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=7, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=15, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [245]:
from sklearn.metrics import mean_squared_error
model.fit(x_train, y_train)
y_pred = model.predict(x_val)

  


In [246]:
mean_squared_error(y_pred, y_val.values, squared=False)

36026.90615287113

In [247]:
model.score(x_val, y_val)

0.7837358200214446

In [248]:
model.score(x_train, y_train)

0.9367784435102892

In [143]:
sub["SalePrice"] = model.predict(x_test)

In [144]:
sub.to_csv("result_ranf_n_em_1000.csv", index=False)

In [145]:
model.score(x_train, y_train)

0.9791910648339325

In [147]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [216]:
model = RandomForestRegressor(random_state=42)
param_grid = {'n_estimators': [i for i in range(5,50,100)], 
              'max_depth': [i for i in range(1,5)],
              'bootstrap': [True],
             }
gscv = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, verbose=2, n_jobs=-1)

In [183]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=69)

In [265]:
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((1460, 11), (292, 11), (1460, 1), (292, 1))

In [217]:
gscv.fit(x_train, y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  38 out of  40 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    1.7s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_job

In [218]:
gscv.best_params_

{'bootstrap': True, 'max_depth': 4, 'n_estimators': 5}

In [188]:
result = gscv.predict(x_val)

In [157]:
sub["SalePrice"] = result

In [158]:
sub.to_csv("result_ranf_gscv.csv", index=False)

In [266]:
x_train.shape

(1460, 11)

In [316]:

impute_categorical_data(train_df, columns=train_df.columns[train_df.dtypes=="O"].to_list(), strategy="most_frequent")
impute_numerical_data(train_df, columns=train_df.columns[train_df.dtypes!="O"].to_list(), strategy="median")
label_encode(df=train_df, columns=train_df.columns[train_df.dtypes=="O"].to_list())
y_train = train_df[["SalePrice"]]
x_train = train_df[train_df.keys().drop(["SalePrice"])]
model = RandomForestRegressor(n_estimators=100, n_jobs=-1)
model.fit(x_train, y_train)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)


  import sys


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [319]:
score_data = list(zip(x_train.columns, model.feature_importances_))
for feature, score in sorted(score_data, key=operator.itemgetter(1), reverse=True):
    print("feature: {}, importance: {}".format(feature, score))

feature: OverallQual, importance: 0.5851911466237752
feature: GrLivArea, importance: 0.10758505057470152
feature: TotalBsmtSF, importance: 0.0419994249993406
feature: 2ndFlrSF, importance: 0.036920535867351785
feature: BsmtFinSF1, importance: 0.028477528332404318
feature: 1stFlrSF, importance: 0.020777247578010544
feature: GarageCars, importance: 0.019615792480693253
feature: GarageArea, importance: 0.014453918653861501
feature: LotArea, importance: 0.013256918475184891
feature: Neighborhood, importance: 0.007878778410371835
feature: TotRmsAbvGrd, importance: 0.007491242763359154
feature: YearBuilt, importance: 0.00662472697895452
feature: YearRemodAdd, importance: 0.006067828153246756
feature: diff_yr_sold_built, importance: 0.0057193973094600056
feature: LotFrontage, importance: 0.005542108523302345
feature: MasVnrArea, importance: 0.005412010435908252
feature: BsmtUnfSF, importance: 0.004666175241944078
feature: OverallCond, importance: 0.004387240605106141
feature: WoodDeckSF, impo

In [302]:
x_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,diff_yr_sold_built
0,1.0,60.0,RL,65.0,8450.0,Pave,Grvl,Reg,Lvl,AllPub,...,0.0,Gd,MnPrv,Shed,0.0,2.0,2008.0,WD,Normal,5.0
1,2.0,20.0,RL,80.0,9600.0,Pave,Grvl,Reg,Lvl,AllPub,...,0.0,Gd,MnPrv,Shed,0.0,5.0,2007.0,WD,Normal,31.0
2,3.0,60.0,RL,68.0,11250.0,Pave,Grvl,IR1,Lvl,AllPub,...,0.0,Gd,MnPrv,Shed,0.0,9.0,2008.0,WD,Normal,7.0
3,4.0,70.0,RL,60.0,9550.0,Pave,Grvl,IR1,Lvl,AllPub,...,0.0,Gd,MnPrv,Shed,0.0,2.0,2006.0,WD,Abnorml,91.0
4,5.0,60.0,RL,84.0,14260.0,Pave,Grvl,IR1,Lvl,AllPub,...,0.0,Gd,MnPrv,Shed,0.0,12.0,2008.0,WD,Normal,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,60.0,RL,62.0,7917.0,Pave,Grvl,Reg,Lvl,AllPub,...,0.0,Gd,MnPrv,Shed,0.0,8.0,2007.0,WD,Normal,8.0
1456,1457.0,20.0,RL,85.0,13175.0,Pave,Grvl,Reg,Lvl,AllPub,...,0.0,Gd,MnPrv,Shed,0.0,2.0,2010.0,WD,Normal,32.0
1457,1458.0,70.0,RL,66.0,9042.0,Pave,Grvl,Reg,Lvl,AllPub,...,0.0,Gd,GdPrv,Shed,2500.0,5.0,2010.0,WD,Normal,69.0
1458,1459.0,20.0,RL,68.0,9717.0,Pave,Grvl,Reg,Lvl,AllPub,...,0.0,Gd,MnPrv,Shed,0.0,4.0,2010.0,WD,Normal,60.0


In [318]:
x_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive