In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder

from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_log_error

from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.neighbors import KNeighborsClassifier


from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor


import warnings
# Silence every warning for the rest of the session (no category/module filter is given).
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# estimators fitted below -- consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

from sklearn import set_config

# Display estimators and pipelines as HTML diagrams when they are the output of a cell.
set_config(display='diagram')

import datetime


# Read CSV

In [3]:
# `data` keeps the training set exactly as read from disk; every preprocessing
# step below works on the independent (deep) copy `df`.
data = pd.read_csv('train.csv')
df = data.copy(deep=True)

# Preprocessing

In [4]:
# --- Ordinal-encode Neighborhood ------------------------------------------------
# Rank neighborhoods by their mean sale price (cheapest first) and use that rank
# as the ordinal code, so the feature becomes numeric and ordered by price level.
# NOTE(review): the ranking is derived from the target over the whole training
# set, i.e. before cross-validation splits it, so the CV folds are not fully
# independent of it -- confirm this leakage is acceptable.
encoted_column = (
    df[['Neighborhood', 'SalePrice']]
    .groupby('Neighborhood', as_index=False)
    .mean()
    .sort_values(by='SalePrice')
    .reset_index(drop=True)
)
ord_col = encoted_column['Neighborhood'].tolist()
df['Neighborhood'] = OrdinalEncoder(categories=[ord_col]).fit_transform(df[['Neighborhood']])

# --- Drop weakly correlated numeric features -------------------------------------
# Columns whose absolute Pearson correlation with SalePrice is below the threshold
# are removed. The correlation is computed on the numeric columns only, selected
# explicitly: DataFrame.corr() on a frame that still contains string columns
# raises on pandas >= 2.0, where numeric_only no longer defaults to True.
MIN_ABS_CORR = 0.2
df_corr = df.select_dtypes(include=[np.number]).corr().abs()
corrSalesprice = df_corr.sort_values('SalePrice', ascending=False)
weakly_correlated = corrSalesprice['SalePrice'] < MIN_ABS_CORR
# NaN correlations compare as False and are therefore kept, as before.
num_less20 = list(weakly_correlated[weakly_correlated].index)
df = df.drop(num_less20, axis=1)

# Metrics

In [5]:
def custom_metric(y, y_pred):
    """Return the root mean squared logarithmic error (RMSLE) of ``y_pred`` against ``y``."""
    msle = mean_squared_log_error(y, y_pred)
    return np.sqrt(msle)


# Lower RMSLE is better, so the scorer is declared with greater_is_better=False;
# scores produced through it are therefore the negated RMSLE.
custom_score = make_scorer(custom_metric, greater_is_better=False)

# Modeling

In [7]:
# Seed for the stochastic estimators so that cross-validation scores and the
# final submission are reproducible across kernel restarts.
RANDOM_STATE = 42

# Features / target split.
X = df.drop('SalePrice', axis = 1)
y = df.SalePrice

# Column groups are taken from the dtypes of X: everything non-numeric is
# treated as categorical.
cat_cols = X.select_dtypes(exclude = [np.number])
num_cols = X.select_dtypes(include = [np.number])

# Categorical: fill gaps with the mode, then one-hot encode; categories unseen
# during fit are encoded as all-zero rows instead of raising.
cat_pipe = make_pipeline(SimpleImputer(strategy = 'most_frequent'), OneHotEncoder(handle_unknown="ignore"))
# Alternative encoding kept for reference:
#cat_pipe = make_pipeline(SimpleImputer(strategy = 'most_frequent'), OrdinalEncoder(handle_unknown = 'use_encoded_value',unknown_value = -1))
# Numeric: fill gaps with the median, then scale with RobustScaler.
num_pipe = make_pipeline(SimpleImputer(strategy = 'median'), RobustScaler())

preprocessing = make_column_transformer((cat_pipe, cat_cols.columns), (num_pipe, num_cols.columns))


# Base learners of the stack.
model_1 = RidgeCV()
model_2 = LassoCV()
# Was unseeded: every run grew a different forest and gave different scores.
model_3 = RandomForestRegressor(random_state=RANDOM_STATE)
model_4 = SVR()


# The names given here are the keys under which each base learner's
# hyper-parameters are exposed (e.g. 'RandomForestRegressor__n_estimators').
model_final = StackingRegressor(
    [
        ('RidgeCV', model_1),
        ('LassoCV', model_2),
        ('RandomForestRegressor', model_3),
        ('SVR', model_4),
    ],
    final_estimator = RidgeCV(),
)

# Full estimator: preprocessing followed by the stacked regressor.
model = make_pipeline(preprocessing, model_final)

# Cross Validation

In [8]:
# 5-fold cross-validated RMSLE. custom_score was built with
# greater_is_better=False, so the values (and their mean) are negated.
rmsle_cv_scores = cross_val_score(model, X, y, cv=5, scoring=custom_score, error_score="raise")
rmsle_cv_scores.mean()

-0.13551688421347868

In [9]:
# 5-fold cross-validated coefficient of determination (R^2) of the same pipeline.
r2_cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2', error_score="raise")
r2_cv_scores.mean()

0.8657478449258595

In [None]:
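# Hyper-parameter search, kept for reference (disabled).
# `model` is make_pipeline(preprocessing, model_final), so the forest is reached
# through the 'stackingregressor' step and the 'RandomForestRegressor' name given
# to it inside the StackingRegressor -- not through 'randomforestregressor__...'.
#params = {
 #       'stackingregressor__RandomForestRegressor__n_estimators': [50,100,150],
 #       'stackingregressor__RandomForestRegressor__bootstrap': [True, False]
 #       }

#grid = GridSearchCV(model, param_grid = params, cv=5, scoring = custom_score)

#grid.fit(X, y)

#print(grid.best_params_)
#print(f"Best Score = {grid.best_score_}")
#grid_model = grid.best_estimator_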

# Final Train

In [10]:
# Refit the whole pipeline (preprocessing + stacked regressor) on every
# training row; the fitted estimator is what the submission cell predicts with.
model_fit = model.fit(X=X, y=y)

# Preprocessing Test

In [13]:
def transform(df, ord_col, num_less20):
    """Apply the training-time preprocessing to a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Neighborhood`` column and every column named in
        ``num_less20``. It is NOT modified; a transformed copy is returned.
    ord_col : list
        Neighborhood categories in encoding order: the value at position ``i``
        is encoded as ``float(i)`` (the same codes an ``OrdinalEncoder`` with
        ``categories=[ord_col]`` produces).
    num_less20 : list
        Names of the columns to drop.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``Neighborhood`` replaced by its float code and the
        ``num_less20`` columns removed.

    Raises
    ------
    ValueError
        If ``Neighborhood`` holds a value (including a missing value) that is
        not listed in ``ord_col``.
    """
    # Fixed lookup built from the training categories: nothing is re-fitted on
    # the incoming data.
    codes = {category: float(position) for position, category in enumerate(ord_col)}

    is_known = df['Neighborhood'].isin(list(codes))
    if not is_known.all():
        unknown = sorted(set(df.loc[~is_known, 'Neighborhood'].astype(str)))
        raise ValueError(f"Found unknown Neighborhood categories {unknown}")

    # Work on a copy so the caller's frame keeps its original Neighborhood values.
    encoded = df.copy()
    encoded['Neighborhood'] = df['Neighborhood'].map(codes).astype(float)

    # Drop the features that were removed from the training frame.
    return encoded.drop(num_less20, axis=1)
    
# Read the test set and run it through `transform` with the category order and
# the dropped-column list that were computed on the training data.
X_test = pd.read_csv('test.csv')
X_test1 = transform(df=X_test, ord_col=ord_col, num_less20=num_less20)

# Submission

In [14]:
# Ids come from the test frame as loaded (X_test1 may no longer carry the Id
# column); predictions come from the transformed frame.
test_predictions = model_fit.predict(X_test1)
submission = pd.DataFrame({'Id': X_test['Id'], 'SalePrice': test_predictions})

In [15]:
# Write the two-column submission file without the DataFrame index.
submission_path = 'submission2.csv'
submission.to_csv(submission_path, index=False)