In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.feature_selection import SelectKBest, f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor, OLSInfluence
from scipy.stats import kurtosis, skew, boxcox
from regressors import stats
from numpy import mean
from numpy import std
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Load the pre-processed training and test tables from the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_filled_na_ord.csv', 'test_filled_na_ord.csv')
)

In [None]:
# Remove the 'Id' column from both frames so it is not used as a feature.
train = train.drop(columns=['Id'])
test = test.drop(columns=['Id'])

# Outlier Detection

In [None]:
# Standardise every predictor (all columns except the target) and keep the
# result as a DataFrame with the original column names.
predictors = train.drop(columns='SalePrice')
scaler = StandardScaler()
X_Scaled = pd.DataFrame(
    scaler.fit_transform(predictors),
    columns=predictors.columns,
)
y = train['SalePrice']

In [None]:
# Fit an ordinary least squares model on the scaled predictors plus an
# intercept column; the fitted results feed the influence diagnostics below.
X_sm = sm.add_constant(X_Scaled)
model = sm.OLS(endog=list(y), exog=X_sm)
results = model.fit()

In [None]:
# cooks_distance yields a pair: the per-observation distances (C) and the
# associated p-values (P).
influence = OLSInfluence(results)
C, P = influence.cooks_distance

In [None]:
# Observations whose Cook's distance exceeds 1 are treated as outliers.
#
# C may arrive as a plain ndarray or as a labelled Series depending on the
# statsmodels version. Filtering a bare ndarray and wrapping it in a DataFrame
# would produce a fresh 0..k-1 index, and the later train.drop(outliers.index)
# would then remove the wrong rows. Re-attaching train's row labels explicitly
# makes the result correct in both cases. This relies on train still holding
# exactly the rows the OLS model was fitted on (a length mismatch raises).
cooks_d = pd.Series(np.asarray(C), index=train.index, name="Cook's Distance")
outliers = cooks_d[cooks_d > 1].to_frame()

In [None]:
# data_mean, data_std = mean(train), std(train)

In [None]:
# cut_off = data_std * 3
# lower, upper = data_mean - cut_off, data_mean + cut_off

# outliers = {}
# for col in ['LotFrontage','LotArea','MasVnrArea','YearBuilt','YearRemodAdd','BsmtFinSF1','BsmtFinSF2',
#             'BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','GrLivArea','GarageArea','WoodDeckSF',
#            'OpenPorchSF','EnclosedPorch']:
#     m = train[col].between(lower[col], upper[col], inclusive=True)
#     outliers[col] = train[~m]
    
    

In [None]:
# outlier_indices = set()
# for col in outliers:
#     print(col, len(outliers[col]))
#     for idx in list(outliers[col].index):
#         #print(idx)
#         outlier_indices.add(idx)
        

In [None]:
# Remove the rows flagged as outliers from the training set.
train = train.drop(index=outliers.index)

# Dropping Columns with all Zeroes

In [None]:
# Columns that are zero in every remaining training row are removed from both
# the training and the test frame.
allZeroCols = train.columns[train.eq(0).all(axis=0)].to_list()
train = train.drop(columns=allZeroCols)
test = test.drop(columns=allZeroCols)

# Model Testing

In [None]:
def getFeatureImportance(num, cols, features_importances_, sort):
    """Plot the leading features by absolute importance and return the full ranking.

    Parameters
    ----------
    num : int
        Number of entries, taken from the head of the sorted list, to draw as bars.
    cols : iterable of str
        Feature names, aligned element-wise with ``features_importances_``.
    features_importances_ : iterable of float
        One importance value per feature.
    sort : str
        ``'ascending'`` orders by smallest absolute value first; any other
        value orders by largest absolute value first.

    Returns
    -------
    list of tuple
        ``(name, importance)`` pairs for every feature, in sorted order.
    """
    ranked = sorted(
        zip(cols, features_importances_),
        key=lambda pair: abs(pair[1]),
        reverse=(sort != 'ascending'),
    )
    shown = ranked[:num]

    fig, ax = plt.subplots(figsize=(13, 9))
    # x and y must be passed by keyword: seaborn >= 0.12 no longer accepts the
    # data vectors positionally. The keyword form also works on older releases.
    sns.barplot(
        x=[name for name, _ in shown],
        y=[value for _, value in shown],
        ax=ax,
    )
    # Feature names are long; rotate them so they do not overlap.
    ax.tick_params(axis='x', labelrotation=90)

    return ranked

In [None]:
# Separate the target column from the predictors.
y = train['SalePrice']
X = train.drop(columns='SalePrice')

## Random Forest

In [None]:
# Random forest evaluated with 5-fold cross-validation over a single-point grid.
#
# The split criterion is intentionally left at its default (mean squared
# error). The old alias 'mse' was removed in scikit-learn 1.2 and the new name
# 'squared_error' does not exist before 1.0, so omitting the key is the only
# spelling that runs on every version while keeping the same objective.
#
# random_state is fixed so that the fitted forest, its scores and its feature
# importances are reproducible after a kernel restart.
model = RandomForestRegressor(random_state=0)
params = {'n_estimators': [1000], 'max_features': ['sqrt']}
gridRf = GridSearchCV(model, param_grid=params, cv=5)
gridRf.fit(X, y)
print(gridRf.cv_results_)
print(gridRf.best_params_)
print(gridRf.best_score_)

In [None]:
# Show the 30 largest random-forest importances (by absolute value).
getFeatureImportance(
    num=30,
    cols=X.columns,
    features_importances_=gridRf.best_estimator_.feature_importances_,
    sort='descending',
)

In [None]:
# Score of the refit best estimator on the same data it was trained on
# (in-sample, so not comparable to the cross-validated best_score_).
gridRf.score(X=X, y=y)

## Gradient Boosting Regressor

In [None]:
# Gradient boosting tuned with 5-fold cross-validation over the number of
# trees, tree depth and row subsampling rate.
#
# The loss is intentionally left at its default (least squares). The old
# alias 'ls' was removed in scikit-learn 1.2 and the new name 'squared_error'
# does not exist before 1.0, so omitting the key keeps the same objective on
# every version.
#
# random_state is fixed because subsample < 1 and max_features='sqrt' both
# introduce randomness; without a seed the selected parameters and scores can
# change from run to run.
model = GradientBoostingRegressor(random_state=0)
params = {'n_estimators': [500, 600, 700],
          'max_depth': [1, 2, 3, 4, 5],
          'max_features': ['sqrt'],
          'subsample': [1.0, 0.9, 0.8, 0.7]}

gridGbr = GridSearchCV(model, param_grid=params, cv=5)
gridGbr.fit(X, y)
print(gridGbr.cv_results_)
print(gridGbr.best_params_)
print(gridGbr.best_score_)

In [None]:
# Show the 30 largest gradient-boosting importances (by absolute value).
getFeatureImportance(
    num=30,
    cols=X.columns,
    features_importances_=gridGbr.best_estimator_.feature_importances_,
    sort='descending',
)

In [None]:
# Score of the refit best estimator on the same data it was trained on
# (in-sample, so not comparable to the cross-validated best_score_).
gridGbr.score(X=X, y=y)

# Test Predictions

In [None]:
# Build the submission table: one 'SalePrice' prediction per test row, with an
# index named 'Id' that starts at the hard-coded offset 1461 (the test frame's
# own 'Id' column was dropped earlier; presumably 1461 is its first value).
gridGbr_predictions = pd.DataFrame(gridGbr.predict(test), columns=['SalePrice'])
gridGbr_predictions.index = (gridGbr_predictions.index + 1461).rename('Id')


In [None]:
#gridGbr_predictions.to_csv('submission12.csv')