In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest, GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.feature_selection import SelectKBest, f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from regressors import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load two versions of the training data. The section headings further down
# treat `train` as the dummy-encoded frame and `train_ord` as the frame without
# dummies. NOTE(review): the file names suggest missing values were already
# filled upstream -- confirm against the notebook that wrote these CSVs.
train = pd.read_csv('train_filled_na.csv')
train_ord = pd.read_csv('train_filled_na_ord.csv')

In [3]:
# Remove the 'Id' column (presumably a row identifier -- confirm) from both frames.
# Re-binding instead of inplace=True, plus errors='ignore', keeps this cell
# re-runnable: a second execution no longer raises KeyError.
train = train.drop(columns='Id', errors='ignore')
train_ord = train_ord.drop(columns='Id', errors='ignore')

In [4]:
# Columns whose every entry equals 0 are collected and removed from `train`.
allZeroCols = train.columns[train.eq(0).all(axis=0)].tolist()
train = train.drop(columns=allZeroCols)

# Outlier Detection

## Train with dummy variables

In [5]:
# Fit an isolation forest on the whole dummy-encoded frame (every column that
# `train` holds at this point) and label each row.
# random_state is fixed so that the rows removed below are the same on every
# Restart & Run All; without it the filtered data set changes between runs.
iso = IsolationForest(contamination=0.1, random_state=42)
yhat = iso.fit_predict(train)

In [6]:
# Boolean row mask: True wherever the isolation forest label is not -1.
mask = yhat != -1

In [7]:
# Keep only the rows selected by the boolean mask (all columns retained).
train = train[mask]

In [8]:
# Feature matrix and target taken from the outlier-filtered dummy-encoded frame.
# The model cells further down call `.fit(X, y)` / `.fit(X_Scaled, y)` before any
# other cell assigns these names, so they have to be defined here for the
# notebook to run from a fresh kernel.
X = train.loc[:, train.columns != 'SalePrice']
y = train['SalePrice']

## Train without dummy variables

In [9]:
# Same outlier labelling as above, applied to the frame without dummies.
# random_state is fixed so the set of removed rows is reproducible across runs.
iso = IsolationForest(contamination=0.1, random_state=42)
yhat = iso.fit_predict(train_ord)

In [10]:
# Boolean row mask for `train_ord`: True wherever the label is not -1.
# This re-binds `mask`, replacing the one computed for `train`.
mask = yhat != -1

In [11]:
# Keep only the rows of `train_ord` selected by the boolean mask.
train_ord = train_ord[mask]

In [12]:
# X = train_ord.loc[:, train_ord.columns != 'SalePrice']
# y = train_ord['SalePrice']

# Feature Selection

## Select K Best

In [13]:
def get_k_best(k, X, y):
    """Rank the columns of ``X`` by univariate F-score and plot the best ``k``.

    Fits ``SelectKBest(f_regression, k=k)`` on ``X`` and ``y``, prints every
    ``(column, score)`` pair sorted by descending score, and draws a bar chart
    of the ``k`` highest-scoring columns.

    Parameters
    ----------
    k : int or 'all'
        Number of features to select and to plot. ``'all'`` plots every column.
    X : pandas.DataFrame
        Candidate features; its column labels are used as bar labels.
    y : array-like
        Regression target passed to ``SelectKBest.fit``.

    Returns
    -------
    None
    """
    k_best = SelectKBest(f_regression, k=k).fit(X, y)

    sorted_coef_importance = sorted(
        zip(X.columns, k_best.scores_), key=lambda pair: pair[1], reverse=True
    )
    print(sorted_coef_importance)

    # SelectKBest accepts k='all'; a string cannot be used as a slice bound.
    n_shown = len(sorted_coef_importance) if k == 'all' else k
    top = sorted_coef_importance[:n_shown]

    f, ax = plt.subplots(figsize=(13, 9))
    # x/y are passed by keyword: recent seaborn releases reject them positionally.
    sns.barplot(x=[name for name, _ in top], y=[score for _, score in top], ax=ax)
    plt.xticks(rotation=90)

In [None]:
# get_k_best takes (k, X, y): pass every column except the target as X and the
# target column as y. The previous call supplied only the target and no y.
get_k_best(50, train.loc[:, train.columns != 'SalePrice'], train['SalePrice'])

## Model Feature Selection

In [None]:
def getFeatureImportance(num, cols, features_importances_, sort):
    """Pair feature names with importances, sort by magnitude, and plot the first ``num``.

    Parameters
    ----------
    num : int
        How many leading entries of the sorted list are drawn as bars.
    cols : sequence of str
        Feature names, in the same order as ``features_importances_``.
    features_importances_ : sequence of float
        Importances or coefficients, one per feature.
    sort : str
        ``'ascending'`` sorts by absolute value from smallest to largest; any
        other value sorts from largest to smallest.

    Returns
    -------
    list of (name, value) tuples
        The full sorted list (not only the plotted part). The bars show the
        signed values; only the ordering uses the absolute value.
    """
    cols = list(cols)
    features_importances_ = list(features_importances_)
    if len(cols) != len(features_importances_):
        # zip() truncates silently, which would attach values to the wrong names.
        import warnings
        warnings.warn(
            "getFeatureImportance received %d names for %d values; "
            "labels may be misaligned" % (len(cols), len(features_importances_))
        )

    descending = sort != 'ascending'
    sorted_importance = sorted(
        zip(cols, features_importances_),
        key=lambda pair: abs(pair[1]),
        reverse=descending,
    )

    top = sorted_importance[:num]
    f, ax = plt.subplots(figsize=(13, 9))
    # x/y are passed by keyword: recent seaborn releases reject them positionally.
    sns.barplot(x=[name for name, _ in top], y=[value for _, value in top], ax=ax)
    plt.xticks(rotation=90)

    return sorted_importance

### Random Forest

In [None]:
# Grid-search a random forest over the number of features tried per split.
# 1.0 means "use all features", which is what 'auto' meant for regressors;
# the 'auto' string is no longer accepted by current scikit-learn releases.
# random_state is fixed so the CV scores and importances are reproducible.
model = RandomForestRegressor(random_state=42)
params = {'n_estimators':[100], 'max_features':[1.0, 'sqrt']}
gridRf = GridSearchCV(model, param_grid=params, cv=5)
gridRf.fit(X, y)

In [None]:
gridRf.cv_results_

In [None]:
gridRf.best_params_

In [None]:
gridRf.best_score_

In [None]:
# Label the importances with the feature columns only. `train.columns` also
# contains 'SalePrice', which shifts every name after it by one position
# relative to the importances.
getFeatureImportance(50, train.columns[train.columns != 'SalePrice'], gridRf.best_estimator_.feature_importances_, '')

### Ridge Regression

In [None]:
# Scale every column except the target, then wrap the resulting array in a
# DataFrame that carries the original feature labels.
scaler = StandardScaler()
X_Scaled = pd.DataFrame(
    scaler.fit_transform(train.loc[:, train.columns != 'SalePrice']),
    columns=train.columns[train.columns != 'SalePrice'],
)

In [None]:
# 5-fold grid search over the Ridge penalty strength on the scaled features.
params = {
    'alpha': [0.1, 1, 10, 50, 75, 100, 1000, 2000],
}
model = Ridge()
gridRidge = GridSearchCV(model, cv=5, param_grid=params)
gridRidge.fit(X_Scaled, y)

In [None]:
gridRidge.cv_results_

In [None]:
gridRidge.best_params_

In [None]:
gridRidge.best_score_

In [None]:
gridRidge.best_estimator_.coef_

In [None]:
# The coefficients line up with the columns the model was fitted on (X_Scaled),
# not with `train.columns`, which still contains 'SalePrice'.
getFeatureImportance(50, X_Scaled.columns, gridRidge.best_estimator_.coef_,'descending')

### Lasso Regression

In [None]:
# 5-fold grid search over the Lasso penalty strength on the scaled features.
params = {
    'alpha': [0.1, 1, 10, 50, 75, 100, 1000, 2000],
    'max_iter': [1000],
}
model = Lasso()
gridLasso = GridSearchCV(model, cv=5, param_grid=params)
gridLasso.fit(X_Scaled, y)

In [None]:
gridLasso.cv_results_

In [None]:
gridLasso.best_params_

In [None]:
gridLasso.best_score_

In [None]:
# The coefficients line up with the columns the model was fitted on (X_Scaled),
# not with `train.columns`, which still contains 'SalePrice'.
getFeatureImportance(50, X_Scaled.columns, gridLasso.best_estimator_.coef_, '')

## Value Counts

In [None]:
train_org = pd.read_csv('train.csv')

In [None]:
def getValueCounts(cols, data, figsize=(10, 80)):
    """Draw one bar chart of value frequencies per column, two charts per row.

    Parameters
    ----------
    cols : iterable of str
        Column labels of ``data`` to plot.
    data : pandas.DataFrame
        Frame providing the columns; ``value_counts()`` is taken per column.
    figsize : tuple of float, optional
        Size of the whole figure; defaults to the previously hard-coded (10, 80).

    Returns
    -------
    None
    """
    cols = list(cols)
    if not cols:
        return  # nothing to plot; a grid with zero rows cannot be created

    nRows = (len(cols) + 1) // 2
    # squeeze=False keeps `axes` two-dimensional even when there is one row,
    # so the grid can always be flattened and walked in reading order.
    f, axes = plt.subplots(nRows, 2, figsize=figsize, squeeze=False)
    panels = axes.ravel()

    for ax, col in zip(panels, cols):
        value_counts = data[col].value_counts()
        # Only the counts are plotted; `data=` is not passed because newer
        # seaborn rejects vectors whose length differs from that of `data`.
        sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)
        ax.set(title=col, xlabel=col, ylabel='count')

    # With an odd number of columns the last panel has nothing to show.
    for ax in panels[len(cols):]:
        ax.set_visible(False)

    f.tight_layout()

In [None]:
# Categorical columns = every object-dtype column plus 'MSSubClass' (if present).
# Index.union is used explicitly: the `|` operator on Index objects is no longer
# a set union in current pandas. Like the old `|`, union returns sorted labels.
cat_cols = train_org.columns[train_org.dtypes == 'object'].union(
    train_org.columns[train_org.columns == 'MSSubClass']
)
getValueCounts(cat_cols, train_org)

In [None]:
# Original feature names to remove from `train` in the next cell.
# NOTE(review): presumably picked from the value-count plots above -- the
# selection criterion is not recorded in the notebook; confirm and document it.
colsToDrop = ['BldgType','BsmtCond','BsmtFinType2','CentralAir','Condition1','Condition2','Electrical',
              'ExterCond','Functional','GarageCond','GarageQual','Heating','LandContour','LandSlope',
              'MiscFeature','PavedDrive','RoofMatl','Street','Utilities','SaleType','MSSubClass']

In [None]:
# Drop each listed feature: either a column with exactly that name or its dummy
# columns named '<feature>_<level>'.
# The previous unanchored regex match also removed unrelated columns that merely
# contain a listed name (e.g. 'Heating' matched 'HeatingQC...').
# NOTE(review): assumes the dummies were created with the default '_' separator
# -- confirm against the notebook that produced train_filled_na.csv.
dummyPrefixes = tuple(col + '_' for col in colsToDrop)
keepCols = [
    c for c in train.columns
    if c not in colsToDrop and not str(c).startswith(dummyPrefixes)
]
train = train[keepCols]


In [None]:
X = train.loc[:, train.columns != 'SalePrice']

In [None]:
# Re-scale the reduced feature set and re-bind the target so both reflect the
# current contents of `train`.
y = train['SalePrice']
scaler = StandardScaler()
X_Scaled = pd.DataFrame(
    scaler.fit_transform(train.loc[:, train.columns != 'SalePrice']),
    columns=train.columns[train.columns != 'SalePrice'],
)

## Random Forest on Subset

In [None]:
# Same random-forest grid search, now on the reduced feature set.
# 1.0 means "use all features", which is what 'auto' meant for regressors;
# the 'auto' string is no longer accepted by current scikit-learn releases.
# random_state is fixed so the score is comparable between runs.
model = RandomForestRegressor(random_state=42)
params = {'n_estimators':[100], 'max_features':[1.0, 'sqrt']}
gridRf = GridSearchCV(model, param_grid=params, cv=5)
gridRf.fit(X, y)

In [None]:
gridRf.best_score_

## Ridge on Subset

In [None]:
# 5-fold grid search over the Ridge penalty strength on the reduced, scaled features.
params = {
    'alpha': [0.1, 1, 10, 50, 75, 100, 1000, 2000],
    'max_iter': [1000],
}
model = Ridge()
gridRidge = GridSearchCV(model, cv=5, param_grid=params)
gridRidge.fit(X_Scaled, y)

In [None]:
gridRidge.best_score_

## Lasso on Subset

In [None]:
# 5-fold grid search over the Lasso penalty strength on the reduced, scaled features.
params = {
    'alpha': [0.1, 1, 10, 50, 75, 100, 1000, 2000],
    'max_iter': [1000],
}
model = Lasso()
gridLasso = GridSearchCV(model, cv=5, param_grid=params)
gridLasso.fit(X_Scaled, y)

In [None]:
gridLasso.best_score_

## Correlation between variables

### Correlation Matrix

In [None]:
corr = X.corr()

In [None]:
def get_redundant_pairs(df):
    '''Return the set of (column_i, column_j) label pairs with j <= i,
    i.e. the diagonal and lower triangle of a column-by-column matrix.'''
    names = df.columns
    width = df.shape[1]
    return {
        (names[row], names[col])
        for row in range(width)
        for col in range(row + 1)
    }

def get_top_abs_correlations(df, n=5):
    '''Return the n largest absolute pairwise correlations between columns of df.

    df.corr() is computed here, so df must hold the raw columns. Diagonal and
    lower-triangle pairs are removed so each pair appears once. The result is
    a Series indexed by (column, column), sorted in descending order.'''
    redundant = get_redundant_pairs(df)
    pairwise = df.corr().abs().unstack()
    ranked = pairwise.drop(labels=redundant).sort_values(ascending=False)
    return ranked.iloc[:n]

print("Top Absolute Correlations")
# get_top_abs_correlations calls .corr() itself, so it must receive the feature
# frame. Passing the already-computed `corr` matrix ranked correlations between
# correlation profiles rather than between the features.
mostCorrelatedFeat = get_top_abs_correlations(X, 1000)


In [None]:
mostCorrelatedFeat.head(50)

In [None]:
# Remove every column whose label matches one of the listed patterns.
colsToDrop = ['Exterior2nd']
for col in colsToDrop:
    remaining = train.columns.drop(train.filter(regex=col).columns.tolist())
    train = train.loc[:, remaining]

In [None]:
X = train.loc[:, train.columns != 'SalePrice']

### Variance Inflation Factor

In [None]:
# Append an all-ones column named 'constant' to the feature frame used in the
# VIF cells below.
# assign() returns a new frame, so this does not write into a slice of `train`
# (which may trigger SettingWithCopyWarning) and is safe to re-run.
constant = np.ones((X.shape[0], 1))
X = X.assign(constant=constant.ravel())

In [None]:
# One (column, VIF) pair per column of X. Materialised as a list: a bare zip
# iterator is exhausted after the first display and then shows up empty.
vifs = list(zip(X.columns, [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]))

In [None]:
list(vifs)

In [None]:
X[['BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF']]

In [None]:
X[['GrLivArea','1stFlrSF','2ndFlrSF','LowQualFinSF']]

In [None]:
train = train.drop(['GrLivArea','TotalBsmtSF','OverallQual','GarageArea','PoolArea','PoolQC'], axis=1)

In [None]:
X = train.loc[:,train.columns != 'SalePrice']

### Variance Inflation Factor on Subset

In [None]:
# X was rebuilt from `train` in the previous cell and no longer carries the
# all-ones 'constant' column that the first VIF run used. It is added back here
# so both runs are computed the same way and can be compared.
# Materialised as a list so the result survives being displayed more than once.
vifExog = X.assign(constant=1.0)
vifs = list(zip(vifExog.columns, [variance_inflation_factor(vifExog.values, i) for i in range(vifExog.shape[1])]))

In [None]:
list(vifs)

### Correlation Matrix on Subset

In [None]:
corr = X.corr()

In [None]:
# Pass the feature frame, not the precomputed `corr` matrix: the helper calls
# .corr() itself, so passing `corr` would correlate the correlations.
mostCorrelatedFeat = get_top_abs_correlations(X, 1000)
mostCorrelatedFeat.head(50)

In [None]:
# Re-scale the final feature set and re-bind the target so both reflect the
# current contents of `train`.
y = train['SalePrice']
scaler = StandardScaler()
X_Scaled = pd.DataFrame(
    scaler.fit_transform(train.loc[:, train.columns != 'SalePrice']),
    columns=train.columns[train.columns != 'SalePrice'],
)

In [None]:
# 5-fold grid search over the Lasso penalty strength on the final, scaled features.
params = {
    'alpha': [0.1, 1, 10, 50, 75, 100, 1000, 2000],
    'max_iter': [1000],
}
model = Lasso()
gridLasso = GridSearchCV(model, cv=5, param_grid=params)
gridLasso.fit(X_Scaled, y)


In [None]:
list(zip(X_Scaled.columns,gridLasso.best_estimator_.coef_))

In [None]:
stats.summary(gridLasso.best_estimator_, X_Scaled, y)