# Ames Housing Machine Learning: Predicting Home Prices

###  Importing Necessaty Libraries to be Used For EDA and Modeling

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import missingno as msno
from pandas.plotting import scatter_matrix

In [None]:
pd.set_option("display.max_columns",999) # to show all the columns in the output 
pd.set_option("display.max_rows",999) # to show all the rows in the output 
plt.rcParams['figure.figsize'] = [8.0, 5.0] # setting plotting settings for the remaining of the notebook
plt.rcParams['figure.dpi'] = 140 #setting pixels

###  Reading the Data

In [None]:
homes=pd.read_csv(r"/Users/tiko/Documents/Machine_Learning_Ames_Housing_Kaggle/Ames_HousePrice.csv")
housing2=pd.read_csv(r"/Users/tiko/Documents/Machine_Learning_Ames_Housing_Kaggle/Ames_Real_Estate_Data.csv")


# ** housing2 dataset is an additional dataset that will be used if it contains usfull information **

# EDA For First Dataset (Ames_HousePrice.csv) --> homes

In [None]:
homes.head()
'''

Using different Machine Learning models, we will attempt to predict Sales Price of homes based on data collected
from sales between the years 2007-2010. 


'''

In [None]:
homes.info()

In [None]:
homes.describe()

In [None]:
homes.shape

# 2580 rows and 82 columns

In [None]:

homes['YrSold'].value_counts().plot.bar();

##Bar plot based on Year Sold

### Exploring Distribution of our Target (SalePrice)

In [None]:
def target_dist():
    '''
    
    Analyzing how our target column (SalePrice) is distributed. For linear models, we need our target variable to 
    follow the gaussian distribution.
    
    '''

    fig, (ax1, ax2) = plt.subplots(1,2, sharex=False, sharey=True)

    sns.histplot(homes['SalePrice'],ax=ax1,bins=30);
    sns.histplot(np.log(homes['SalePrice']),ax=ax2,bins = 30);

    ax1.set_title("Sale Price - Right Skewed")
    ax1.set_ylabel("Count")
    ax1.set_xlabel("Sale Price in thousands ($)")

    ax2.set_title("Sale Price - Log Transformation")
    ax2.set_ylabel("Count")
    ax2.set_xlabel("Sale Price - Logged")
    

target_dist()
#How is our output column distributed (currently skewd to the right -->log transform)

## Exploring Columns and Column Types in our dataset

In [None]:
# Deeper look into type of columns
print("Numeric Columns: ", '\n')
print(homes.select_dtypes(exclude='object').columns, '\n')# all numerical (non-object) columns

print('Object Columns: ','\n')
print(homes.select_dtypes(include='object').columns, '\n')# All object columns (string and/or mixed)


In [None]:

homes['MSZoning'].value_counts().plot.bar();
#Some categorical Columns are imbalanced (will need to combine minority classes)

In [None]:
df_object = homes.select_dtypes(include='object') 
#creating DF with all non-numeric columns (potentially categorical) to check class balance. 
df_numeric = homes.select_dtypes(exclude = 'object')

In [None]:
def check_class_imbalance(df, nrow, ncol):
    '''
    Function creates subplots of all 'categorical' columns in our dataframe and outputs count of occurance for all 
    classes (categorical) in the dataframe. 
    
    '''

    fig, ax  = plt.subplots(figsize = (15, 50))
    for n, col in enumerate(df.columns):
        plt.subplot(nrow, ncol,n+1) #specify number of rows and number of columns for graph output
        df[col].value_counts().plot.bar()
        plt.title(str(col))
    plt.tight_layout()



check_class_imbalance(df_object, 11,4)



'''
Based on the subplots output, we can see that we have many categorical input variables that have imbalanced 
classes. One way to solve this problem is to combine  minority classes into one. 

This is tricky as we also do not want to lose additional information detrimental to our output. 


'''



In [None]:
def check_dist(df, nrow, ncol):
    '''
    Function creates subplots of all 'numerical' columns in our dataframe and outputs distribution
    of all numerical datasets. Normally distributed input variables help improve our ML model.
    The more normally distributed, the better. 
    
    
    ***USE Discretization Transforms if necessry ***
    
    '''

    fig, ax  = plt.subplots(figsize = (15, 50))
    for n, col in enumerate(df.columns):
        plt.subplot(nrow, ncol,n+1) #specify number of rows and number of columns for graph output
        df[col].plot.hist(bins = 30)
        plt.title(str(col))
    plt.tight_layout()



check_dist(df_numeric, 11,4)

In [None]:
def target_correlation(df, nrow, ncol):
    '''
    creating subplots of all numerical columns to find correlation between the input variables and our
    target, 'SalePrice'. 
    
    
    '''
    fig, ax  = plt.subplots(figsize = (15, 50))

    for i, col in enumerate(df.columns):
        plt.subplot(nrow,ncol,i+1) 
        plt.title(str(col) + ' & Sales Price Correlation')
        sns.scatterplot(data = df, x = df[col], y = df['SalePrice'])
    plt.tight_layout()


In [None]:
target_correlation(df_numeric,10,4)

## Exploring NA Values

In [None]:
pd.DataFrame(homes.isna().sum())
'''
Some columns will be removed right away due to reasons exmplained below:


More than 90% NA values:

PoolQC	2571
Fence	2055
MiscFeature	2483
Alley	2412



Contenders for NA impputation. 

*** NOTE *** 

Some NA's are due to the feature not being present in the property (ie. no fireplace)

FireplaceQu	1241 (Just one example, other examples will be found as EDA progresses)

*** NOTE *** 

LotFrontage	462
MasVnrType	14
MasVnrArea	14
BsmtQual	69
BsmtCond	69
BsmtExposure	71
BsmtFinType1	69
BsmtFinSF1	1
BsmtFinType2	70
BsmtFinSF2	1
BsmtUnfSF	1
TotalBsmtSF	1
Electrical	1
BsmtFullBath	2
BsmtHalfBath	2
GarageType	127
GarageYrBlt	129
GarageFinish	129
GarageCars	1
GarageArea	1
GarageQual	129
GarageCond	129
'''




In [None]:
'''
NA Values Visualized
darker palette ==  more na values
'''

plt.figure(figsize=(20,15))
sns.heatmap(homes.isna(),cmap="BuPu",cbar=False);

In [None]:
'''
NA Values for second DataSet ('housing2')
** We MAY use this dataset if it improves our ML model  **

'''

plt.figure(figsize=(20,15))
sns.heatmap(housing2.isna(),cmap="BuPu",cbar=False);

In [None]:
housing2.head()

In [None]:
homes[homes['LotFrontage'].isnull()]

In [None]:
'''
Only 9 homes have a pool.  All pool related columns will be dropped as to not interfere with our model. 

'''

homes[homes['PoolArea']>0] 





In [None]:
newdf = homes[homes["Fireplaces"]==1]
newdf['FireplaceQu'].value_counts().plot.bar();  
# Skewed to GD and TA (AVERAGE) + 1241 NA VALUES. Minority classes 
#(Fa, Po,  will be combined together while Ex will be combined with Gd)


'''
Ex --> Excelent
Gd -- > good
Ta --> Average
Fa --> Fair - Prefabricated Fireplace in basement
Po -- Poor - Ben Franklin Stove
Na --> No Fireplace


Combined Classes: 
Gd == Gd + Ex
TA == TA + Fa
Po == Po
Na = no Fireplace

'''

In [None]:
homes["FireplaceQu"].value_counts() # count of fireplaces (1241 NA VALUES)

In [None]:
def check_count(col):
    return homes[col].value_counts().plot.bar()

check_count('MSSubClass');

In [None]:
check_count('OverallQual');

In [None]:

'''Heatmap for all numeric columns with correlation of more than 50%'''

plt.figure(figsize  = (24,12))
num_val=homes.select_dtypes(exclude = 'object')
num_val.drop("Unnamed: 0",axis = 1, inplace = True)
cor = num_val.corr(method = "spearman")
# sns.heatmap(cor, cmap = "coolwarm", )
v = cor[(cor>0.50)].dropna(axis = 1,how = 'all')


sns.heatmap(v)

In [None]:
# attempting to see all numeric columns in our features dataframe
df_numeric.isna().sum()

df_numeric_new = df_numeric.apply(lambda x: x.fillna(x.mean()),axis=1) 
# replacing all na's with average of column. this will not be the method of imputing when we get to ML modeling.
# This is only to conduct eda.
df_numeric_new.isna().sum()



In [None]:
num_features['MasVnrType'].value_counts()

In [None]:
# Exploring Multicolinearity with VIF (Variance Inflation Factor)
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif


In [None]:
#Creating a function to calculate VIF (Variance Inflation Factor to determine multicolinearity levels in the 
#numeric features of our dataset)

def calc_vif(dataset):
    viff = pd.DataFrame()
    viff["Features"] = dataset.columns
    viff["VIF_Score"] = [vif(dataset.values, i) for i in range(dataset.shape[1])] #taking the sqrt to improve
    #readibility
    return viff[viff["VIF_Score"]>30]


calc_vif(df_numeric_new)


#columns with High VIF Score:

# 7.  YearBuilt	154.563646
# 8	YearRemodAdd	147.719689
# 10	BsmtFinSF1	inf
# 11	BsmtFinSF2	inf
# 12	BsmtUnfSF	inf
# 13	TotalBsmtSF	inf
# 14	1stFlrSF	inf
# 25	GarageYrBlt	163.295516
# 36	YrSold	157.303896


In [None]:
df_numeric_new.head()

In [None]:
df_numeric_new['GarageArea'].value_counts()
df_numeric['GarageArea'].value_counts()

In [None]:
print(df_numeric_new.corr())

In [None]:
plt.figure(figsize=(20,12));
sns.heatmap(df_numeric.corr(), cmap="coolwarm", cbar=False, annot=True);
#columns with High VIF Score:

# 7.  YearBuilt	154.563646
# 8	YearRemodAdd	147.719689
# 10	BsmtFinSF1	inf
# 11	BsmtFinSF2	inf
# 12	BsmtUnfSF	inf
# 13	TotalBsmtSF	inf
# 14	1stFlrSF	inf
# 25	GarageYrBlt	163.295516
# 36	YrSold	157.303896

In [None]:
df_numeric_new.drop('TotRmsAbvGrd', inplace = True, axis = 1)

In [None]:
calc_vif(df_numeric_new)

In [None]:
sns.scatterplot(data = num_feat_target, x = "GrLivArea", y = "target");

In [None]:
homes['GarageArea'].isna().sum()

In [None]:
def scatter_rel(data, x):
    sns.scatterplot(data = data, x = x, y = 'SalePrice')


In [None]:
plt.subplot(222)
scatter_rel(df_numeric,"GarageArea"); 


In [None]:
homes["GarageArea"].value_counts().sort_values() #we have 127 garage areas with 0 squarefeet
scatter_rel(df_numeric_new, "GarageArea")

In [None]:
scatter_rel(df_numeric, "LotArea"); 

In [None]:
scatter_rel(df_numeric_new, "LotArea"); 

In [None]:
scatter_rel("GrLivArea"); 

In [None]:
scatter_rel(df_numeric, "LotArea"); 

In [None]:
scatter_rel('1stFlrSF'); #highly correlated

In [None]:
scatter_rel(df_numeric,
            'TotalBsmtSF')

In [None]:
scatter_rel(df_numeric_new,
            'TotalBsmtSF')

In [None]:
homes['TotalBsmtSF'].isna().sum()

## Exploring Categorical Columns With Target

In [None]:
homes.head()

In [None]:
sns.boxplot(data = homes, x ="LotConfig", y ="SalePrice");

In [None]:
homes["LotConfig"].value_counts().plot.bar();

In [None]:
def box_plot(data,x,hue = None, y = 'SalePrice', orient = 'v'):
    plt.figure(figsize=(20,12));
    plt.title('Average Sales Price & '+ str(x))
    return sns.boxplot(data = data, x = x, hue = hue, y = y, orient = orient);

box_plot(data = df_numeric,x = "OverallQual");

In [None]:
def bar_plot(data,x,hue = None, y = 'SalePrice', orient = 'v'):
    plt.figure(figsize=(20,12));
    plt.title('Average Sales Price & '+ str(x))
    return sns.barplot(data = data, x = x, hue = hue, y = y, orient = orient);

bar_plot(data = df_numeric,x = "OverallQual");

In [None]:
bar_plot(data = df_numeric,x = "OverallCond"); 
# Average Salesprice for homes with condition of '5', is higher than all other conditions that are higher.
# this could mean that some people prefer to buy homes in better neighborhoods with lower condiotion score.
# We can further explore this hypothesis below.

In [None]:
box_plot(data = homes,x = "Neighborhood", hue = 'OverallCond'); 

In [None]:
sns.barplot(data = homes[homes['SalePrice']>200000], x = 'SalePrice', y = "Neighborhood");
# We want to see how homes with higher price-point, more than $200,000, in terms of neighborhood.
plt.tight_layout() 

In [None]:
sns.barplot(data = homes[homes['SalePrice']>300000], x = 'OverallCond', y = "Neighborhood");
# filtering homes with price of more than $200,000 will enable us to see why average price of 
#homes with 'condition' of '5' occur more than homes with better 'condition' 


In [None]:
sns.barplot(data = homes[homes['SalePrice']<300000], x = 'OverallCond', y = "Neighborhood");

In [None]:
box_plot(df_numeric, "GarageCars");

In [None]:
box_plot(df_numeric, "MSSubClass");
#MSSubClass: Identifies the type of dwelling involved in the sale.	

#         20	1-STORY 1946 & NEWER ALL STYLES
#         30	1-STORY 1945 & OLDER
#         40	1-STORY W/FINISHED ATTIC ALL AGES
#         45	1-1/2 STORY - UNFINISHED ALL AGES
#         50	1-1/2 STORY FINISHED ALL AGES
#         60	"LotFrontage"
#         70	2-STORY 1945 & OLDER
#         75	2-1/2 STORY ALL AGES
#         80	SPLIT OR MULTI-LEVEL
#         85	SPLIT FOYER
#         90	DUPLEX - ALL STYLES AND AGES
#        120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
#        150	1-1/2 STORY PUD - ALL AGES
#        160	2-STORY PUD - 1946 & NEWER
#        180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
#        190	2 FAMILY CONVERSION - ALL STYLES AND AGES

In [None]:
sns.catplot(data = df_numeric, x = "MSSubClass", y = "SalePrice");

In [None]:
box_plot(df_numeric, "OverallCond");

In [None]:
box_plot(homes, "MSZoning");

In [None]:
scatter_rel(df_numeric, "YearBuilt"); 

In [None]:
scatter_rel(df_numeric, "YearRemodAdd"); 

In [None]:
scatter_rel(df_numeric, "LotFrontage"); 

In [None]:
homes['LotFrontage'].isna().count()

In [None]:
scatter_rel(homes, "LotArea"); 

In [None]:
scatter_rel(homes, "LotFrontage"); 

In [None]:
# GrLivArea

homes['GrLivArea'].isna().sum() # no na values
scatter_rel(homes, "GrLivArea"); 

In [None]:
homes['MasVnrType'].head()

In [None]:
homes['MasVnrType'].isna().sum()

In [None]:
homes['MasVnrType'].value_counts()
#None       1559

In [None]:
homes["MasVnrType"].value_counts().plot.bar();



### Dealing with Categorical Values  and determining how to encode

In [None]:
homes.head()

In [None]:
cat = list(df_object.columns) + ['MSSubClass']+ ['OverallQual']+ ['OverallCond']
h = {}
for i in cat:

    h[i] = homes[i].value_counts()
    
    
print(h) #all classes for all categorical columns

In [None]:
box_plot(homes, 'Neighborhood') # MSZoning has not large impact on SalePrice --> nominal

In [None]:
check_class_imbalance(df_object, 11,4)

In [None]:
homes.columns

In [None]:
homes.head(20)

In [None]:
homes['GrLivArea'].value_counts()

In [None]:
sns.scatterplot(data = homes, x = 'GrLivArea', y = 'SalePrice');

In [None]:

plt.title("Sales Price & Relationship Between Remodelling and Construction Date")
sns.scatterplot(data = homes, x = 'YearBuilt', y = 'SalePrice', hue = 'YearRemodAdd');

plt.tight_layout()


In [None]:
plt.title('How Garage Year Built Influences Sale Price')
sns.scatterplot(data = homes[homes.GarageYrBlt>1980], x = 'GarageYrBlt', y = 'SalePrice');
sns.scatterplot(data = homes[homes.GarageYrBlt<1980], x = 'GarageYrBlt', y = 'SalePrice');

In [None]:
homes[homes.GarageYrBlt<2000]['SalePrice'].mean() #1980 (65%) 1990

In [None]:
homes['PavedDrive'].value_counts()

In [None]:
plt.title("Fireplace Quality and Impact on Sale Price")

sns.barplot(data = homes, x = 'MSSubClass', y = 'SalePrice');

In [None]:
# scatter_re

## Feature Selection & Engineering



### Initial Column Drops

<del>'Unnamed: 0'</del>: <i>Provides no valuable information</i>
<br>
<del>'PID'</del>: <i>Provides no valuable information BUT **will be used to merge if second dataset is used </i>
<br>
<del>'Alley'</del>: <i>More than 90% NA Values. Provides no valuable information</i>
<br>
<del>'PoolQC'</del>: <i>More than 90% NA Values. Provides no valuable information</i>
<br>
<del>'Fence'</del>: <i>More than 90% NA Values. Provides no valuable information</i>
<br>
<del>'MiscFeature'</del>: <i>More than 90% NA Values. Provides no valuable information</i>
<br>
<del>'MoSold'</del>: <i>Data Leakage Problem. Column information attained at sale</i>
<br>
<del>'YrSold'</del>: <i>Data Leakage Problem. Column information attained at sale</i>
<br>
<del>'SaleType'</del>: <i>Data Leakage Problem. Column information attained at sale</i>
<br>
<del>'SaleCondition'</del>: <i>Data Leakage Problem. Column information attained at sale</i>
<br>
<del>'MiscVal'</del>: <i>Provides no valuable information </i>
<br>
<del>'PoolArea'</del>: <i>See Feature Engineering Section</i>
<br>
<del>'ScreenPorch',3SsnPorch,EnclosedPorch, OpenPorchSF </del>: <i>See Feature Engineering Section</i>
<br>
<del>'WoodDeckSF'</del>: <i>See Feature Engineering Section</i>
<br>
<del>'GarageQual'</del>: <i>Almost identical to GarageCond especially considering tha minority classes will be combined.</i>
<br>
<del>'GarageCars'</del>: <i>GarageCars and GarageArea are linearly correlated (multicolinearity). GarageCars will be dropped. *** MAY EXPLORE BY INTERCHANGING BOTH AS GarageCars as ordinal Cat. may improve model.</i>
<br>
<del>'FireplaceQu and  Fireplaces '</del>: <i>Since almost half od the homes in the dataset do not have a fireplace and Fireplace Quality classes are skewed, these columns will be dropped and new one will be created to show if fireplace/nofireplace no matter how many fireplaces .See Feature Engineering Section</i>
<br>
<del>'Functionality'</del>: <i>Majority of homes are not damaged and homes with minor damages have average sale price that is not too far away from overall average home price. If we had more examples of 'Salvaged' homes, then it would make sense to keep this column.</i>
<br>
<del>'KitchenAbvGr'</del>: <i>No correlation with SalePrice and similar to KitchenQual.</i>
<br>
<del>'BedroomAbvGr'</del>: <i>No Valuable Information</i>
<br>
<del>'BsmtHalfBath'</del>: <i>No Valuable Information</i>
<br>
<del>'BsmtFullBath'</del>: <i>Create new column boolean : bsmntfullbath, See Feature Engineering Section </i>
<br>
<del>'LowQualFinSF'</del>: <i>Provides no valuable information </i>
<br>
<del>'1stFlrSF' and 2ndFlrSF</del>: <i>Combine the two into totalsft. See Feature Engineering </i>
<del>'Electrical'</del>: <i>Create new columns. isStandElectr.See Feature Engineering </i>
<br>
CentralAir: drop and create new one. Is central air or not
<br>
Heating: most data is gas. no valuable information.
<br>
TotalBsmtSF: combine with totalsft
<br>
BsmtUnfSF: not much correlation with price and low volumen.
<br>
BsmtFinSF2
<br>
BsmtFinType2
<br>
BsmtFinSF1
<br>
BsmtFinType1:
<br>
BsmtCond: Similar to BsmtQual but with more class imbalance
<br>
MasVnrType and MasVnrArea: drop and feature engineer (hasMasVnr)
<br>
Exterior2nd
<br>
RoofMatl: Mostly Standards
<br>
Overall condition: overallQual has better correlation with SalePrice (**May keep**)
<br>
Condition2: all normal
<br>
LotConfig: is_culdesac?
<br>
LandSlope: No valuable information
<br>
Utilities: all same class
<br>
LandContour: isLandContourhls? bollean new column
<br>
LotShape: regular or not (boolean)

<br>
Street: everything is paved almost

<br>
LotArea: not much correlation

<br>


<br>

<br>
<br>
<br>
<br>
<br>
### Feature Selection & Engineering

PoolArea: There is no linear relationship between pool area and SalePrice. If more than 0 square/feet, create new column to indicate pool/no pool.
<br>
OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch: All porch related columns have identical relationship with
SalePrice. 4 columns will be dropped and new columns will be created indicating porch/no porch.
<br>
WoodDeckSF: Some linear relationship with SalePrice. If more than 0 square/feet, create new column to indicate wood deck/no wood deck.
<br>
PavedDrive: Imbalanced classes. Most driveways are paved. Combine Minority Classes.
<br>
GarageCond: Imbalanced classes.  (TA    2356 Gd      14 Ex.   3), (Fa      67 Po      11) 
<br>
GarageArea: Only 127NA values. Good linear relationship with SalePrice. 127 entries @0sqrft. Will be used instead of GarageCars
<br>
GarageFinish: 129 NA's (Finished, Rough Finished) Unfinished (ordinal?)
<br>
GarageYrBlt: Up untill 2000, GarageYrBlt has low correlation with SalePrice but after 2000 that relationship increases. New Column will be made of boolean values (built before 2000 and after 2000). There is ~66% change
in average home price when garage is built before 2000 and after 2000. This trend is evident starting from 1980 but since this data is already old (2007-2010), I will use 2000 as the cut-off.**Expecting Similar trend for yrblt of home.**
<br>
GarageType: (Attchd     1527 BuiltIn     153) & (Detchd      716 Basment      27 2Types       21 CarPort       9)
<br>
'FireplaceQu and  Fireplaces ': delete and create new column "YesFireplace"
<br>
TotRmsAbvGrd: As grade goes up, sale price goes up as well until grade 11 and then decline. Ordinal?
<br>
KitchenQual: (TA, Fair, Poor) (Ex) (GD)
<br>
HalfBath: Convert to boolean 0, (1,2) == 1
<br>
FullBath: odinal 1,2,3 bath (3 being best)
<br>
'BsmtFullBath': Create boolean for all basements with full bath
<br>
1stFlrSF' and 2ndFlrSF and TotalBsmtSF : Combine the two columns by adding squarefeet. TotalSQFT (71 homes with no basement)
<br>
Electrical: create new boolean columns. is standard electricity or not
<br>
HeatingQC: create ordinal (Ex) (Good, average) (poor, fair)
<br>
BsmtExposure: Yes or no Boolean
<br>
BsmtQual: ordinal 3-way
<br>
Foundation: combine some minority and ordinal
<br>
MasVnrType and MasVnrArea: new columns hasMasVnr and is MsVnrabv400
<br>
Exterior1st: some encoding
<br>
RoofStyle: Combine Hip/Shade + Gable/everything else
<br>
YearRemodAdd , YearBuilt: is it above yearbuilt above 1980 and yearremod >2000?
<br>
HouseStyle: nominal with minority class combined
<br>
BldgType: 1fam Twnhouse + everything else (nominal)
<br>
Condition1: nominal encoding and combine minority
<br>
Neighborhood : nominal some minoriity
<br>
LotConfig: is cul-de-sac?
<br>
LandContour: isLandContourhls boolean
<br>
LotShape: Regulaar or not Boolean
<br>
LotFrontage:  462 Nas --> impute // important columns
<br>
MSZoning: Nominal (FV RL) (everything else)
<br>
MSSubClass: Nominal and minority




### Feature Selection




### Encoding Columns 
#### Minority Classes of Imbalanced Columns Will Be Combined Based on Impact on Sale Price

<br> 
<br> 
 
<strong><em>Nominal Encoding:</em></strong> <i>There is no order or rank to the variable's feature.</i>
<br> 
<br> 
<b>Ordinal Encoding:</b> <i>There is  order or rank to the variable's feature.</i>

<br>

<del>'Unnamed: 0'</del>: <i>Provides no valuable information</i>
<br>
<del>'PID'</del>: <i>Provides no valuable information BUT **will be used to merge if second dataset is used </i>
<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>

<br>




In [None]:
'''

Nominal Encoding:

'MSZoning' -- >  RL is most of the class
'Street' --> all paved
'Alley' --> Drop (more than 90% )
'LotShape' -->  Ref and IRl
'LandContour'  -- mostly Lvl
'Utilities' -->all one
'LotConfig',  -- > Inside ==> rest
'LandSlope',  -- > Gtl primarily

'Condition1',  --> Norm
'Condition2', --> Drop
'BldgType',  --> mostly 1fam
'HouseStyle',  -- 1story, 2story, rest
'RoofStyle',  --> Gable, hip
'RoofMatl',  --. one 
'Exterior1st', -->4 and the rest


Ordinal Encoding:

'Neighborhood' --> *** DISTANCE?? ***
'Condition'
'Condition1',  --> Norm
'Condition2', --> Drop
Exterior1st --> based on price?
ExterQual --> TA, GD
ExterCond --> TA, GD (dorop either one depending on correlation)
BsmtQual --> TA, Gd, missing 50% or not existing
BsmtCond --> less na (Ta and the rest
KitchenQual


'''

In [None]:
target_correlation(df_numeric,10,4)

In [None]:
homes.head(10)

### Reducing Dimensionality/Random Noise & Feature Engineering

In [None]:
trial = homes.copy()

In [None]:
trial

In [None]:
def preprocessed_DF(df):
    trial = homes.copy()
    
    
    ## How features will be classified based ond EDA and preprocessing 
    
    num_feat = ['GrLivArea','LotFrontage', 'LotArea','YearBuilt','GarageArea', 'TotSft','YearRemodAdd','BsmtFinSF1',
           'BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','BsmtFullBath','BsmtHalfBath','FullBath',
           'HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageYrBlt','WoodDeckSF',]

    ord_cat = ['Condition1','Condition2', 'OverallQual','OverallCond','ExterQual','ExterCond',
          'BsmtQual','BsmtCond','KitchenQual','GarageQual','GarageCond', 'HeatingQC', 'Neighborhood','GarageFinish',
          'GarageCars',]


    nom_cat = ['MSSubClass','MSZoning','Street','LotShape','LandContour','Utilities','LotConfig','LandSlope',
          'BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd',
          'Foundation','BsmtExposure','BsmtFinType1', 'BsmtFinType2', 'Heating','CentralAir',
          'Electrical','Functional', 'GarageType', 'GarageFinish']
    
    '''
    *** Feature Engineering based on EDA ***
    '''
    # homes that were built after 1980 and redomeled after 2000 displayed most correlation with increasing sale price
    trial['newBltRmd'] = np.where((homes.YearBuilt > 1980) & (homes.YearRemodAdd > 2000), 1, 0) 
    # MasVnrType has minimal correlation with 
    trial['hasMasVnr'] = np.where((homes.MasVnrArea > 0) & (homes.MasVnrType != 'None'), 1, 0) 
    trial['TotSft'] = trial['TotalBsmtSF']+trial['1stFlrSF'] +trial['2ndFlrSF']
    trial['UnfBsmt'] = np.where(trial.BsmtUnfSF>0, 1,0)
    trial['NewGarage'] = np.where(trial['GarageYrBlt']>2000,1,0)
    trial['HasWoodDk'] = np.where(trial['WoodDeckSF']>0,1,0)
    trial['HasPorch'] = np.where((homes.OpenPorchSF > 0) | (homes.EnclosedPorch > 0) 
                             |(homes['3SsnPorch'] > 0) |(homes.ScreenPorch > 0) , 1, 0) 
    trial['HasPool'] = np.where(trial.PoolArea>0,1,0)
    trial['PavedDrWay'] = np.where(trial.PavedDrive  == 'Y',1,0) #combined minority classes
    trial.drop(['Unnamed: 0','Alley', 'FireplaceQu','PoolQC','Fence','MiscFeature','MiscVal',
           'MoSold','YrSold','SaleType','SaleCondition'], axis = 1, inplace = True)
    trial['HasBsmntType2'] = np.where(trial.BsmtFinSF2 >0, 1,0)
    
    
    

In [None]:
trial['newBltRmd'] = np.where((homes.YearBuilt > 1980) & (homes.YearRemodAdd > 2000), 1, 0) 
#removing remodel year. adding boolean if home built after 1980 and remodeled after 2000
#trial.drop('YearRemodAdd', axis = 1, inplace = True)

In [1]:
trial.head()

NameError: name 'trial' is not defined

In [None]:
trial['hasMasVnr'] = np.where((homes.MasVnrArea > 0) & (homes.MasVnrType != 'None'), 1, 0) 
#trial.drop(['MasVnrArea', 'MasVnrType'], axis = 1, inplace = True)
#since all homes with MasVnr were correlated with price identically, we only need to check if a home has the feature.

In [None]:
trial['TotSft'] = trial['TotalBsmtSF']+trial['1stFlrSF'] +trial['2ndFlrSF']
#trial.drop(['2ndFlrSF','BsmtFinSF2','TotalBsmtSF', '1stFlrSF','2ndFlrSF' ], axis = 1, inplace = True)
#trial.drop('BsmtFinSF1', axis = 1, inplace = True)

In [None]:
trial['UnfBsmt'] = np.where(trial.BsmtUnfSF>0, 1,0)
#trial.drop('BsmtUnfSF',axis = 1, inplace = True)

In [None]:
trial['NewGarage'] = np.where(trial['GarageYrBlt']>2000,1,0)
#trial.drop('GarageYrBlt', axis = 1, inplace = True)

In [None]:
trial['HasWoodDk'] = np.where(trial['WoodDeckSF']>0,1,0)
#trial.drop('WoodDeckSF',axis = 1, inplace = True)

In [None]:
trial['HasPorch'] = np.where((homes.OpenPorchSF > 0) | (homes.EnclosedPorch > 0) 
                             |(homes['3SsnPorch'] > 0) |(homes.ScreenPorch > 0) , 1, 0) 


#trial.drop(['OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch'], axis = 1, inplace = True)




In [None]:
trial['HasPool'] = np.where(trial.PoolArea>0,1,0)
#trial.drop('PoolArea', axis = 1, inplace = True)

In [None]:
trial['PavedDrWay'] = np.where(trial.PavedDrive  == 'Y',1,0) #combined minority classes
trial.drop(['Unnamed: 0','Alley', 'FireplaceQu','PoolQC','Fence','MiscFeature','MiscVal',
           'MoSold','YrSold','SaleType','SaleCondition'], axis = 1, inplace = True)

In [None]:
trial['HasBsmntType2'] = np.where(trial.BsmtFinSF2 >0, 1,0)

In [None]:
trial.shape

In [None]:
trial.info()

In [None]:
trial.head(10)

In [None]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.experimental import enable_iterative_imputer #enables iterative imputer 
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer,MissingIndicator
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler,RobustScaler


In [None]:
num_feat = ['GrLivArea','LotFrontage', 'LotArea','YearBuilt','GarageArea', 'TotSft','YearRemodAdd','BsmtFinSF1',
           'BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','BsmtFullBath','BsmtHalfBath','FullBath',
           'HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageYrBlt','WoodDeckSF',]

ord_cat = ['Condition1','Condition2', 'OverallQual','OverallCond','ExterQual','ExterCond',
          'BsmtQual','BsmtCond','KitchenQual','GarageQual','GarageCond', 'HeatingQC', 'Neighborhood','GarageFinish',
          'GarageCars',]


nom_cat = ['MSSubClass','MSZoning','Street','LotShape','LandContour','Utilities','LotConfig','LandSlope',
          'BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd',
          'Foundation','BsmtExposure','BsmtFinType1', 'BsmtFinType2', 'Heating','CentralAir',
          'Electrical','Functional', 'GarageType', 'GarageFinish']


In [None]:
impute = IterativeImputer(n_nearest_features=10,initial_strategy='median',max_iter=200,random_state=101)

In [None]:
trial[num_feat] = impute.fit_transform(trial[num_feat])

In [None]:
scaler = RobustScaler()
trial[num_feat] = scaler.fit_transform(trial[num_feat])

In [None]:
ordEncode = OrdinalEncoder()
trial[ord_cat] = ordEncode.fit_transform(trial[ord_cat])

In [None]:
impute_nom = SimpleImputer(strategy='most_frequent')
trial[nom_cat] = impute_nom.fit_transform(trial[nom_cat])



In [None]:
dumm = pd.get_dummies(trial[nom_cat],prefix='_is', drop_first=True)

In [None]:
trial.drop(nom_cat,axis=1, inplace=True)
trial = trial.join(dumm)

In [None]:
trial.drop(['MasVnrType', 'PavedDrive'], axis = 1, inplace=True)

In [None]:
# ct = ColumnTransformer(
#     [("impute_num", IterativeImputer(n_nearest_features=10, 
#                                                  initial_strategy='median',max_iter=200,random_state=101),num_feat),
#      ("scale", RobustScaler(), num_feat),
#    ("ordinal", OrdinalEncoder(), ord_cat),
#     ("nominal", OneHotEncoder(drop='first'),nom_cat),
#     ("impute_nom", SimpleImputer(strategy='most_frequent'), nom_cat)],remainder='passthrough',verbose_feature_names_out=False)


# ct.fit_transform(trial)



In [None]:
trial.fillna(1.0,inplace=True)

In [None]:
trial.isna().sum()

In [None]:
y = np.log(trial.SalePrice)

In [None]:
X = trial.drop(['SalePrice','PID'],axis=1)
X

In [None]:
# sts = RobustScaler(quantile_range=(30.0,70.0)) # scaling has no impact on score whether standard or robust (for now)
# X[num_feat] = sts.fit_transform(X[num_feat])


In [None]:
X.head()

In [None]:
X.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor
from sklearn.metrics import r2_score
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.30, random_state=101)

mlr = LinearRegression()
mlr.fit(X_train, y_train)
y_pred = mlr.predict(X_test)
r2_score(y_test, y_pred)

In [None]:
sns.regplot(y_test, y_pred);

In [None]:
import statsmodels.api as sm

In [None]:
x_const=sm.add_constant(X_train)
model = sm.OLS(y_train, x_const)
ols = model.fit()
print(ols.summary())


ols.pvalues

In [None]:
pvalue = pd.DataFrame(ols.pvalues, columns=['PValue'])
new_features = list((pvalue[pvalue.PValue<0.05]).index)

In [None]:
pvalue[pvalue.PValue<0.05]

In [None]:
X1 = trial[new_features[1:]]

'''

From 179 to 81 columns

'''
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,y,test_size=.30, random_state=101)

mlr1 = LinearRegression()
mlr1.fit(X_train1, y_train1)
y_pred1 = mlr1.predict(X_test1)
r2_score(y_test1, y_pred1)




In [None]:
sns.regplot(y_test, y_pred);

# Machine Learning Models



### Support Vector Machine (SVR)

In [None]:
from sklearn.svm import SVR

In [None]:
trial.head()

## Decision Tree & Random Forest, XGBoost

In [None]:
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

## Problems With The Dataset

 - Some Categorical Columns are very imbalanced, hence some of them were dropped as to not interfere with our model authenticity. Due to this, some information was lost and our model will only work with specific homes. To illustrate, because we had to drop 'Functionality' columns due to insufficient examples of 'salvage' homes and skewed majority class, we can only predict on homes with no degree of damage. Another example would be type of home. Since most homes in the dataset are for single-family homes, our model would not work on any other type

In [None]:
import os

In [None]:
dir(os)

In [None]:
/Users/tiko/Documents/Machine_Learning_Ames_Housing_Kaggle/Tigran_V