### ***Import All Required Libraries***

In [502]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures

### ***Load Train and Test Data***  

In [503]:
# Load the raw training and test splits from the local datasets folder
train_df, test_df = (
    pd.read_csv('./datasets/train.csv'),
    pd.read_csv('./datasets/test.csv'),
)

In [504]:
# Peek at the first training row to check the column layout
train_df.iloc[:1]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Sawyer,RRAe,Norm,1Fam,2Story,6,8,1976,2005,Gable,CompShg,HdBoard,Plywood,BrkFace,289.0,Gd,TA,CBlock,TA,TA,No,GLQ,533.0,Unf,0.0,192.0,725.0,GasA,Ex,Y,SBrkr,725,754,0,1479,0.0,0.0,2,1,3,1,Gd,6,Typ,0,,Attchd,1976.0,RFn,2.0,475.0,TA,TA,Y,0,44,0,0,0,0,,,,0,3,2010,WD,130500


In [505]:
# Peek at the first test row to check the column layout
test_df.iloc[:1]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,2fmCon,2Story,6,8,1910,1950,Gable,CompShg,AsbShng,AsbShng,,0.0,TA,Fa,Stone,Fa,TA,No,Unf,0,Unf,0,1020,1020,GasA,Gd,N,FuseP,908,1020,0,1928,0,0,2,0,4,2,Fa,9,Typ,0,,Detchd,1910.0,Unf,1,440,Po,Po,Y,0,60,112,0,0,0,,,,0,4,2006,WD


In [506]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
# Source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe/49189503
pd.options.display.max_columns = None

### ***Function***  

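**Note:** Before running this function, make sure notebooks **01–04** have been run first: the function reads intermediate CSV files (`continous_*.csv`, `ordinal_*.csv`, `discrete_*.csv`) from the datasets folder and fails if they do not exist yet.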

In [507]:
def clean_the_dfs(train, test, data_dir='../project_2-master/datasets'):
    """Build the merged modelling tables for the train and test splits.

    The function combines four groups of features, all joined on ``Id``:

    * continuous features read from ``continous_{train,test}.csv``
    * one-hot encoded ``Neighborhood`` dummies built from ``train`` / ``test``
    * ordinal features read from ``ordinal_{train,test}.csv``
    * discrete features read from ``discrete_{train,test}.csv``

    It then adds four interaction columns (``Grla_tbsf``, ``ovl_ext``,
    ``ovl_area``, ``gr_ga``) and writes both results to
    ``merged_train.csv`` / ``merged_test.csv`` inside ``data_dir``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw frames; only the ``Id`` and ``Neighborhood`` columns are used.
        Neither frame is modified.
    data_dir : str or path-like, optional
        Folder holding the intermediate CSVs and receiving the merged
        outputs. Defaults to the location the notebook originally used.

    Returns
    -------
    pandas.DataFrame
        The merged *training* frame, indexed by ``Id``. The merged test frame
        is only written to disk.

    Raises
    ------
    FileNotFoundError
        If any of the intermediate CSVs is missing from ``data_dir``.
    KeyError
        If a required column (``Id``, ``Neighborhood`` or one of the columns
        used for the interaction terms) is absent.
    """

    def read_features(stem):
        # File stems keep their on-disk spelling (e.g. 'continous'); renaming
        # them here would break the link to the files that already exist.
        return pd.read_csv(f'{data_dir}/{stem}.csv')

    # CATEGORICAL FEATURES: one-hot encode Neighborhood for each split.
    train_dummies = pd.get_dummies(
        train[['Id', 'Neighborhood']], columns=['Neighborhood'], drop_first=True
    )
    test_dummies = pd.get_dummies(
        test[['Id', 'Neighborhood']], columns=['Neighborhood'], drop_first=True
    )

    # Keep only the columns present in BOTH splits, in a single shared order.
    # The previous code dropped exactly two train-only columns by position,
    # which raised IndexError with fewer than two mismatches and left the
    # frames misaligned with more than two (or with test-only columns).
    shared_cols = [col for col in train_dummies.columns if col in test_dummies.columns]
    train_dummies = train_dummies[shared_cols]
    test_dummies = test_dummies[shared_cols]

    # CONTINUOUS + CATEGORICAL (merge defaults to an inner join on 'Id').
    merged_train = read_features('continous_train').merge(train_dummies, on='Id')
    merged_test = read_features('continous_test').merge(test_dummies, on='Id')

    # ORDINAL, then DISCRETE, joined onto the running result in that order.
    for group in ('ordinal', 'discrete'):
        merged_train = merged_train.merge(read_features(f'{group}_train'), on='Id')
        merged_test = merged_test.merge(read_features(f'{group}_test'), on='Id')

    # Use 'Id' as the index so it is not treated as a feature downstream.
    merged_train = merged_train.set_index('Id')
    merged_test = merged_test.set_index('Id')

    # FEATURE ENGINEERING: pairwise products, applied identically to both splits.
    interactions = {
        'Grla_tbsf': ('Gr Liv Area', 'Total Bsmt SF'),
        'ovl_ext': ('Overall Qual', 'Exter Qual'),
        'ovl_area': ('Overall Qual', 'Gr Liv Area'),
        'gr_ga': ('Gr Liv Area', 'Garage Area'),
    }
    for frame in (merged_train, merged_test):
        for new_col, (left, right) in interactions.items():
            frame[new_col] = frame[left] * frame[right]

    # Persist both splits; 'Id' is written as the first column via the index.
    merged_train.to_csv(f'{data_dir}/merged_train.csv')
    merged_test.to_csv(f'{data_dir}/merged_test.csv')

    return merged_train


In [508]:
# Build and save the merged tables; the merged training frame is shown as the cell output
clean_the_dfs(train=train_df, test=test_df)

Unnamed: 0_level_0,Total Bsmt SF,1st Flr SF,Gr Liv Area,Garage Area,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_Greens,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Overall Qual,Exter Qual,Bsmt Qual,Kitchen Qual,Year Built,Full Bath,TotRms AbvGrd,Garage Cars,Grla_tbsf,ovl_ext,ovl_area,gr_ga
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
109,725.0,725,1479,475.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,6,4,3,4,1976,2,6,2.0,1072275.0,24,8874,702525.0
544,913.0,913,2122,559.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,7,4,4,4,1996,2,8,2.0,1937386.0,28,14854,1186198.0
153,1057.0,1057,1057,246.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,5,3,3,4,1953,1,5,1.0,1117249.0,15,5285,260022.0
318,384.0,744,1444,400.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,3,4,3,2006,2,7,2.0,554496.0,15,7220,577600.0
255,676.0,831,1445,484.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,6,3,2,3,1900,2,6,2.0,976820.0,18,8670,699380.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1587,1884.0,1728,1728,520.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,8,4,4,4,2007,2,7,2.0,3255552.0,32,13824,898560.0
785,861.0,861,861,539.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,3,3,3,1940,1,4,2.0,741321.0,12,3444,464079.0
916,896.0,1172,1913,342.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,3,3,3,1928,1,9,2.0,1714048.0,18,11478,654246.0
639,1200.0,1200,1200,294.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,4,3,3,3,1956,1,6,1.0,1440000.0,12,4800,352800.0
