In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
%matplotlib inline

In [2]:
# Read the test CSV, using its 'Id' column as the row index.
raw_test_path = '../data/test (1).csv'
df = pd.read_csv(raw_test_path, index_col='Id')

# Exploring the dataset

In [12]:
# Column-by-column overview: dtype and non-null count for every field.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 879 entries, 2658 to 1939
Data columns (total 79 columns):
PID                879 non-null int64
MS SubClass        879 non-null int64
MS Zoning          879 non-null object
Lot Frontage       719 non-null float64
Lot Area           879 non-null int64
Street             879 non-null object
Alley              58 non-null object
Lot Shape          879 non-null object
Land Contour       879 non-null object
Utilities          879 non-null object
Lot Config         879 non-null object
Land Slope         879 non-null object
Neighborhood       879 non-null object
Condition 1        879 non-null object
Condition 2        879 non-null object
Bldg Type          879 non-null object
House Style        879 non-null object
Overall Qual       879 non-null int64
Overall Cond       879 non-null int64
Year Built         879 non-null int64
Year Remod/Add     879 non-null int64
Roof Style         879 non-null object
Roof Matl          879 non-null object
E

In [13]:
# Summary statistics, transposed so that each row describes one column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PID,879.0,716505000.0,188913500.0,526302110.0,528486085.0,535454160.0,907192140.0,1007100000.0
MS SubClass,879.0,58.27076,42.21139,20.0,20.0,50.0,70.0,190.0
Lot Frontage,719.0,69.63004,23.62537,21.0,59.0,68.0,80.0,182.0
Lot Area,879.0,10340.92,10047.34,1477.0,7298.5,9453.0,11606.5,215245.0
Overall Qual,879.0,6.054608,1.374756,2.0,5.0,6.0,7.0,10.0
Overall Cond,879.0,5.565415,1.128422,1.0,5.0,5.0,6.0,9.0
Year Built,879.0,1970.534,30.40353,1880.0,1954.0,1972.0,2000.0,2010.0
Year Remod/Add,879.0,1984.445,20.45455,1950.0,1967.0,1992.0,2003.0,2010.0
Mas Vnr Area,878.0,106.9829,188.3568,0.0,0.0,0.0,173.5,1378.0
BsmtFin SF 1,879.0,443.397,442.4802,0.0,0.0,374.0,735.5,2288.0


In [14]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0_level_0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,0,,,,0,4,2006,WD
2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,...,0,0,0,,,,0,8,2006,WD
2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,...,0,0,0,,,,0,9,2006,New
1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,...,0,0,0,,,,0,7,2007,WD
625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,...,0,185,0,,,,0,7,2009,WD


# Some exploratory data analysis

# Data Cleaning

In [15]:
# 'Garage Yr Blt' may contain the impossible year 2207 (presumably a
# data-entry error). Replace it with the column mean computed WITHOUT the bad
# value, so the typo does not inflate its own replacement. Series.mean() skips
# NaN, so rows with no garage year do not affect the average. If no row
# matches, the assignment is a no-op.
garage_year_typo = df['Garage Yr Blt'] == 2207.0
df.loc[garage_year_typo, 'Garage Yr Blt'] = df.loc[~garage_year_typo, 'Garage Yr Blt'].mean()

In [16]:
# Replacement values for missing entries, keyed by column name.
# Each column is filled either with a label string that spells out the absence
# (e.g. 'No Garage') or with 0.0; 'Garage Yr Blt' is the exception and falls
# back to the mean of the column.
columns_na = {
    # Lot and exterior
    'Lot Frontage': 0.0,
    'Alley': 'No Alley',
    'Mas Vnr Type': 'None',
    'Mas Vnr Area': 0.0,

    # Basement
    'Bsmt Qual': 'No Basement',
    'Bsmt Cond': 'No Basement',
    'Bsmt Exposure': 'No Basement',
    'BsmtFin Type 1': 'No Basement',
    'BsmtFin SF 1': 0.0,
    'BsmtFin Type 2': 'No Basement',
    'BsmtFin SF 2': 0.0,
    'Bsmt Unf SF': 0.0,
    'Total Bsmt SF': 0.0,
    'Bsmt Full Bath': 0.0,
    'Bsmt Half Bath': 0.0,

    # Fireplace
    'Fireplace Qu': 'No Fireplace',

    # Garage
    'Garage Type': 'No Garage',
    'Garage Yr Blt': df['Garage Yr Blt'].mean(),
    'Garage Finish': 'No Garage',
    'Garage Cars': 0.0,
    'Garage Area': 0.0,
    'Garage Qual': 'No Garage',
    'Garage Cond': 'No Garage',

    # Miscellaneous
    'Pool QC': 'No Pool',
    'Fence': 'No Fence',
    'Misc Feature': 'None',
}


In [17]:
# Apply the per-column fill values; columns not listed in `columns_na` are
# left as they are.
df = df.fillna(value=columns_na)

In [18]:
# Total number of missing cells still present in the whole frame.
# NOTE(review): a non-zero result means some column is not covered by
# `columns_na` - confirm whether it needs its own fill before saving.
df.isnull().values.sum()

1

# Creating Dummy Variables

 Based on the scatter plots made in the exploratory data analysis section, I picked out the variables
 that are stored as numbers but are really categorical, and put them in a list so that I can convert
 them to the object dtype and make dummy columns out of them.

In [19]:
# Columns to be re-typed as `object` so that dummy columns are generated for
# them.
columns_to_object = [
    # Dwelling class and ratings
    'MS SubClass',
    'Overall Qual',
    'Overall Cond',
    # Room and fixture counts
    'Bsmt Full Bath',
    'Bsmt Half Bath',
    'Full Bath',
    'Half Bath',
    'Bedroom AbvGr',
    'Kitchen AbvGr',
    'Fireplaces',
    'Garage Cars',
    # Sale date
    'Mo Sold',
    'Yr Sold',
]

In [20]:
# Re-type every column listed in `columns_to_object` as `object` in a single
# pass, then print the frame summary to confirm the new dtypes.
df = df.astype({name: object for name in columns_to_object})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 879 entries, 2658 to 1939
Data columns (total 79 columns):
PID                879 non-null int64
MS SubClass        879 non-null object
MS Zoning          879 non-null object
Lot Frontage       879 non-null float64
Lot Area           879 non-null int64
Street             879 non-null object
Alley              879 non-null object
Lot Shape          879 non-null object
Land Contour       879 non-null object
Utilities          879 non-null object
Lot Config         879 non-null object
Land Slope         879 non-null object
Neighborhood       879 non-null object
Condition 1        879 non-null object
Condition 2        879 non-null object
Bldg Type          879 non-null object
House Style        879 non-null object
Overall Qual       879 non-null object
Overall Cond       879 non-null object
Year Built         879 non-null int64
Year Remod/Add     879 non-null int64
Roof Style         879 non-null object
Roof Matl          879 non-null obje

In [21]:
# One-hot encode every object-typed column. On a frame that holds only object
# columns, get_dummies encodes all of them and prefixes each indicator with
# its source column name by default.
categorical_cols = df.select_dtypes(include="object").columns
df_object = pd.get_dummies(df[categorical_cols])

# Append the indicator columns to the right of the existing frame; the source
# columns themselves are kept.
df = pd.concat([df, df_object], axis=1)

In [22]:
# NOTE(review): this drop is disabled, so the columns it names remain in `df`
# when the frame is saved. The list also names 'TotRms AbvGrd', which is not
# part of `columns_to_object`. Decide whether these source columns should be
# removed before saving, or delete this cell.
#df.drop(['MS SubClass', 'Overall Qual', 'Overall Cond', 'Bsmt Full Bath', 'Bsmt Half Bath',
         #'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd',
         #'Fireplaces', 'Garage Cars', 'Mo Sold', 'Yr Sold'], axis=1, inplace=True)

# Save the cleaned data

In [23]:
# Write the processed frame to disk; the index is written along with the data.
clean_test_path = '../data/clean_data_test.csv'
df.to_csv(clean_test_path)