# Read raw data: Ames Housing

Contents
 - load data
 - data manipulation
   - variables to use
   - dealing with categorical variables
   - train - val - test split
 - save data

Sources:
 - http://ww2.amstat.org/publications/jse/v19n3/decock.pdf
 - https://www.kaggle.com/c/house-prices-advanced-regression-techniques/

Copyright (C) 2018 Alan Chalk  
Please do not distribute or publish without permission.

## Start_.

### Import any packages needed

In [105]:
# Your code here to import any packages you need
import os

import pandas as pd #data manipulation
import numpy as np #data science package (math)
import re #to deal with regex
import pickle


In [106]:
# print the version of numpy and pandas.  e.g. print(np.__version__)
# Report the versions of the libraries in use (one line per library).
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("re version:", re.__version__)
# print("pickle version: "+pickle.__version__)

pandas version: 0.23.1
numpy version: 1.16.4
re version: 2.2.1


### Set directories and paths

In [107]:
# If convenient for you, define input and PData directories.  e.g. dirRawData = "../input/"
# Raw input files are read from dirInput; processed data is written to dirPData.
# Both are relative to the current working directory.
dirInput = "../input/"
dirPData = "../PData/"

# Show the working directory so the relative paths above can be sanity-checked.
print(os.getcwd())



/home/jovyan/Projects/AmesHousing/PCode


### Load data

 - Using the dictionary below (ames_dtypes), use the read_csv function in Pandas to read AmesHousing.txt
 - The name of the dataframe you create should be df_all
 - Inspect the top few lines to make sure they look sensible
 - Note that the column names contain . to separate words and they contain capitals, e.g. Lot.Frontage.  Change all column names so that capitals are replaced with lower case and periods (.) are replaced with underscores (_).  One way to do this is 
   - Use sub from the re package to replace . with underscore (_) in all column names
   - Use the .lower method on each string to change all letters to lower case

In [108]:
# Column name -> dtype mapping passed to pd.read_csv(dtype=...).
# Text columns use the builtin `object`: the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, and was only ever an alias for `object`.
ames_dtypes = {'Order': np.int64,
               'PID': np.int64,
               'MS.SubClass': object,
               'MS.Zoning': object,
               'Lot.Frontage': np.float64,
               'Lot.Area': np.float64,
               'Street': object,
               'Alley': object,
               'Lot.Shape': object,
               'Land.Contour': object,
               'Lot.Config': object,
               'Land.Slope': object,
               'Neighborhood': object,
               'Condition.1': object,
               'Condition.2': object,
               'Bldg.Type': object,
               'House.Style': object,
               'Overall.Qual': np.float64,
               'Overall.Cond': np.float64,
               'Year.Built': np.float64,
               'Year.Remod.Add': np.float64,
               'Roof.Style': object,
               'Roof.Matl': object,
               'Exterior.1st': object,
               'Exterior.2nd': object,
               'Mas.Vnr.Type': object,
               'Mas.Vnr.Area': np.float64,
               'Exter.Qual': object,
               'Exter.Cond': object,
               'Foundation': object,
               'Bsmt.Qual': object,
               'Bsmt.Cond': object,
               'Bsmt.Exposure': object,
               'BsmtFin.Type.1': object,
               'BsmtFin.SF.1': np.float64,
               'BsmtFin.Type.2': object,
               'BsmtFin.SF.2': np.float64,
               'Bsmt.Unf.SF': np.float64,
               'Total.Bsmt.SF': np.float64,
               'Heating': object,
               'Heating.QC': object,
               'Central.Air': object,
               'Electrical': object,
               'X1st.Flr.SF': np.float64,
               'X2nd.Flr.SF': np.float64,
               'Low.Qual.Fin.SF': np.float64,
               'Gr.Liv.Area': np.float64,
               'Bsmt.Full.Bath': np.float64,
               'Bsmt.Half.Bath': np.float64,
               'Full.Bath': np.float64,
               'Half.Bath': np.float64,
               'Bedroom.AbvGr': np.float64,
               'Kitchen.AbvGr': np.float64,
               'Kitchen.Qual': object,
               'TotRms.AbvGrd': np.float64,
               'Functional': object,
               'Fireplaces': np.float64,
               'Fireplace.Qu': object,
               'Garage.Type': object,
               'Garage.Yr.Blt': np.float64,
               'Garage.Finish': object,
               'Garage.Cars': np.float64,
               'Garage.Area': np.float64,
               'Garage.Qual': object,
               'Garage.Cond': object,
               'Paved.Drive': object,
               'Wood.Deck.SF': np.float64,
               'Open.Porch.SF': np.float64,
               'Enclosed.Porch': np.float64,
               'X3Ssn.Porch': np.float64,
               'Screen.Porch': np.float64,
               'Pool.Area': np.float64,
               'Fence': object,
               'Misc.Feature': object,
               'Misc.Val': np.float64,
               'Mo.Sold': np.float64,
               'Yr.Sold': np.float64,
               'Sale.Type': object,
               'Sale.Condition': object,
               'SalePrice': np.float64}
    


In [109]:
# read_csv defaults to a comma separator, but this file is separated with spaces, so
# sep=' ' is passed. dtype applies the ames_dtypes mapping defined earlier, and
# na_values='NA' makes the string 'NA' load as a missing value.
# NOTE(review): the displayed output has a leading 'Unnamed: 0' column, presumably a
# row index stored in the file - confirm whether index_col=0 is wanted here.
df_all = pd.read_csv(dirInput+'AmesHousing.txt',sep=' ', dtype = ames_dtypes, na_values = 'NA')
df_all.head() # inspect the first few rows

Unnamed: 0,Order,PID,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Street,Alley,Lot.Shape,Land.Contour,...,Screen.Porch,Pool.Area,Fence,Misc.Feature,Misc.Val,Mo.Sold,Yr.Sold,Sale.Type,Sale.Condition,SalePrice
0,1,526301100,MS_20,RL,141.0,31770.0,Pave,DoesNotHaveOne,IR1,Lvl,...,0.0,0.0,DoesNotHaveOne,DoesNotHaveOne,0.0,5.0,2010.0,WD,Normal,215000.0
1,2,526350040,MS_20,RH,80.0,11622.0,Pave,DoesNotHaveOne,Reg,Lvl,...,120.0,0.0,MnPrv,DoesNotHaveOne,0.0,6.0,2010.0,WD,Normal,105000.0
2,3,526351010,MS_20,RL,81.0,14267.0,Pave,DoesNotHaveOne,IR1,Lvl,...,0.0,0.0,DoesNotHaveOne,DoesNotHaveOne,12500.0,6.0,2010.0,WD,Normal,172000.0
3,4,526353030,MS_20,RL,93.0,11160.0,Pave,DoesNotHaveOne,Reg,Lvl,...,0.0,0.0,DoesNotHaveOne,DoesNotHaveOne,0.0,4.0,2010.0,WD,Normal,244000.0
4,5,527105010,MS_60,RL,74.0,13830.0,Pave,DoesNotHaveOne,IR1,Lvl,...,0.0,0.0,MnPrv,DoesNotHaveOne,0.0,3.0,2010.0,WD,Normal,189900.0


In [110]:
# Note that the column names contain . to separate words and they contain capitals, e.g. Lot.Frontage. 
# Change all column names so that capitals are replaced with lower case and periods (.) are replaced with 
# underscores (_). One way to do this is:
# Use sub from the re package to replace . with underscore (_) in all column names
# Use the .lower method on each string to change all letters to lower case


# df_all.columns is a pandas Index, which is awkward to edit directly, so take the
# underlying array of column names (.values) and work on that instead.
columns = df_all.columns.values
#write a function to replace . with _ and convert string chars to lowercase
# test = "This.Is.A.Test" # to test function
def convert(colname):
    """Return colname with every '.' replaced by '_' and all letters lower-cased.

    Example: convert("Lot.Frontage") returns "lot_frontage".
    """
    # '.' is a regex metacharacter, so it is escaped. The pattern is a raw string
    # because '\.' in a normal literal is an invalid escape sequence, which
    # Python reports with a warning.
    with_underscores = re.sub(r'\.', '_', colname)
    return with_underscores.lower()
# print(convert(test))
#Apply function. Naively can do using forloop
# for idx, colname in enumerate(columns):
#     columns[idx] = convert(colname)
# print(columns)

# Build the cleaned names in one pass. list(map(convert, columns)) is equivalent;
# map() returns a lazy iterator, so it would have to be wrapped in list().
columns = [convert(name) for name in columns]

# Install the cleaned names on the dataframe and check the result.
df_all.columns = columns
df_all.head()

Unnamed: 0,order,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,saleprice
0,1,526301100,MS_20,RL,141.0,31770.0,Pave,DoesNotHaveOne,IR1,Lvl,...,0.0,0.0,DoesNotHaveOne,DoesNotHaveOne,0.0,5.0,2010.0,WD,Normal,215000.0
1,2,526350040,MS_20,RH,80.0,11622.0,Pave,DoesNotHaveOne,Reg,Lvl,...,120.0,0.0,MnPrv,DoesNotHaveOne,0.0,6.0,2010.0,WD,Normal,105000.0
2,3,526351010,MS_20,RL,81.0,14267.0,Pave,DoesNotHaveOne,IR1,Lvl,...,0.0,0.0,DoesNotHaveOne,DoesNotHaveOne,12500.0,6.0,2010.0,WD,Normal,172000.0
3,4,526353030,MS_20,RL,93.0,11160.0,Pave,DoesNotHaveOne,Reg,Lvl,...,0.0,0.0,DoesNotHaveOne,DoesNotHaveOne,0.0,4.0,2010.0,WD,Normal,244000.0
4,5,527105010,MS_60,RL,74.0,13830.0,Pave,DoesNotHaveOne,IR1,Lvl,...,0.0,0.0,MnPrv,DoesNotHaveOne,0.0,3.0,2010.0,WD,Normal,189900.0


### Store the dataset and relevant variables

 - Create a dict which contains 'df_all' only with a key of df_all, e.g. dict_ = {'df_all': df_all}
 - Use pickle.dump to save the dict to ../PData/01_df_all.pickle (i.e. the file 01_df_all.pickle inside the PData directory)

If you have not used pickle before, see the examples at the bottom of this page: https://docs.python.org/3/library/pickle.html

In [111]:
#store = pd.HDFStore(dirPData + '01_df_all.h5')
#df_all.to_hdf(store, 'df_all')
#store.close()

In [112]:
# Wrap the dataframe in a dict under the key 'df_all'; this dict is what gets pickled.
dict_ = {'df_all': df_all}

# Full path of the output file inside the processed-data directory.
f_name = dirPData +'01_df_all.pickle'

with open(f_name, 'wb') as f: # 'wb' = write, binary mode
    # No protocol argument is passed, so pickle's default protocol is used
    # (not HIGHEST_PROTOCOL).
    pickle.dump(dict_, f) # write the dict to the open file
    # the with-block closes the file automatically on exit

del f_name # remove the temporary path variable from the namespace