In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the raw training table from disk into a DataFrame
data = pd.read_csv('Housing_Prices/train.csv')

# Preview the first rows to check that the file parsed as expected
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Target: SalePrice kept as a one-column DataFrame (hence the double brackets)
y = data[['SalePrice']]
# Predictors: every column except the target and the Id column
X = data.drop(columns=['SalePrice', 'Id'])

# First hold out 10% of the rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
# ... then split 20% of the remaining rows off as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [10]:
# Sanity check: (rows, columns) of the held-out target split
y_test.shape

(146, 1)

In [11]:
# Sanity check: (rows, columns) of the held-out predictor split
X_test.shape

(146, 79)

In [4]:
# Write each target split to its own CSV file, leaving the row index out
for _csv_path, _targets in (
    ('data/y_train.csv', y_train),
    ('data/y_val.csv', y_val),
    ('data/y_test.csv', y_test),
):
    _targets.to_csv(_csv_path, index=False)

In [5]:
# Numeric predictors are the columns of X stored as float64 or int64
cont_features = [
    name for name in X.columns
    if X[name].dtype in (np.float64, np.int64)
]

# Keep only those numeric columns in each of the three splits
X_train_cont = X_train[cont_features]
X_val_cont = X_val[cont_features]
X_test_cont = X_test[cont_features]

# Learn per-column medians from the training split only, then fill the
# missing values of every split with those training medians.
# The imputer returns plain arrays, so the frames are rebuilt with the
# original column names (and a fresh default row index).
impute = SimpleImputer(strategy='median')
impute.fit(X_train_cont)
X_train_numeric = pd.DataFrame(impute.transform(X_train_cont), columns=cont_features)
X_val_numeric = pd.DataFrame(impute.transform(X_val_cont), columns=cont_features)
X_test_numeric = pd.DataFrame(impute.transform(X_test_cont), columns=cont_features)

In [13]:
# Sanity check: shape of the test split restricted to the numeric columns (before imputation)
X_test_cont.shape

(146, 36)

In [14]:
# Sanity check: shape of the imputed numeric test frame, for comparison with X_test_cont
X_test_numeric.shape

(146, 36)

In [6]:
# Write each imputed numeric split to its own CSV file, leaving the row index out
for _csv_path, _numeric in (
    ('data/X_train_numeric.csv', X_train_numeric),
    ('data/X_val_numeric.csv', X_val_numeric),
    ('data/X_test_numeric.csv', X_test_numeric),
):
    _numeric.to_csv(_csv_path, index=False)

In [7]:
# Categorical predictors are the columns of X that pandas stored with the
# generic `object` dtype. The builtin `object` is compared against directly:
# the `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24,
# so referencing it raises AttributeError on current NumPy.
features_cat = [col for col in X.columns if X[col].dtype == object]

# Select the categorical columns of each split and replace missing entries
# with the literal string 'missing'. fillna returns a new frame here, which
# avoids an in-place write on a selection taken from X_train / X_val / X_test.
X_train_cat = X_train.loc[:, features_cat].fillna(value='missing')
X_val_cat = X_val.loc[:, features_cat].fillna(value='missing')
X_test_cat = X_test.loc[:, features_cat].fillna(value='missing')

In [22]:
# Sanity check: shape of the categorical test frame.
# NOTE(review): the recorded output carries execution count 22, i.e. it was
# produced after the one-hot encoding cell had rebound X_test_cat to the
# encoded frame. On a fresh top-to-bottom run this cell reports the
# un-encoded categorical columns instead.
X_test_cat.shape

(146, 260)

In [21]:
# Sanity check: shape of the imputed numeric test frame (row count should match X_test_cat)
X_test_numeric.shape

(146, 36)

In [19]:
# Per-column dtype and non-null counts of the raw test split
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 892 to 1392
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     146 non-null    int64  
 1   MSZoning       146 non-null    object 
 2   LotFrontage    124 non-null    float64
 3   LotArea        146 non-null    int64  
 4   Street         146 non-null    object 
 5   Alley          5 non-null      object 
 6   LotShape       146 non-null    object 
 7   LandContour    146 non-null    object 
 8   Utilities      146 non-null    object 
 9   LotConfig      146 non-null    object 
 10  LandSlope      146 non-null    object 
 11  Neighborhood   146 non-null    object 
 12  Condition1     146 non-null    object 
 13  Condition2     146 non-null    object 
 14  BldgType       146 non-null    object 
 15  HouseStyle     146 non-null    object 
 16  OverallQual    146 non-null    int64  
 17  OverallCond    146 non-null    int64  
 18  YearBui

In [18]:
# Display the imputed numeric test frame.
# NOTE(review): this shows the whole frame (pandas truncates the rendering);
# X_test_numeric.head() would be a lighter preview.
X_test_numeric

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,20.0,70.0,8414.0,6.0,8.0,1963.0,2003.0,0.0,663.0,0.0,...,264.0,192.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2006.0
1,60.0,98.0,12256.0,8.0,5.0,1994.0,1995.0,362.0,1032.0,0.0,...,712.0,186.0,32.0,0.0,0.0,0.0,0.0,0.0,4.0,2010.0
2,30.0,56.0,8960.0,5.0,6.0,1927.0,1950.0,0.0,0.0,0.0,...,360.0,0.0,0.0,130.0,0.0,0.0,0.0,0.0,3.0,2010.0
3,50.0,50.0,5000.0,6.0,7.0,1947.0,1950.0,0.0,399.0,0.0,...,420.0,0.0,24.0,36.0,0.0,0.0,0.0,0.0,10.0,2006.0
4,20.0,89.0,12898.0,9.0,5.0,2007.0,2008.0,70.0,1022.0,0.0,...,912.0,228.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,2009.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,20.0,70.0,13300.0,5.0,7.0,1956.0,2000.0,0.0,377.0,0.0,...,252.0,261.0,0.0,156.0,0.0,0.0,0.0,0.0,6.0,2007.0
142,60.0,77.0,9206.0,6.0,5.0,1985.0,1985.0,336.0,0.0,0.0,...,476.0,192.0,46.0,0.0,0.0,0.0,0.0,0.0,6.0,2010.0
143,50.0,60.0,8400.0,6.0,5.0,1925.0,1950.0,0.0,423.0,0.0,...,576.0,342.0,0.0,128.0,0.0,0.0,0.0,0.0,6.0,2008.0
144,190.0,100.0,34650.0,5.0,5.0,1955.0,1955.0,0.0,1056.0,0.0,...,572.0,264.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0


In [16]:
# Sanity check: shape of the categorical test frame.
# NOTE(review): execution count 16 is later than the one-hot encoding cell
# (count 8), so the recorded output describes the encoded frame, not the
# raw categorical columns that exist at this point in a top-to-bottom run.
X_test_cat.shape

(146, 260)

In [8]:
# One-hot encode the categorical predictors.
# handle_unknown='ignore' means a category that appears only in the validation
# or test split (never in training) is encoded as all zeros instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore')

# Learn the category vocabulary from the training split only, then apply the
# same encoding to the validation and test splits
X_train_ohe = ohe.fit_transform(X_train_cat)
X_val_ohe = ohe.transform(X_val_cat)
X_test_ohe = ohe.transform(X_test_cat)

# Names of the generated indicator columns.
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out is its replacement. The fallback keeps the cell
# working on older scikit-learn releases.
if hasattr(ohe, 'get_feature_names_out'):
    cat_columns = ohe.get_feature_names_out(X_train_cat.columns)
else:
    cat_columns = ohe.get_feature_names(input_features=X_train_cat.columns)

# Convert the sparse encoder output into dense DataFrames.
# toarray() yields a regular ndarray (todense() yields the legacy np.matrix type).
# NOTE: X_train_cat / X_val_cat / X_test_cat are rebound to the encoded frames
# here, so this cell must not be re-run on its own: re-run the cell that
# builds the raw categorical frames first.
X_train_cat = pd.DataFrame(X_train_ohe.toarray(), columns=cat_columns)
X_val_cat = pd.DataFrame(X_val_ohe.toarray(), columns=cat_columns)
X_test_cat = pd.DataFrame(X_test_ohe.toarray(), columns=cat_columns)

In [9]:
# Write each one-hot encoded split to its own CSV file, leaving the row index out
for _csv_path, _encoded in (
    ('data/X_train_cat.csv', X_train_cat),
    ('data/X_val_cat.csv', X_val_cat),
    ('data/X_test_cat.csv', X_test_cat),
):
    _encoded.to_csv(_csv_path, index=False)