In [None]:
# Interpreter info, dataframes and plotting
import sys
import pandas as pd
import matplotlib.pyplot as plt

# scikit-learn utilities: data splitting, encoders/discretizer, imputation, column-wise transformers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

import numpy as np
# Print the version string of the Python interpreter running this notebook
print(sys.version)

# Read data from CSV

`pd.read_csv`: takes in a csv and returns a pandas dataframe.

`train_test_split`: splits arrays or matrices into random train and test sets.
- `train_size`: value between 0 and 1 giving the fraction of the dataset placed in the training set.
- `test_size`: value between 0 and 1 giving the fraction of the dataset placed in the test set.
- `random_state`: state used for shuffling. To replicate results, use same `random_state`.

In the following cells, we load the data and then separate the `SalePrice` label column from the feature columns.

In [None]:
# Load the train and test CSV files, using the "Id" column as the index
data = pd.read_csv('../datasets/kaggle-house-prices/train.csv', index_col="Id")
X_test_full = pd.read_csv('../datasets/kaggle-house-prices/test.csv', index_col="Id")

# First discard rows whose SalePrice label is missing, then discard every
# column that still contains at least one missing value
data = data.dropna(axis=0, subset=['SalePrice']).dropna(axis=1)
data.head()


In [None]:
# Target column (SalePrice) on one side, every remaining column as features on the other
y = data["SalePrice"]
X = data.drop(columns=["SalePrice"])

# 80% of the rows go to training and 20% to validation;
# the fixed random_state makes the shuffle reproducible
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

## Some tools to get dataframe information

In [None]:
# A notebook cell only renders its last bare expression, so the first two
# summaries are sent to display() explicitly (display is provided by the
# IPython kernel); otherwise their results would be silently discarded.
display(X_train_full.describe())   # summary statistics of the numeric columns
display(X_train_full.head())       # first rows of the training features
X_train_full.columns               # names of all feature columns

# Data Preprocessing

Low cardinality columns: columns that have a low number of unique items. This allows us to do some learning based on these cols without adding too much noise.

Numeric columns: columns with numeric values as opposed to "object" columns.

In this example, we select the columns to train based on columns that are numeric and columns with low cardinality. You may want to change this depending on your dataset.

In [None]:
# Categorical columns with low cardinality:
# object-dtype columns holding fewer than 10 distinct values
low_cardinality_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype == "object" and X_train_full[name].nunique() < 10
]

# Numeric columns: those stored as int64 or float64
numeric_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype in ['int64', 'float64']
]

# Keep only the selected columns (numeric first, then low-cardinality
# categorical) in each of the three splits, as independent copies
selected_cols = numeric_cols + low_cardinality_cols
X_train, X_valid, X_test = (
    frame[selected_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test_full)
)

X_train.head()

In [None]:
# Independent copies of the numeric columns of each split
X_train_numeric, X_valid_numeric, X_test_numeric = (
    frame[numeric_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test_full)
)

# Independent copies of the low-cardinality categorical columns of each split
X_train_categorical, X_valid_categorical, X_test_categorical = (
    frame[low_cardinality_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test_full)
)

# Categorical Variables

Drop Categorical: Remove all columns with non-numeric data.

One-Hot encoding: Add "one-hot" columns to represent different categories.

Ordinal: Assign each category a different numeric value.

## Drop categorical

Simply remove all columns with non-numeric data.

In [None]:
# A different way to select numeric columns than above:
# keep every column whose dtype is not "object".
X_train_numeric = X_train.select_dtypes(exclude=['object'])
# The validation frame must be derived from X_valid (not X_train) so that its
# rows and index match the validation labels and encoded validation columns.
X_valid_numeric = X_valid.select_dtypes(exclude=['object'])

## One-Hot encoding

A technique that replaces a categorical column with one new column per category. In each row, the column matching that row's category is "hot" (1) and the rest are 0.

For example, a `Size` column becomes three columns, `Small`, `Medium` and `Large`:

|Size|Small|Medium|Large|
|-|-|-|-|
|Small|1|0|0|
|Large|0|0|1|
|Medium|0|1|0|

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-Hot Encoding
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`, and 1.4
# removed the old name. Try the current keyword first and fall back to the
# legacy one so the cell runs on both old and new installations.
try:
    OHEncoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OHEncoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Learn the categories from the training split only; categories that appear
# only in validation are encoded as all zeros (handle_unknown='ignore').
OHEncoder.fit(X_train[low_cardinality_cols])

# The encoder returns plain arrays, so re-attach the original row index to
# keep the encoded columns aligned with the numeric columns in the concat below.
OH_train_cols = pd.DataFrame(OHEncoder.transform(X_train[low_cardinality_cols]),
                             index=X_train.index)
OH_valid_cols = pd.DataFrame(OHEncoder.transform(X_valid[low_cardinality_cols]),
                             index=X_valid.index)

# Numeric columns side by side with the one-hot columns
X_train_OH = pd.concat([X_train_numeric, OH_train_cols], axis=1)
X_valid_OH = pd.concat([X_valid_numeric, OH_valid_cols], axis=1)

In [None]:
# This is a very easy way to do one-hot encoding
X_train_OH = pd.get_dummies(X_train)

# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding each split separately can produce different column
# sets. Reindex validation and test to the training columns: categories missing
# from a split become all-zero columns, categories unseen in training are dropped.
X_valid_OH = pd.get_dummies(X_valid).reindex(columns=X_train_OH.columns, fill_value=0)
X_test_OH = pd.get_dummies(X_test).reindex(columns=X_train_OH.columns, fill_value=0)
X_train_OH.head()

# Ordinal Encoding
Replace each categorical column with a single numeric column by assigning every category its own number.

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Names of the non-numeric (object-dtype) columns
object_cols = [name for name, dtype in X_train.dtypes.items() if dtype == "object"]

# A column is "good" when every value found in the validation split also
# occurs in the training split, i.e. the encoder will not meet unknown values
good_label_cols = [
    name for name in object_cols
    if set(X_valid[name]) <= set(X_train[name])
]

# The remaining object columns contain values unseen in training: drop them
bad_label_cols = list(set(object_cols).difference(good_label_cols))
ordinal_X_train = X_train.drop(columns=bad_label_cols)
ordinal_X_valid = X_valid.drop(columns=bad_label_cols)

# Fit the encoder on the training split and overwrite the good columns in
# both splits with their ordinal codes
ordinal_encoder = OrdinalEncoder()
ordinal_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
ordinal_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])
ordinal_X_train[good_label_cols].head()

In [None]:
# Binning: cut LotArea into `num_bins` equal-width intervals
min_lotArea = X_train.LotArea.min()
max_lotArea = X_train.LotArea.max()
num_bins = 50
# n bins are delimited by n + 1 edges; with only `num_bins` edges the result
# would contain num_bins - 1 bins.
bins = np.linspace(min_lotArea, max_lotArea, num_bins + 1)
# One label per bin: "0" .. str(num_bins - 1)
labels = [str(i) for i in range(num_bins)]

# include_lowest=True closes the first interval on the left so the minimum
# LotArea value falls into bin "0" instead of becoming NaN
lotArea_binned = pd.cut(X_train.LotArea, bins=bins, labels=labels, include_lowest=True)

In [None]:
# Data Imputation
# SimpleImputer's default strategy replaces missing values with the column
# mean, which is only defined for numeric data, so impute the numeric columns
# only (X_train also contains object-dtype categorical columns).
imputer = SimpleImputer()

# Fit on the training split only, then reuse the learned means for validation;
# re-fitting on validation would fill it with its own statistics.
# The imputer returns plain arrays, so restore column names and row index.
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train[numeric_cols]),
                               columns=numeric_cols, index=X_train.index)
imputed_X_valid = pd.DataFrame(imputer.transform(X_valid[numeric_cols]),
                               columns=numeric_cols, index=X_valid.index)