Based on the analysis done in the `eda.ipynb`, we can come up with the following data transformation which are required.

1) Handling NULL values
2) Handling Outliers
3) Handling categorical data
4) Feature Engineering if any
5) Dimentionality Reduction techniques

In [60]:
%%capture
%run eda.ipynb
# we can run other notebooks and use it's variables and imports

In [61]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import stats

In [62]:
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

### **Handling NULL values**

In [63]:
# For the records which are having significant NULL values, I am going to drop those columns
# getting null value counts
null_counts = train_data.isnull().sum()

print("Null Values in Training Data:")
print("="*30)
sorted_null_counts = null_counts[null_counts > 0].sort_values(ascending=False)
print(sorted_null_counts)

Null Values in Training Data:
PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
Electrical         1
dtype: int64


In [64]:
# For the rest of the data I am going to impute the numerical data with mean and categorical data with the mode
def handle_null(data):
    data.drop(columns = ['PoolQC', 'Alley', 'MiscFeature', 'Fence', 'MasVnrType', 'FireplaceQu'], inplace = True, axis = 1)
    
    for column in data.columns:
        if data[column].dtype == 'float64' or data[column].dtype == 'int64':
            data[column].fillna(data[column].mean(), inplace=True)
        elif data[column].dtype == 'object':
            data[column].fillna(data[column].mode()[0], inplace=True)
            
    return data

In [65]:
train_data_cp = train_data.copy()
train_data_clean = handle_null(train_data_cp)

In [66]:
null_counts = train_data_clean.isnull().sum()

print("Null Values in Training Data:")
print("="*30)
sorted_null_counts = null_counts[null_counts > 0].sort_values(ascending=False)
print(sorted_null_counts)

Null Values in Training Data:
Series([], dtype: int64)


In [67]:
test_data_cp = test_data.copy()
test_data_clean = handle_null(test_data_cp)

### **Handling Outliers**

In [68]:
out_list_train = ['SalePrice', 'LotFrontage', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'GarageArea']

out_list_test = ['LotFrontage', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'GarageArea']

# using zscore to remove outiers
# the standard score is the number of standard deviations by which the value of a raw score is above or below the mean value of what is being observed or measured.
train_data_clean = train_data_clean[((np.abs(stats.zscore(train_data_clean[out_list_train])) < 3)).all(axis=1)]
test_data_clean = test_data_clean[((np.abs(stats.zscore(test_data_clean[out_list_test])) < 3)).all(axis=1)]

### **Handling skewed data**

In [69]:
def handle_skew_data(df, skew_threshold=1):
    numerical_features = df.select_dtypes(include=[np.number])
    
    skewness = numerical_features.apply(lambda x: stats.skew(x.dropna()))
    skewed_features = skewness[skewness > skew_threshold].index
    df_transformed = df.copy()
    df_transformed[skewed_features] = np.log1p(df[skewed_features])
    return df_transformed


In [70]:
train_data_clean = handle_skew_data(train_data_clean)
test_data_clean = handle_skew_data(test_data_clean)

### **Feature Selection**

In [71]:
# selecting only few features based on the correlation matrix we have got
wanted_feat = ['OverallQual', 'GrLivArea', '2ndFlrSF', 'GarageCars', 'TotalBsmtSF', 'GarageArea', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'BsmtFinSF1', 'SalePrice']
unwanted_feat = list(set(train_data_clean.select_dtypes(include=["float64", "int64"]).columns.to_list()) - set(wanted_feat))
train_data_clean.drop(columns= unwanted_feat, axis=1, inplace=True)
test_data_clean.drop(columns= unwanted_feat, axis=1, inplace=True)

### **Encoding Data**

In [72]:
# Label encoding - for xgboost and decision trees
def label_encode_categorical(data):
    data_encoded = data.copy()

    for col in data.columns:
        if data[col].dtype == 'object':
            data_encoded[col] = LabelEncoder().fit_transform(data[col])

    return data_encoded

In [73]:
train_data_label_encoded = label_encode_categorical(train_data_clean)
test_data_label_encoded = label_encode_categorical(test_data_clean)

In [87]:
# one-hot encoding - for linear
def one_hot_encode_categorical(data):
    print('before get_dummies() shape:', data.shape)
    df_ohe = pd.get_dummies(data)
    print('after get_dummies() shape:', df_ohe.shape)
    pd.set_option("display.max_columns",300)
    df_ohe.reset_index(drop = True)
    return df_ohe

In [88]:
train_data_one_hot_encoded = one_hot_encode_categorical(train_data_clean)
test_data_one_hot_encoded = one_hot_encode_categorical(test_data_clean)

before get_dummies() shape: (1407, 51)
after get_dummies() shape: (1407, 242)
before get_dummies() shape: (1413, 50)
after get_dummies() shape: (1413, 226)


In [89]:
train_data_label_encoded.to_csv("./data/label_encoded_train.csv", index=False)
test_data_label_encoded.to_csv("./data/label_encoded_test.csv", index=False)

In [90]:
train_data_one_hot_encoded.to_csv("./data/onehot_encoded_train.csv", index=False)
test_data_one_hot_encoded.to_csv("./data/onehot_encoded_test.csv", index=False)