In [12]:
# Importing the required modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [13]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Removing the sale price column from training data to process training and testing data together
df_final_train = pd.DataFrame({'SalePrice':df_train['SalePrice']})
df_train = df_train.drop('SalePrice', axis = 1)

# Creating a combined dataframe
df_combined = pd.concat([df_train, df_test], axis=0)

df_train.shape, df_test.shape, df_combined.shape, df_final_train.shape

((1460, 80), (1459, 80), (2919, 80), (1460, 1))

In [14]:
# Removing ID column
df_combined = df_combined.drop('Id', axis = 1)

df_combined.shape

(2919, 79)

In [15]:
# Checking for null values
percent_na_columns = df_combined.isnull().mean()

# Finding the column names of those having missing values greater than 10%
missing_columns = percent_na_columns[percent_na_columns>0.1].index

df_combined = df_combined.drop(missing_columns, axis = 1)

df_combined.shape

(2919, 72)

In [16]:
# Filling categorical missing data with mode for columns having less null values
categorical_columns = df_combined.select_dtypes(include=['object', 'category']).columns

na_sum = df_combined[categorical_columns].isnull().sum()
columns_below_threshold = na_sum[na_sum < 20].index
columns_above_threshold = pd.Series(na_sum[na_sum >= 20].index)

df_combined[columns_below_threshold] = df_combined[columns_below_threshold].fillna(df_combined[columns_below_threshold].mode().iloc[0])

# Filling non-categorical missing data with mean for columns having less null values
numerical_columns = df_combined.select_dtypes(include=['int', 'float']).columns

na_sum = df_combined[numerical_columns].isnull().sum()
num_columns_below_threshold = na_sum[na_sum < 20].index
num_columns_above_threshold = pd.Series(na_sum[na_sum >= 20].index)

df_combined[num_columns_below_threshold] = df_combined[num_columns_below_threshold].fillna(df_combined[num_columns_below_threshold].mean().iloc[0])

above_threshold = pd.Index(pd.concat([columns_above_threshold, num_columns_above_threshold], axis = 0))

In [17]:
# Filling all else data using k nearest neighbours
df_encoded = df_combined.copy()

encoder = LabelEncoder()
for col in categorical_columns:
    df_encoded[col] = encoder.fit_transform(df_combined[col].astype(str))
    
imputer = KNNImputer(n_neighbors = 3)

df_combined = pd.DataFrame(imputer.fit_transform(df_encoded), columns=df_encoded.columns)

for col in categorical_columns:
    df_combined[col] = df_combined[col].astype(int).astype('category').cat.categories[df_combined[col].astype(int)]

df_combined.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60.0,3,8450.0,1,3,3,0,4,0,5,...,61.0,0.0,0.0,0.0,0.0,0.0,2.0,2008.0,8,4
1,20.0,3,9600.0,1,3,3,0,2,0,24,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,8,4
2,60.0,3,11250.0,1,0,3,0,4,0,5,...,42.0,0.0,0.0,0.0,0.0,0.0,9.0,2008.0,8,4
3,70.0,3,9550.0,1,0,3,0,0,0,6,...,35.0,272.0,0.0,0.0,0.0,0.0,2.0,2006.0,8,0
4,60.0,3,14260.0,1,0,3,0,2,0,15,...,84.0,0.0,0.0,0.0,0.0,0.0,12.0,2008.0,8,4


In [18]:
# OneHotEncoding categorical columns and normalising data from numerical columns
df_combined = pd.get_dummies(df_combined, columns=categorical_columns)

df_combined[numerical_columns] = np.log1p(df_combined[numerical_columns])
df_combined[numerical_columns] = (df_combined[numerical_columns] - df_combined[numerical_columns].mean()) / (df_combined[numerical_columns].std())

In [19]:
similarity_threshold = 1
similar_columns = []

similar_columns = []
for i in range(df_combined.shape[1]):
    for j in range(i+1, df_combined.shape[1]):
        similarity = np.mean(df_combined.iloc[:, i] == df_combined.iloc[:, j])
        if similarity >= similarity_threshold:
            similar_columns.append(df_combined.columns[j])

similar_columns = set(similar_columns)

df_combined = df_combined.drop(similar_columns, axis =1)


In [20]:
df_train = df_combined.loc[:1459]
df_test_final = df_combined.loc[1460:]

print(df_final_train.shape)

df_final_train = pd.concat([df_train, df_final_train], axis = 1)

print(df_final_train.shape)

df_final_train.to_csv('final_train.csv', index = False)
df_test_final.to_csv('final_test.csv', index = False)

(1460, 1)
(1460, 273)
