In [203]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [204]:
# Both CSV files live in the same directory, so its location is spelled out once.
DATA_DIR = r"/home/ahmed/Ai/Kaggle-Competitions-Notebooks/boston House price prediction/Date_set"

boston = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Both frames side by side, so clean-up steps that apply to each can loop over them.
combin = [boston, test]

# 1. Removing Redundant Features


In [205]:
# (rows, columns) of the training frame, followed by the test frame.
tuple(frame.shape for frame in (boston, test))

((1460, 81), (1459, 80))

##### 1.1. Remove columns with high missing values


In [206]:
# Percentage of missing values for every column of the training frame.
missing_pct = boston.isna().sum() * 100 / len(boston)

# [name, percentage] pairs for the columns that are more than 30% empty.
uneeded_columns = [
    [name, missing_pct[name]] for name in boston.columns if missing_pct[name] > 30
]

uneeded_columns

[['Alley', 93.76712328767124],
 ['MasVnrType', 59.726027397260275],
 ['FireplaceQu', 47.26027397260274],
 ['PoolQC', 99.52054794520548],
 ['Fence', 80.75342465753425],
 ['MiscFeature', 96.3013698630137]]

##### I will remove those columns from both datasets


In [207]:
# Only the names are needed here; the percentages stored next to them are ignored.
drop = [name for name, _ in uneeded_columns]

# Drop in place so that `boston`, `test` and the entries of `combin` keep
# pointing at the same (now narrower) frames.
for frame in combin:
    frame.drop(columns=drop, inplace=True)

tuple(frame.shape for frame in (boston, test))

((1460, 75), (1459, 74))

##### 1.2. Handling missing values


In [208]:
# Percentage of missing values for every column of the training frame.
missing_pct = boston.isna().sum() * 100 / len(boston)

null_num_columns = []  # [name, percentage] for numerical columns with gaps
null_cat_columns = []  # [name, percentage] for all other columns with gaps

for name in boston.columns:
    share = missing_pct[name]
    if share > 0:
        # Ask pandas whether the column is numeric instead of comparing the
        # dtype with "O": text columns stored with a dedicated string dtype
        # are not object-typed and would otherwise be routed to the
        # numerical list.
        if pd.api.types.is_numeric_dtype(boston[name]):
            null_num_columns.append([name, share])
        else:
            null_cat_columns.append([name, share])

print(null_num_columns, "\n")
print(null_cat_columns)

[['LotFrontage', 17.73972602739726], ['MasVnrArea', 0.547945205479452], ['GarageYrBlt', 5.5479452054794525]] 

[['BsmtQual', 2.5342465753424657], ['BsmtCond', 2.5342465753424657], ['BsmtExposure', 2.6027397260273974], ['BsmtFinType1', 2.5342465753424657], ['BsmtFinType2', 2.6027397260273974], ['Electrical', 0.0684931506849315], ['GarageType', 5.5479452054794525], ['GarageFinish', 5.5479452054794525], ['GarageQual', 5.5479452054794525], ['GarageCond', 5.5479452054794525]]


In [209]:
# Strip the percentages and keep only the column names.
num_columns = [name for name, _ in null_num_columns]
cat_columns = [name for name, _ in null_cat_columns]

# One list per line, separated by a blank line.
print(num_columns, cat_columns, sep="\n\n")

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']


##### Strategy:

If at most 3% of a column's values are missing, I will fill them with the mean (for numerical columns) or the mode (for categorical columns).

Otherwise, I will use a linear model or a tree-based model to predict the missing values.


##### Numerical values (boston data set)


In [210]:
def fill_numerical_values_with_linear_model(dataset, column):
    """Impute the missing values of a numerical column from ``SalePrice``.

    A linear regression with ``SalePrice`` as its only feature is fitted on
    every row where ``column`` is present and then used to predict the rows
    where it is missing. Predictions are truncated to whole numbers before
    they are written back.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing both ``column`` and a ``"SalePrice"`` column. It is
        modified in place.
    column : str
        Name of the numerical column to impute.

    Returns
    -------
    pandas.Series
        ``dataset[column]`` after imputation.

    Raises
    ------
    KeyError
        If ``column`` or ``"SalePrice"`` is not a column of ``dataset``.
    """
    # Missing rows are identified with a boolean mask rather than a sentinel
    # value, so genuine values (e.g. a real -1) are never mistaken for gaps.
    missing = dataset[column].isna()
    known = ~missing

    # Nothing to impute, or nothing to learn from: leave the column as it is
    # instead of asking the model to fit or predict on zero rows.
    if not missing.any() or not known.any():
        return dataset[column]

    # Fit on all rows with a known value; no evaluation split is needed
    # because the model is only used to fill the gaps.
    lin_reg = LinearRegression()
    lin_reg.fit(dataset.loc[known, ["SalePrice"]], dataset.loc[known, column])
    predicted = lin_reg.predict(dataset.loc[missing, ["SalePrice"]])

    # Imputed values are stored as whole numbers (fraction truncated toward zero).
    dataset.loc[missing, column] = predicted.astype(int)

    return dataset[column]


def fill_numerical_values_with_mean(dataset: pd.DataFrame, column: str) -> pd.Series:
    """Fill the missing values of a numerical column with the column mean.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing ``column``. It is modified in place.
    column : str
        Name of the numerical column to impute.

    Returns
    -------
    pandas.Series
        ``dataset[column]`` after imputation. A column without any
        non-missing value has a NaN mean and is therefore left unchanged.
    """
    column_mean = dataset[column].mean()
    dataset[column] = dataset[column].fillna(column_mean)
    return dataset[column]


# Impute every numerical training column that has gaps: columns with at most
# 3% missing values get the column mean, the rest are predicted by the linear
# model.
for name in num_columns:
    missing_share = boston[name].isna().sum() * 100 / len(boston)
    fill = (
        fill_numerical_values_with_mean
        if missing_share <= 3
        else fill_numerical_values_with_linear_model
    )
    boston[name] = fill(boston, name)

##### Categorical values (boston data set)


In [211]:
def fill_numerical_values_with_RF_model(dataset, column):
    """Impute the missing labels of a column with a random forest on ``SalePrice``.

    Despite the "numerical" in its name, this helper fits a classifier, so the
    imputed values are labels taken from the known values of ``column``. The
    forest uses ``SalePrice`` as its only feature, is fitted on every row
    where ``column`` is present and predicts the rows where it is missing.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing both ``column`` and a ``"SalePrice"`` column. It is
        modified in place.
    column : str
        Name of the column to impute.

    Returns
    -------
    pandas.Series
        ``dataset[column]`` after imputation.

    Raises
    ------
    KeyError
        If ``column`` or ``"SalePrice"`` is not a column of ``dataset``.
    """
    # Missing rows are identified with a boolean mask rather than a
    # placeholder string, so no real label can collide with the marker.
    missing = dataset[column].isna()
    known = ~missing

    # Nothing to impute, or nothing to learn from: leave the column as it is
    # instead of asking the model to fit or predict on zero rows.
    if not missing.any() or not known.any():
        return dataset[column]

    # Fit on all rows with a known label; the fixed seed makes the imputed
    # labels reproducible from one run to the next.
    RF = RandomForestClassifier(ccp_alpha=0.015, random_state=42)
    RF.fit(dataset.loc[known, ["SalePrice"]], dataset.loc[known, column])

    dataset.loc[missing, column] = RF.predict(dataset.loc[missing, ["SalePrice"]])

    return dataset[column]


def fill_numerical_values_with_mode(dataset, column):
    """Fill the missing values of ``column`` with its most frequent value.

    Despite the "numerical" in its name, the function works on any column
    whose values can be counted.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing ``column``. It is modified in place.
    column : str
        Name of the column to impute.

    Returns
    -------
    pandas.Series
        ``dataset[column]`` after imputation. When several values are tied
        for most frequent, the first one reported by ``Series.mode`` is used.
        A column without any non-missing value is returned unchanged.
    """
    modes = dataset[column].mode()

    # mode() is empty when every value is missing; there is nothing to fill
    # with, so leave the column untouched instead of failing on modes[0].
    if modes.empty:
        return dataset[column]

    dataset[column] = dataset[column].fillna(modes.iloc[0])
    return dataset[column]


# Impute every categorical training column that has gaps: columns with at most
# 3% missing values get the column mode, the rest are predicted by the random
# forest.
for name in cat_columns:
    missing_share = boston[name].isna().sum() * 100 / len(boston)
    fill = (
        fill_numerical_values_with_mode
        if missing_share <= 3
        else fill_numerical_values_with_RF_model
    )
    boston[name] = fill(boston, name)

##### Now let's fill the missing data in the test dataset


In [217]:
# Percentage of missing values for every column of the test frame.
missing_pct = test.isna().sum() * 100 / len(test)

null_num_columns = []  # [name, percentage] for numerical columns with gaps
null_cat_columns = []  # [name, percentage] for all other columns with gaps

for name in test.columns:
    share = missing_pct[name]
    if share > 0:
        # Ask pandas whether the column is numeric instead of comparing the
        # dtype with "O": text columns stored with a dedicated string dtype
        # are not object-typed and would otherwise be routed to the
        # numerical list.
        if pd.api.types.is_numeric_dtype(test[name]):
            null_num_columns.append([name, share])
        else:
            null_cat_columns.append([name, share])

print(null_num_columns, "\n")
print(null_cat_columns)

[['LotFrontage', 15.558601782042494], ['MasVnrArea', 1.0281014393420151], ['BsmtFinSF1', 0.06854009595613433], ['BsmtFinSF2', 0.06854009595613433], ['BsmtUnfSF', 0.06854009595613433], ['TotalBsmtSF', 0.06854009595613433], ['BsmtFullBath', 0.13708019191226867], ['BsmtHalfBath', 0.13708019191226867], ['GarageYrBlt', 5.346127484578479], ['GarageCars', 0.06854009595613433], ['GarageArea', 0.06854009595613433]] 

[['MSZoning', 0.27416038382453733], ['Utilities', 0.13708019191226867], ['Exterior1st', 0.06854009595613433], ['Exterior2nd', 0.06854009595613433], ['BsmtQual', 3.015764222069911], ['BsmtCond', 3.0843043180260454], ['BsmtExposure', 3.015764222069911], ['BsmtFinType1', 2.8786840301576424], ['BsmtFinType2', 2.8786840301576424], ['KitchenQual', 0.06854009595613433], ['Functional', 0.13708019191226867], ['GarageType', 5.2090472926662095], ['GarageFinish', 5.346127484578479], ['GarageQual', 5.346127484578479], ['GarageCond', 5.346127484578479], ['SaleType', 0.06854009595613433]]


##### Most of these columns have only a small percentage of missing values, so I will fill all of them with the mean (numerical) or the mode (categorical)


In [219]:
# Strip the percentages and keep only the column names.
num_columns = [name for name, _ in null_num_columns]
cat_columns = [name for name, _ in null_cat_columns]

# One list per line, separated by a blank line.
print(num_columns, cat_columns, sep="\n\n")

['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea']

['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType']


In [221]:
# Every gap in the test frame is filled with a simple statistic: the mode for
# categorical columns and the mean for numerical ones.
for column in cat_columns:
    test[column] = fill_numerical_values_with_mode(test, column)

for column in num_columns:
    test[column] = fill_numerical_values_with_mean(test, column)

# Per-column count of the values that are still missing, so the result of the
# imputation can be checked at a glance.
test.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 74, dtype: int64

In [222]:
# (rows, columns) of both frames once the imputation is done.
tuple(frame.shape for frame in (boston, test))

((1460, 75), (1459, 74))