# Examples for Paper

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Read the two CSV splits from the current working directory.
train, test = map(pd.read_csv, ("train.csv", "test.csv"))

In [3]:
# Take the columns at positions 1 through 79 from each split (column 0 is left
# out), stack train on top of test, and renumber the rows from 0.
house = pd.concat(
    [split.iloc[:, range(1, 80)] for split in (train, test)]
).reset_index(drop=True)

## Unrealistic and Missing Data

In [4]:
# Row 2592 of GarageYrBlt holds an implausible year; keep the raw value so it
# can be reported before and after the correction.
GarageYrBlt_unrealistic_data = house["GarageYrBlt"].loc[2592]
print(
    f"Example of unrealistic data in the feature GarageYrBlt: The year {GarageYrBlt_unrealistic_data}."
)

Example of unrealistic data in the feature GarageYrBlt: The year 2207.0.


In [5]:
# Overwrite the implausible year in row 2592 with 2007, the value judged to
# make sense for this entry.
GarageYrBlt_realistic_data = 2007
house.loc[2592, "GarageYrBlt"] = GarageYrBlt_realistic_data
print(
    f"The unrealistic year {GarageYrBlt_unrealistic_data} is now the realistic year {GarageYrBlt_realistic_data}."
)

The unrealistic year 2207.0 is now the realistic year 2007.


In [6]:
# Count the null values in the quantitative feature GarageYrBlt.
# isnull().sum() already yields the scalar count, so no outer np.sum is needed.
GarageYrBlt_NA = house["GarageYrBlt"].isnull().sum()
print(f"There are {GarageYrBlt_NA} null values in the quantitative feature GarageYrBlt.")

There are 159 null values in the feature quantitative feature GarageYrBlt.


In [7]:
# Fill every null GarageYrBlt entry with the column mean. The mean is taken
# before filling, so it is computed from the non-null values only.
mean_value = house["GarageYrBlt"].mean()
house["GarageYrBlt"] = house["GarageYrBlt"].fillna(mean_value)

In [8]:
# Re-count the nulls in GarageYrBlt to confirm that the fill left none behind.
GarageYrBlt_NA_after = house["GarageYrBlt"].isna().sum()
print(
    f"There are {GarageYrBlt_NA_after} null values in the quantitative feature GarageYrBlt."
)

There are 0 null values in the quantitative feature GarageYrBlt.


In [9]:
# Show the mean that was used to fill the null GarageYrBlt values.
print(f"Mean: {mean_value}.")

Mean: 1978.0409420289855.


In [10]:
# Cross-check of mean_value: Series.mean() does not take null values into
# consideration, so it equals the mean of the values that are not null. This
# loop recomputes that mean by hand.
# The null test has to be pd.notnull(): a comparison such as `value != 0` is
# True for NaN and would let missing values into the running total.
# NOTE: this cell runs after the nulls in GarageYrBlt were replaced by the mean
# (In [7]), so nothing is actually skipped here. Adding copies of the mean
# leaves the mean unchanged, apart from floating-point rounding in the last
# digits of the printed value.
total = 0  # named `total` so the built-in sum() is not shadowed in later cells
number = 0
for value in house["GarageYrBlt"]:
    if pd.notnull(value):
        total += value
        number += 1

# With no non-null values there is nothing to average; report NaN instead of
# dividing by zero.
print("Mean:", total/number if number else float("nan"))

Mean: 1978.0409420289902


In [11]:
# Count the missing entries in the qualitative feature GarageType.
GarageType_NA = house["GarageType"].isna().sum()
print(
    f"There are {GarageType_NA} null values in the qualitative feature GarageType."
)

There are 157 null values in the qualitative feature GarageType.


## Qualitative to Quantitative Data

In [12]:
# First integer encoding of GarageType. This runs before the null values of
# GarageType are filled in; the column is encoded again in In [16] afterwards.
# The second return value (the distinct labels) is not needed and goes to `_`.
house["GarageType_encoded"], _ = house["GarageType"].factorize()

In [13]:
# Replaces the null values with the mode of the feature.
# mode() returns the most frequent value(s); [0] picks the first of them.
mode_value = house["GarageType"].mode()[0]

# fillna() substitutes the mode for every null entry in one vectorized call,
# instead of testing and appending each element in a Python loop.
house["GarageType"] = house["GarageType"].fillna(mode_value)

In [14]:
# Re-count the nulls in GarageType to confirm that the fill left none behind.
GarageType_NA_after = house["GarageType"].isna().sum()
print(
    f"There are {GarageType_NA_after} null values in the qualitative feature GarageType."
)

There are 0 null values in the qualitative feature GarageType.


In [15]:
# Show the mode that was used to fill the null GarageType values.
print(mode_value)

Attchd


In [16]:
# Encode GarageType as integer codes again, now that its null values have been
# filled, overwriting the earlier GarageType_encoded column.
house["GarageType_encoded"], _ = house["GarageType"].factorize()
# The encoded column stands in for the text column, which is removed in place.
house.drop("GarageType", axis="columns", inplace=True)