# What to try in this notebook?

#### 1. Get a random number generated dataset from kaggle, use one column and create missing (1%, 5%, 10%), scale values, apply KNN, MEAN imputation. Compare the results and compute mean() and var() for the list of differences between org. and Imputed value 

Dataset - https://www.kaggle.com/timoboz/random-numbers

#### 2. Use a housing dataset from UCI, use one column and create missing (1%, 5%, 10%), scale values, apply KNN, MEAN imputation. Compare the results and compute mean() and var() for the list of differences between org. and Imputed value 

Dataset - https://github.com/nikbearbrown/AI_Research_Group/blob/main/Awesome-UCI-Datasets/Classification/House_Price_predication/train.csv

In [101]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# 1.1 Random Numbers dataset

In [102]:
random_dataset = pd.read_csv('random_numbers_1000.csv')

In [103]:
random_dataset.sample(10)

Unnamed: 0.1,Unnamed: 0,number
782,782,0.955151
378,378,0.310217
542,542,0.607177
80,80,0.861696
282,282,0.204316
976,976,0.059688
924,924,0.372837
329,329,0.406915
131,131,0.40242
607,607,0.078909


In [104]:
random_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1000 non-null   int64  
 1   number      1000 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 15.8 KB


In [105]:
random_dataset['number'].describe()

count    1000.000000
mean        0.490463
std         0.284669
min         0.000068
25%         0.252124
50%         0.479825
75%         0.735584
max         0.997610
Name: number, dtype: float64

#### Create 3 col. for numbers for 1%, 5% and 10% missing data

In [106]:
df_number = random_dataset[['number']]
df_number['number_copy_1_percent'] = df_number[['number']]
df_number['number_copy_5_percent'] = df_number[['number']]
df_number['number_copy_10_percent'] = df_number[['number']]
df_number

Unnamed: 0,number,number_copy_1_percent,number_copy_5_percent,number_copy_10_percent
0,0.144616,0.144616,0.144616,0.144616
1,0.077515,0.077515,0.077515,0.077515
2,0.155933,0.155933,0.155933,0.155933
3,0.097209,0.097209,0.097209,0.097209
4,0.323750,0.323750,0.323750,0.323750
...,...,...,...,...
995,0.182107,0.182107,0.182107,0.182107
996,0.787988,0.787988,0.787988,0.787988
997,0.148707,0.148707,0.148707,0.148707
998,0.153121,0.153121,0.153121,0.153121


#### Check % missing values in this dataframe

In [107]:
def get_percent_missing(dataframe):
    
    percent_missing = dataframe.isnull().sum() * 100 / len(dataframe)
    missing_value_df = pd.DataFrame({'column_name': dataframe.columns,
                                     'percent_missing': percent_missing})
    return missing_value_df

In [108]:
print(get_percent_missing(df_number))

                                   column_name  percent_missing
number                                  number              0.0
number_copy_1_percent    number_copy_1_percent              0.0
number_copy_5_percent    number_copy_5_percent              0.0
number_copy_10_percent  number_copy_10_percent              0.0


#### Create missing helper fn

In [109]:
def create_missing(dataframe, percent, col):
    dataframe.loc[dataframe.sample(frac = percent).index, col] = np.nan

#### Create missing data in each col

In [110]:
create_missing(df_number, 0.01, 'number_copy_1_percent')
create_missing(df_number, 0.05, 'number_copy_5_percent')
create_missing(df_number, 0.1, 'number_copy_10_percent')

#### Check % missing after removing data

In [111]:
print(get_percent_missing(df_number))

                                   column_name  percent_missing
number                                  number              0.0
number_copy_1_percent    number_copy_1_percent              1.0
number_copy_5_percent    number_copy_5_percent              5.0
number_copy_10_percent  number_copy_10_percent             10.0


#### Store the indices of missing rows

In [112]:
# Store Index of NaN values in each coloumns
number_1_idx = list(np.where(df_number['number_copy_1_percent'].isna())[0])
number_5_idx = list(np.where(df_number['number_copy_5_percent'].isna())[0])
number_10_idx = list(np.where(df_number['number_copy_10_percent'].isna())[0])

In [113]:
print(f"Length of number_1_idx is {len(number_1_idx)} and it contains {(len(number_1_idx)/len(df_number['number_copy_1_percent']))*100}% of total data in column | Total rows: {len(df_number['number_copy_1_percent'])}")
print(f"Length of number_5_idx is {len(number_5_idx)} and it contains {(len(number_5_idx)/len(df_number['number_copy_1_percent']))*100}% of total data in column | Total rows: {len(df_number['number_copy_1_percent'])}")
print(f"Length of number_10_idx is {len(number_10_idx)} and it contains {(len(number_10_idx)/len(df_number['number_copy_1_percent']))*100}% of total data in column | Total rows: {len(df_number['number_copy_1_percent'])}")

Length of number_1_idx is 10 and it contains 1.0% of total data in column | Total rows: 1000
Length of number_5_idx is 50 and it contains 5.0% of total data in column | Total rows: 1000
Length of number_10_idx is 100 and it contains 10.0% of total data in column | Total rows: 1000


### Perform KNN impute to df_number dataframe

In [114]:
df_number1 = df_number.copy(deep=True)
imputer = KNNImputer(n_neighbors=5)
imputed_number_df = pd.DataFrame(imputer.fit_transform(df_number1), columns = df_number1.columns)


In [115]:
imputed_number_df.sample(10)

Unnamed: 0,number,number_copy_1_percent,number_copy_5_percent,number_copy_10_percent
347,0.372389,0.372389,0.372389,0.372389
934,0.327766,0.327766,0.327766,0.327766
927,0.753892,0.753892,0.753892,0.753892
997,0.148707,0.148707,0.148707,0.148707
167,0.730901,0.730901,0.730901,0.730901
914,0.84133,0.84133,0.84133,0.84133
432,0.897466,0.897466,0.897466,0.897466
587,0.411685,0.411685,0.411685,0.411685
884,0.378794,0.378794,0.378794,0.378794
379,0.265429,0.265429,0.265429,0.264843


#### Check the % missing data in dataframe now

In [116]:
print(get_percent_missing(imputed_number_df))

                                   column_name  percent_missing
number                                  number              0.0
number_copy_1_percent    number_copy_1_percent              0.0
number_copy_5_percent    number_copy_5_percent              0.0
number_copy_10_percent  number_copy_10_percent              0.0


#### Store the list of differences between org. and Imputed value

In [117]:
# create list of difference bwtween imputed and orginal value

number_diff_1 = []
number_diff_5 = []
number_diff_10 = []
count = 0

for i in number_1_idx:
    count +=1
    diff1 = abs(imputed_number_df['number_copy_1_percent'][i] - df_number1['number'][i])
    number_diff_1.append(diff1)
    

for i in number_5_idx:
    diff5 = abs(imputed_number_df['number_copy_5_percent'][i] - df_number1['number'][i])
    number_diff_5.append(diff5)

for i in number_10_idx:
    diff10 = abs(imputed_number_df['number_copy_10_percent'][i] - df_number1['number'][i])
    number_diff_10.append(diff10)

In [118]:
print(len(number_diff_1))
print(len(number_diff_5))
print(len(number_diff_10))

10
50
100


### Calculate the mean and varience of list of differences KNN

In [119]:
m1 = sum(number_diff_1) / len(number_diff_1)

# calculate variance using a list comprehension
var_res1 = sum((xi - m1) ** 2 for xi in number_diff_1) / len(number_diff_1)

m5 = sum(number_diff_5) / len(number_diff_5)

# calculate variance using a list comprehension
var_res5 = sum((xii - m5) ** 2 for xii in number_diff_5) / len(number_diff_5)


m10 = sum(number_diff_10) / len(number_diff_10)

# calculate variance using a list comprehension
var_res10 = sum((xiii - m10) ** 2 for xiii in number_diff_10) / len(number_diff_10)

print(f"The mean of 1% is {m1} and varience 1% is {var_res1}")
print(f"The mean of 5% is {m5} and varience 5% is {var_res5}")
print(f"The mean of 10% is {m5} and varience 10% is {var_res10}")

The mean of 1% is 0.0007902710470742466 and varience 1% is 4.5687016451605466e-07
The mean of 5% is 0.000675654857997236 and varience 5% is 3.072444468179742e-07
The mean of 10% is 0.000675654857997236 and varience 10% is 2.480608628449602e-07


In [120]:
df_knn_number = pd.DataFrame.from_dict({'1%_number': [m1, var_res1],
 '5%_number': [m5, var_res5],
 '10%_number': [m10, var_res10]}, orient='index')
df_knn_number.columns=['diff. list Mean(KNN)', 'diff. list Var.(KNN)']

### Perform MEAN based imputation

#### Before mean imputation % missing

In [121]:
df_number2 = df_number.copy(deep=True)
print(get_percent_missing(df_number2))

                                   column_name  percent_missing
number                                  number              0.0
number_copy_1_percent    number_copy_1_percent              1.0
number_copy_5_percent    number_copy_5_percent              5.0
number_copy_10_percent  number_copy_10_percent             10.0


In [122]:
df_number2['number_copy_1_percent'] = df_number2['number_copy_1_percent'].fillna(df_number2['number_copy_1_percent'].mean())
df_number2['number_copy_5_percent'] = df_number2['number_copy_5_percent'].fillna(df_number2['number_copy_5_percent'].mean())
df_number2['number_copy_10_percent'] = df_number2['number_copy_10_percent'].fillna(df_number2['number_copy_10_percent'].mean())

#### After mean impute % missing 

In [123]:
print(get_percent_missing(df_number2))

                                   column_name  percent_missing
number                                  number              0.0
number_copy_1_percent    number_copy_1_percent              0.0
number_copy_5_percent    number_copy_5_percent              0.0
number_copy_10_percent  number_copy_10_percent              0.0


In [124]:
df_number2.sample(10)

Unnamed: 0,number,number_copy_1_percent,number_copy_5_percent,number_copy_10_percent
366,0.425525,0.425525,0.425525,0.425525
145,0.246589,0.246589,0.246589,0.246589
538,0.503701,0.503701,0.503701,0.503701
256,0.118901,0.118901,0.491932,0.118901
156,0.773215,0.773215,0.773215,0.773215
500,0.441087,0.441087,0.441087,0.441087
325,0.095068,0.095068,0.095068,0.095068
97,0.209842,0.209842,0.209842,0.487348
905,0.117657,0.491084,0.117657,0.117657
251,0.961305,0.961305,0.961305,0.961305


#### Create a list of difference -  MEAN

In [125]:
# create list of difference bwtween imputed and orginal value

number_diff_1_mean = []
number_diff_5_mean = []
number_diff_10_mean = []
count = 0

for i in number_1_idx:
    count +=1
    diff1 = abs(df_number2['number_copy_1_percent'][i] - df_number2['number'][i])
    number_diff_1_mean.append(diff1)
    

for i in number_5_idx:
    diff5 = abs(df_number2['number_copy_5_percent'][i] - df_number2['number'][i])
    number_diff_5_mean.append(diff5)

for i in number_10_idx:
    diff10 = abs(df_number2['number_copy_10_percent'][i] - df_number2['number'][i])
    number_diff_10_mean.append(diff10)

In [126]:
print(len(number_diff_1_mean))
print(len(number_diff_5_mean))
print(len(number_diff_10_mean))

10
50
100


### Calculate the mean and var of the list of differences - MEAN Impute

In [127]:
m1 = sum(number_diff_1_mean) / len(number_diff_1_mean)

# calculate variance using a list comprehension
var_res1 = sum((xi - m1) ** 2 for xi in number_diff_1_mean) / len(number_diff_1_mean)

m5 = sum(number_diff_5_mean) / len(number_diff_5_mean)

# calculate variance using a list comprehension
var_res5 = sum((xii - m5) ** 2 for xii in number_diff_5_mean) / len(number_diff_5_mean)


m10 = sum(number_diff_10_mean) / len(number_diff_10_mean)

# calculate variance using a list comprehension
var_res10 = sum((xiii - m10) ** 2 for xiii in number_diff_10_mean) / len(number_diff_10_mean)

print(f"The mean of 1% is {m1} and varience 1% is {var_res1}")
print(f"The mean of 5% is {m5} and varience 5% is {var_res5}")
print(f"The mean of 10% is {m5} and varience 10% is {var_res10}")

The mean of 1% is 0.269368727544059 and varience 1% is 0.018130331928686818
The mean of 5% is 0.18484105170274112 and varience 5% is 0.014920933643125705
The mean of 10% is 0.18484105170274112 and varience 10% is 0.020023889816061954


In [128]:
df_MI_number = pd.DataFrame.from_dict({'1%_number': [m1, var_res1],
 '5%_number': [m5, var_res5],
 '10%_number': [m10, var_res10]}, orient='index')
df_MI_number.columns=['diff. list Mean(MI)', 'diff. list Var.(MI)']

## KNN and MEAN columns side by side

In [129]:
from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2>{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

In [130]:
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [131]:
# https://github.com/epmoyer/ipy_table/issues/24

from IPython.core.display import HTML

def multi_table(table_list):
    ''' Acceps a list of IpyTable objects and returns a table which contains each IpyTable in a cell
    '''
    return HTML(
        '<table><tr style="background-color:white;">' + 
        ''.join(['<td>' + table._repr_html_() + '</td>' for table in table_list]) +
        '</tr></table>'
    )

In [132]:
print(number_1_idx[:5])

[124, 257, 309, 313, 405]


In [133]:
compare_1 = imputed_number_df.loc[:, ["number", "number_copy_1_percent"]]
compare_5 = imputed_number_df.loc[:, ["number", "number_copy_5_percent"]]
compare_10 = imputed_number_df.loc[:, ["number", "number_copy_10_percent"]]

In [134]:
compare_1_df =  compare_1.iloc[number_1_idx]
compare_5_df =  compare_5.iloc[number_5_idx]
compare_10_df =  compare_10.iloc[number_10_idx]

In [135]:
compare_1_mean = df_number2.loc[:, ["number", "number_copy_1_percent"]]
compare_5_mean = df_number2.loc[:, ["number", "number_copy_5_percent"]]
compare_10_mean = df_number2.loc[:, ["number", "number_copy_10_percent"]]

In [136]:
compare_1_mean_df =  compare_1_mean.iloc[number_1_idx]
compare_5_mean_df =  compare_5_mean.iloc[number_5_idx]
compare_10_mean_df =  compare_10_mean.iloc[number_10_idx]

In [137]:
# display_side_by_side(compare_1_df.head(), compare_1_mean_df.head(), titles=['number 1% KNN Impute','number 1% Mean Impute'])
# display_side_by_side(compare_5_df.head(), compare_5_mean_df.head(), titles=['number 5% KNN Impute','number 5% Mean Impute'])
# display_side_by_side(compare_10_df.head(), compare_10_mean_df.head(), titles=['number 10% KNN Impute','number 10% Mean Impute'])

#### **number 1% KNN Impute VS number 1% Mean Impute**

In [138]:
multi_table([compare_1_df.head(), compare_1_mean_df.head()])

Unnamed: 0_level_0,number,number_copy_1_percent
Unnamed: 0_level_1,number,number_copy_1_percent
124,0.192990,0.192926
257,0.065602,0.066172
309,0.661447,0.663769
313,0.963951,0.962988
405,0.627460,0.627545
124,0.192990,0.491084
257,0.065602,0.491084
309,0.661447,0.491084
313,0.963951,0.491084
405,0.627460,0.491084

Unnamed: 0,number,number_copy_1_percent
124,0.19299,0.192926
257,0.065602,0.066172
309,0.661447,0.663769
313,0.963951,0.962988
405,0.62746,0.627545

Unnamed: 0,number,number_copy_1_percent
124,0.19299,0.491084
257,0.065602,0.491084
309,0.661447,0.491084
313,0.963951,0.491084
405,0.62746,0.491084


#### **number 5% KNN Impute VS number 5% Mean Impute**

In [139]:
multi_table([compare_5_df.head(), compare_5_mean_df.head()])

Unnamed: 0_level_0,number,number_copy_5_percent
Unnamed: 0_level_1,number,number_copy_5_percent
54,0.440144,0.439307
59,0.189655,0.191045
72,0.411451,0.412386
78,0.205178,0.204306
107,0.323097,0.322044
54,0.440144,0.491932
59,0.189655,0.491932
72,0.411451,0.491932
78,0.205178,0.491932
107,0.323097,0.491932

Unnamed: 0,number,number_copy_5_percent
54,0.440144,0.439307
59,0.189655,0.191045
72,0.411451,0.412386
78,0.205178,0.204306
107,0.323097,0.322044

Unnamed: 0,number,number_copy_5_percent
54,0.440144,0.491932
59,0.189655,0.491932
72,0.411451,0.491932
78,0.205178,0.491932
107,0.323097,0.491932


#### **number 10% KNN Impute VS number 10% Mean Impute**

In [140]:
multi_table([compare_10_df.head(), compare_10_mean_df.head()])

Unnamed: 0_level_0,number,number_copy_10_percent
Unnamed: 0_level_1,number,number_copy_10_percent
22,0.798188,0.798777
47,0.861454,0.861385
49,0.445108,0.446055
68,0.557468,0.557299
69,0.231172,0.230069
22,0.798188,0.487348
47,0.861454,0.487348
49,0.445108,0.487348
68,0.557468,0.487348
69,0.231172,0.487348

Unnamed: 0,number,number_copy_10_percent
22,0.798188,0.798777
47,0.861454,0.861385
49,0.445108,0.446055
68,0.557468,0.557299
69,0.231172,0.230069

Unnamed: 0,number,number_copy_10_percent
22,0.798188,0.487348
47,0.861454,0.487348
49,0.445108,0.487348
68,0.557468,0.487348
69,0.231172,0.487348


# 1.2 Random Numbers dataset Results - KNN and MEAN

In [142]:
results = pd.concat([df_knn_number, df_MI_number])

In [143]:
results

Unnamed: 0,diff. list Mean(KNN),diff. list Var.(KNN),diff. list Mean(MI),diff. list Var.(MI)
1%_number,0.00079,4.568702e-07,,
5%_number,0.000676,3.072444e-07,,
10%_number,0.000648,2.480609e-07,,
1%_number,,,0.269369,0.01813
5%_number,,,0.184841,0.014921
10%_number,,,0.231501,0.020024


In [144]:
results.to_csv('random_num_knn_mean_results.csv')

# 2.1 Housing Dataset 

In [361]:
housing_data = pd.read_csv('https://raw.githubusercontent.com/nikbearbrown/AI_Research_Group/main/Awesome-UCI-Datasets/Classification/House_Price_predication/train.csv')

In [362]:
housing_data.sample(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
820,821,60,RL,72.0,7226,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,183000
1390,1391,20,RL,70.0,9100,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Normal,235000
535,536,190,RL,70.0,7000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,107500
1236,1237,160,RL,36.0,2628,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,175500
1337,1338,30,RM,153.0,4118,Pave,Grvl,IR1,Bnk,AllPub,...,0,,,,0,3,2006,WD,Normal,52500
674,675,20,RL,80.0,9200,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2008,WD,Normal,140000
604,605,20,RL,88.0,12803,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,221000
605,606,60,RL,85.0,13600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,10,2009,WD,Normal,205000
1218,1219,50,RM,52.0,6240,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2006,WD,Normal,80500
882,883,60,RL,,9636,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,12,2009,WD,Normal,178000


In [363]:
housing_data['SalePrice'].nunique()

663

In [364]:
housing_data['LotArea'].nunique()

1073

In [365]:
housing_data['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [366]:
housing_data['LotArea'].describe()

count      1460.000000
mean      10516.828082
std        9981.264932
min        1300.000000
25%        7553.500000
50%        9478.500000
75%       11601.500000
max      215245.000000
Name: LotArea, dtype: float64

In [367]:
pd.set_option('display.max_rows', None)
print(get_percent_missing(housing_data))

                 column_name  percent_missing
Id                        Id         0.000000
MSSubClass        MSSubClass         0.000000
MSZoning            MSZoning         0.000000
LotFrontage      LotFrontage        17.739726
LotArea              LotArea         0.000000
Street                Street         0.000000
Alley                  Alley        93.767123
LotShape            LotShape         0.000000
LandContour      LandContour         0.000000
Utilities          Utilities         0.000000
LotConfig          LotConfig         0.000000
LandSlope          LandSlope         0.000000
Neighborhood    Neighborhood         0.000000
Condition1        Condition1         0.000000
Condition2        Condition2         0.000000
BldgType            BldgType         0.000000
HouseStyle        HouseStyle         0.000000
OverallQual      OverallQual         0.000000
OverallCond      OverallCond         0.000000
YearBuilt          YearBuilt         0.000000
YearRemodAdd    YearRemodAdd      

#### Using Sale price coloumn for KNN and MEAN imputation task

#### Non Scaled dataframe Sale Price - take first 1000 rows

In [368]:
df_saleprice = housing_data[['SalePrice']][:1000]
df_saleprice['sp_copy_1_percent'] = df_saleprice[['SalePrice']]
df_saleprice['sp_copy_5_percent'] = df_saleprice[['SalePrice']]
df_saleprice['sp_copy_10_percent'] = df_saleprice[['SalePrice']]
df_saleprice.head()

Unnamed: 0,SalePrice,sp_copy_1_percent,sp_copy_5_percent,sp_copy_10_percent
0,208500,208500,208500,208500
1,181500,181500,181500,181500
2,223500,223500,223500,223500
3,140000,140000,140000,140000
4,250000,250000,250000,250000


In [369]:
len(df_saleprice)

1000

#### Scaled Dataframe SalePrice - take first 1000 rows

In [370]:
scaler = MinMaxScaler()
df_saleprice_scaled = df_saleprice.copy(deep=True)
df_saleprice_scaled = pd.DataFrame(scaler.fit_transform(df_saleprice_scaled), columns = df_saleprice_scaled.columns)
df_saleprice_scaled.head()

Unnamed: 0,SalePrice,sp_copy_1_percent,sp_copy_5_percent,sp_copy_10_percent
0,0.241078,0.241078,0.241078,0.241078
1,0.203583,0.203583,0.203583,0.203583
2,0.261908,0.261908,0.261908,0.261908
3,0.145952,0.145952,0.145952,0.145952
4,0.298709,0.298709,0.298709,0.298709


#### Check % missing values in this dataframe

In [371]:
print(get_percent_missing(df_saleprice))

                           column_name  percent_missing
SalePrice                    SalePrice              0.0
sp_copy_1_percent    sp_copy_1_percent              0.0
sp_copy_5_percent    sp_copy_5_percent              0.0
sp_copy_10_percent  sp_copy_10_percent              0.0


#### Create 1%, 5% and 10% missing data

In [372]:
create_missing(df_saleprice, 0.01, 'sp_copy_1_percent')
create_missing(df_saleprice, 0.05, 'sp_copy_5_percent')
create_missing(df_saleprice, 0.1, 'sp_copy_10_percent')

In [373]:
create_missing(df_saleprice_scaled, 0.01, 'sp_copy_1_percent')
create_missing(df_saleprice_scaled, 0.05, 'sp_copy_5_percent')
create_missing(df_saleprice_scaled, 0.1, 'sp_copy_10_percent')

#### With/Without scaling dataframe missing values check

In [374]:
print(get_percent_missing(df_saleprice))

                           column_name  percent_missing
SalePrice                    SalePrice              0.0
sp_copy_1_percent    sp_copy_1_percent              1.0
sp_copy_5_percent    sp_copy_5_percent              5.0
sp_copy_10_percent  sp_copy_10_percent             10.0


In [375]:
print(get_percent_missing(df_saleprice_scaled))

                           column_name  percent_missing
SalePrice                    SalePrice              0.0
sp_copy_1_percent    sp_copy_1_percent              1.0
sp_copy_5_percent    sp_copy_5_percent              5.0
sp_copy_10_percent  sp_copy_10_percent             10.0


In [376]:
df_saleprice['sp_copy_1_percent'].isna().sum()

10

#### Store indices of missing values

In [377]:
# Store Index of NaN values in each coloumns
sp_1_idx = list(np.where(df_saleprice['sp_copy_1_percent'].isna())[0])
sp_5_idx = list(np.where(df_saleprice['sp_copy_5_percent'].isna())[0])
sp_10_idx = list(np.where(df_saleprice['sp_copy_10_percent'].isna())[0])

In [378]:
print(len(sp_1_idx))
print(len(sp_5_idx))
print(len(sp_10_idx))

10
50
100


In [379]:
print(f"Length of sp_1_idx is {len(sp_1_idx)} and it contains {(len(sp_1_idx)/len(df_saleprice['sp_copy_1_percent']))*100}% of total data in column | Total rows: {len(df_saleprice['sp_copy_1_percent'])}")
print(f"Length of sp_5_idx is {len(sp_5_idx)} and it contains {(len(sp_5_idx)/len(df_saleprice['sp_copy_5_percent']))*100}% of total data in column | Total rows: {len(df_saleprice['sp_copy_1_percent'])}")
print(f"Length of sp_10_idx is {len(sp_10_idx)} and it contains {(len(sp_10_idx)/len(df_saleprice['sp_copy_10_percent']))*100}% of total data in column | Total rows: {len(df_saleprice['sp_copy_1_percent'])}")

Length of sp_1_idx is 10 and it contains 1.0% of total data in column | Total rows: 1000
Length of sp_5_idx is 50 and it contains 5.0% of total data in column | Total rows: 1000
Length of sp_10_idx is 100 and it contains 10.0% of total data in column | Total rows: 1000


### Perform KNN to df_saleprice and df_saleprice_scaled dataframe

In [380]:
df_saleprice1 = df_saleprice.copy(deep=True)
imputer = KNNImputer(n_neighbors=5)
imputed_saleprice_df = pd.DataFrame(imputer.fit_transform(df_saleprice1), columns = df_saleprice1.columns)

In [381]:
df_saleprice_scaled1 = df_saleprice_scaled.copy(deep=True)
imputer = KNNImputer(n_neighbors=5)
imputed_saleprice_scaled_df = pd.DataFrame(imputer.fit_transform(df_saleprice_scaled1), columns = df_saleprice_scaled1.columns)

In [382]:
imputed_saleprice_df.head()

Unnamed: 0,SalePrice,sp_copy_1_percent,sp_copy_5_percent,sp_copy_10_percent
0,208500.0,208500.0,208500.0,208500.0
1,181500.0,181500.0,181500.0,181500.0
2,223500.0,223500.0,223500.0,223500.0
3,140000.0,140000.0,140000.0,140000.0
4,250000.0,250000.0,250000.0,250000.0


In [383]:
imputed_saleprice_scaled_df.head()

Unnamed: 0,SalePrice,sp_copy_1_percent,sp_copy_5_percent,sp_copy_10_percent
0,0.241078,0.241078,0.240855,0.241078
1,0.203583,0.203583,0.203583,0.203583
2,0.261908,0.261908,0.261908,0.261908
3,0.145952,0.145952,0.145952,0.145952
4,0.298709,0.298709,0.298709,0.298709


#### Check % missing in saleprice and saleprice_scaled DF

In [384]:
print(get_percent_missing(imputed_saleprice_df))

                           column_name  percent_missing
SalePrice                    SalePrice              0.0
sp_copy_1_percent    sp_copy_1_percent              0.0
sp_copy_5_percent    sp_copy_5_percent              0.0
sp_copy_10_percent  sp_copy_10_percent              0.0


In [385]:
print(get_percent_missing(imputed_saleprice_scaled_df))

                           column_name  percent_missing
SalePrice                    SalePrice              0.0
sp_copy_1_percent    sp_copy_1_percent              0.0
sp_copy_5_percent    sp_copy_5_percent              0.0
sp_copy_10_percent  sp_copy_10_percent              0.0


#### Store the list of disfferences between Org. and Imputed Value

In [386]:
# create list of difference bwtween imputed and orginal value

sp_diff_1 = []
sp_diff_5 = []
sp_diff_10 = []
count = 0

for i in sp_1_idx:
    count +=1
    diff1 = abs(imputed_saleprice_df['sp_copy_1_percent'][i] - imputed_saleprice_df['SalePrice'][i])
    sp_diff_1.append(diff1)
    

for i in sp_5_idx:
    diff5 = abs(imputed_saleprice_df['sp_copy_5_percent'][i] - imputed_saleprice_df['SalePrice'][i])
    sp_diff_5.append(diff5)

for i in sp_10_idx:
    diff10 = abs(imputed_saleprice_df['sp_copy_10_percent'][i] - imputed_saleprice_df['SalePrice'][i])
    sp_diff_10.append(diff10)

In [387]:
print(len(sp_diff_1))
print(len(sp_diff_5))
print(len(sp_diff_10))

10
50
100


In [388]:
# create list of difference bwtween imputed and orginal value

sp_scaled_diff_1 = []
sp_scaled_diff_5 = []
sp_scaled_diff_10 = []
count = 0

for i in sp_1_idx:
    count +=1
    diff1 = abs(imputed_saleprice_scaled_df['sp_copy_1_percent'][i] - imputed_saleprice_scaled_df['SalePrice'][i])
    sp_scaled_diff_1.append(diff1)
    

for i in sp_5_idx:
    diff5 = abs(imputed_saleprice_scaled_df['sp_copy_5_percent'][i] - imputed_saleprice_scaled_df['SalePrice'][i])
    sp_scaled_diff_5.append(diff5)

for i in sp_10_idx:
    diff10 = abs(imputed_saleprice_scaled_df['sp_copy_10_percent'][i] - imputed_saleprice_scaled_df['SalePrice'][i])
    sp_scaled_diff_10.append(diff10)

In [389]:
print(len(sp_scaled_diff_1))
print(len(sp_scaled_diff_5))
print(len(sp_scaled_diff_10))

10
50
100


In [390]:
sp_scaled_diff_1[:5]

[0.0, 0.0, 0.0, 0.0, 0.0]

In [391]:
sp_diff_1[:5]

[10.0, 20.0, 80.0, 220.0, 0.0]

#### Calculate the mean and var of list of diff. KNN - SalePrice

In [392]:
m1 = sum(sp_diff_1) / len(sp_diff_1)

# calculate variance using a list comprehension
var_res1 = sum((xi - m1) ** 2 for xi in sp_diff_1) / len(sp_diff_1)

m5 = sum(sp_diff_5) / len(sp_diff_5)

# calculate variance using a list comprehension
var_res5 = sum((xii - m5) ** 2 for xii in sp_diff_5) / len(sp_diff_5)


m10 = sum(sp_diff_10) / len(sp_diff_10)

# calculate variance using a list comprehension
var_res10 = sum((xiii - m10) ** 2 for xiii in sp_diff_10) / len(sp_diff_10)

print(f"The mean of 1% is {m1} and varience 1% is {var_res1}")
print(f"The mean of 5% is {m5} and varience 5% is {var_res5}")
print(f"The mean of 10% is {m5} and varience 10% is {var_res10}")

The mean of 1% is 170.0 and varience 1% is 42400.0
The mean of 5% is 444.9439999999997 and varience 5% is 2554554.1584639903
The mean of 10% is 444.9439999999997 and varience 10% is 6304766.8341439795


In [393]:
df_knn_saleprice = pd.DataFrame.from_dict({'1%_saleprice': [m1, var_res1],
 '5%_saleprice': [m5, var_res5],
 '10%_saleprice': [m10, var_res10]}, orient='index')
df_knn_saleprice.columns=['diff. list Mean(KNN)', 'diff. list Var.(KNN)']

In [394]:
df_knn_saleprice

Unnamed: 0,diff. list Mean(KNN),diff. list Var.(KNN)
1%_saleprice,170.0,42400.0
5%_saleprice,444.944,2554554.0
10%_saleprice,564.784,6304767.0


#### Calculate the mean and var of list of diff. KNN - SalePrice scaled

In [395]:
m1 = sum(sp_scaled_diff_1) / len(sp_scaled_diff_1)

# calculate variance using a list comprehension
var_res1 = sum((xi - m1) ** 2 for xi in sp_scaled_diff_1) / len(sp_scaled_diff_1)

m5 = sum(sp_scaled_diff_5) / len(sp_scaled_diff_5)

# calculate variance using a list comprehension
var_res5 = sum((xii - m5) ** 2 for xii in sp_scaled_diff_5) / len(sp_scaled_diff_5)


m10 = sum(sp_scaled_diff_10) / len(sp_scaled_diff_10)

# calculate variance using a list comprehension
var_res10 = sum((xiii - m10) ** 2 for xiii in sp_scaled_diff_10) / len(sp_scaled_diff_10)

print(f"The mean of 1% is {m1} and varience 1% is {var_res1}")
print(f"The mean of 5% is {m5} and varience 5% is {var_res5}")
print(f"The mean of 10% is {m5} and varience 10% is {var_res10}")

The mean of 1% is 0.0 and varience 1% is 0.0
The mean of 5% is 2.6301902513541363e-05 and varience 5% is 2.134349753649814e-08
The mean of 10% is 2.6301902513541363e-05 and varience 10% is 1.417383473391258e-08


In [396]:
df_knn_saleprice_scaled = pd.DataFrame.from_dict({'1%_saleprice': [m1, var_res1],
 '5%_saleprice': [m5, var_res5],
 '10%_saleprice': [m10, var_res10]}, orient='index')
df_knn_saleprice_scaled.columns=['diff. list Mean(KNN) scaled', 'diff. list Var.(KNN) scaled']

In [397]:
df_knn_saleprice_scaled

Unnamed: 0,diff. list Mean(KNN) scaled,diff. list Var.(KNN) scaled
1%_saleprice,0.0,0.0
5%_saleprice,2.6e-05,2.13435e-08
10%_saleprice,3.2e-05,1.417383e-08


### Perform MEAN imputation

In [398]:
df_saleprice2 = df_saleprice.copy(deep=True)
df_saleprice_scaled2 = df_saleprice_scaled.copy(deep=True)

In [399]:
print(get_percent_missing(df_saleprice2))

                           column_name  percent_missing
SalePrice                    SalePrice              0.0
sp_copy_1_percent    sp_copy_1_percent              1.0
sp_copy_5_percent    sp_copy_5_percent              5.0
sp_copy_10_percent  sp_copy_10_percent             10.0


In [400]:
print(get_percent_missing(df_saleprice_scaled2))

                           column_name  percent_missing
SalePrice                    SalePrice              0.0
sp_copy_1_percent    sp_copy_1_percent              1.0
sp_copy_5_percent    sp_copy_5_percent              5.0
sp_copy_10_percent  sp_copy_10_percent             10.0


#### Impute Mean values in missing for saleprice and saleprice_scaled

In [401]:
df_saleprice2['sp_copy_1_percent'] = df_saleprice2['sp_copy_1_percent'].fillna(df_saleprice2['sp_copy_1_percent'].mean())
df_saleprice2['sp_copy_5_percent'] = df_saleprice2['sp_copy_5_percent'].fillna(df_saleprice2['sp_copy_5_percent'].mean())
df_saleprice2['sp_copy_10_percent'] = df_saleprice2['sp_copy_10_percent'].fillna(df_saleprice2['sp_copy_10_percent'].mean())

In [402]:
df_saleprice_scaled2['sp_copy_1_percent'] = df_saleprice_scaled2['sp_copy_1_percent'].fillna(df_saleprice_scaled2['sp_copy_1_percent'].mean())
df_saleprice_scaled2['sp_copy_5_percent'] = df_saleprice_scaled2['sp_copy_5_percent'].fillna(df_saleprice_scaled2['sp_copy_5_percent'].mean())
df_saleprice_scaled2['sp_copy_10_percent'] = df_saleprice_scaled2['sp_copy_10_percent'].fillna(df_saleprice_scaled2['sp_copy_10_percent'].mean())

#### After MEAN imputation - Saleprice and saleprice scaled

In [403]:
print(get_percent_missing(df_saleprice2))

                           column_name  percent_missing
SalePrice                    SalePrice              0.0
sp_copy_1_percent    sp_copy_1_percent              0.0
sp_copy_5_percent    sp_copy_5_percent              0.0
sp_copy_10_percent  sp_copy_10_percent              0.0


In [404]:
print(get_percent_missing(df_saleprice_scaled2))

                           column_name  percent_missing
SalePrice                    SalePrice              0.0
sp_copy_1_percent    sp_copy_1_percent              0.0
sp_copy_5_percent    sp_copy_5_percent              0.0
sp_copy_10_percent  sp_copy_10_percent              0.0


In [407]:
df_saleprice2.sample(5)

Unnamed: 0,SalePrice,sp_copy_1_percent,sp_copy_5_percent,sp_copy_10_percent
571,120000,120000.0,120000.0,182343.817778
2,223500,223500.0,223500.0,223500.0
313,375000,375000.0,375000.0,375000.0
377,340000,340000.0,182457.342105,182343.817778
987,395192,395192.0,395192.0,395192.0


In [409]:
df_saleprice_scaled2.sample(5)

Unnamed: 0,SalePrice,sp_copy_1_percent,sp_copy_5_percent,sp_copy_10_percent
216,0.243161,0.243161,0.243161,0.243161
1,0.203583,0.203583,0.203583,0.203583
575,0.116095,0.116095,0.116095,0.116095
397,0.186918,0.186918,0.186918,0.205253
703,0.145952,0.145952,0.145952,0.145952


#### Create List of differences for saleprice and saleprice_scaled Dataframes

In [410]:
# create list of difference bwtween imputed and orginal value

sp_mean_diff_1 = []
sp_mean_diff_5 = []
sp_mean_diff_10 = []
count = 0

for i in sp_1_idx:
    count +=1
    diff1 = abs(df_saleprice2['sp_copy_1_percent'][i] - df_saleprice2['SalePrice'][i])
    sp_mean_diff_1.append(diff1)
    

for i in sp_5_idx:
    diff5 = abs(df_saleprice2['sp_copy_5_percent'][i] - df_saleprice2['SalePrice'][i])
    sp_mean_diff_5.append(diff5)

for i in sp_10_idx:
    diff10 = abs(df_saleprice2['sp_copy_10_percent'][i] - df_saleprice2['SalePrice'][i])
    sp_mean_diff_10.append(diff10)

In [411]:
print(len(sp_mean_diff_1))
print(len(sp_mean_diff_5))
print(len(sp_mean_diff_10))

10
50
100


In [412]:
# create list of difference bwtween imputed and orginal value

sp_scaled_mean_diff_1 = []
sp_scaled_mean_diff_5 = []
sp_scaled_mean_diff_10 = []
count = 0

for i in sp_1_idx:
    count +=1
    diff1 = abs(df_saleprice_scaled2['sp_copy_1_percent'][i] - df_saleprice_scaled2['SalePrice'][i])
    sp_scaled_mean_diff_1.append(diff1)
    

for i in sp_5_idx:
    diff5 = abs(df_saleprice_scaled2['sp_copy_5_percent'][i] - df_saleprice_scaled2['SalePrice'][i])
    sp_scaled_mean_diff_5.append(diff5)

for i in sp_10_idx:
    diff10 = abs(df_saleprice_scaled2['sp_copy_10_percent'][i] - df_saleprice_scaled2['SalePrice'][i])
    sp_scaled_mean_diff_10.append(diff10)

In [413]:
print(len(sp_scaled_mean_diff_1))
print(len(sp_scaled_mean_diff_5))
print(len(sp_scaled_mean_diff_10))

10
50
100


#### Calculate mean and var of list of diff. - MEAN impute SalePrice

In [414]:
m1 = sum(sp_mean_diff_1) / len(sp_mean_diff_1)

# calculate variance using a list comprehension
var_res1 = sum((xi - m1) ** 2 for xi in sp_mean_diff_1) / len(sp_mean_diff_1)

m5 = sum(sp_mean_diff_5) / len(sp_mean_diff_5)

# calculate variance using a list comprehension
var_res5 = sum((xii - m5) ** 2 for xii in sp_mean_diff_5) / len(sp_mean_diff_5)


m10 = sum(sp_mean_diff_10) / len(sp_mean_diff_10)

# calculate variance using a list comprehension
var_res10 = sum((xiii - m10) ** 2 for xiii in sp_mean_diff_10) / len(sp_mean_diff_10)

print(f"The mean of 1% is {m1} and varience 1% is {var_res1}")
print(f"The mean of 5% is {m5} and varience 5% is {var_res5}")
print(f"The mean of 10% is {m5} and varience 10% is {var_res10}")

The mean of 1% is 55971.63676767676 and varience 1% is 1103367192.190047
The mean of 5% is 58478.24210526314 and varience 5% is 3139731297.2794733
The mean of 10% is 58478.24210526314 and varience 10% is 3846674638.263318


In [415]:
df_mean_saleprice = pd.DataFrame.from_dict({'1%_saleprice': [m1, var_res1],
 '5%_saleprice': [m5, var_res5],
 '10%_saleprice': [m10, var_res10]}, orient='index')
df_mean_saleprice.columns=['diff. list Mean(MI)', 'diff. list Var.(MI)']

In [416]:
df_mean_saleprice

Unnamed: 0,diff. list Mean(MI),diff. list Var.(MI)
1%_saleprice,55971.636768,1103367000.0
5%_saleprice,58478.242105,3139731000.0
10%_saleprice,61028.709911,3846675000.0


#### Calculate mean and var of list of diff. - MEAN impute SalePrice scaled

In [417]:
m1 = sum(sp_scaled_mean_diff_1) / len(sp_scaled_mean_diff_1)

# calculate variance using a list comprehension
var_res1 = sum((xi - m1) ** 2 for xi in sp_scaled_mean_diff_1) / len(sp_scaled_mean_diff_1)

m5 = sum(sp_scaled_mean_diff_5) / len(sp_scaled_mean_diff_5)

# calculate variance using a list comprehension
var_res5 = sum((xii - m5) ** 2 for xii in sp_scaled_mean_diff_5) / len(sp_scaled_mean_diff_5)


m10 = sum(sp_scaled_mean_diff_10) / len(sp_scaled_mean_diff_10)

# calculate variance using a list comprehension
var_res10 = sum((xiii - m10) ** 2 for xiii in sp_scaled_mean_diff_10) / len(sp_scaled_mean_diff_10)

print(f"The mean of 1% is {m1} and varience 1% is {var_res1}")
print(f"The mean of 5% is {m5} and varience 5% is {var_res5}")
print(f"The mean of 10% is {m5} and varience 10% is {var_res10}")

The mean of 1% is 0.0 and varience 1% is 0.0
The mean of 5% is 0.00893610697344667 and varience 5% is 0.0014044730755095036
The mean of 10% is 0.00893610697344667 and varience 10% is 0.0004431848362889144


In [418]:
df_mean_saleprice_scaled = pd.DataFrame.from_dict({'1%_saleprice_scaled': [m1, var_res1],
 '5%_saleprice_scaled': [m5, var_res5],
 '10%_saleprice_scaled': [m10, var_res10]}, orient='index')
df_mean_saleprice_scaled.columns=['diff. list Mean(MI) scaled', 'diff. list Var.(MI) scaled']

In [419]:
df_mean_saleprice_scaled

Unnamed: 0,diff. list Mean(MI) scaled,diff. list Var.(MI) scaled
1%_saleprice_scaled,0.0,0.0
5%_saleprice_scaled,0.008936,0.001404
10%_saleprice_scaled,0.007492,0.000443


# 2.2 Housing Data Results - KNN and MEAN

In [420]:
results1 = pd.concat([df_knn_saleprice, df_knn_saleprice_scaled, df_mean_saleprice, df_mean_saleprice_scaled])

In [421]:
results1

Unnamed: 0,diff. list Mean(KNN),diff. list Var.(KNN),diff. list Mean(KNN) scaled,diff. list Var.(KNN) scaled,diff. list Mean(MI),diff. list Var.(MI),diff. list Mean(MI) scaled,diff. list Var.(MI) scaled
1%_saleprice,170.0,42400.0,,,,,,
5%_saleprice,444.944,2554554.0,,,,,,
10%_saleprice,564.784,6304767.0,,,,,,
1%_saleprice,,,0.0,0.0,,,,
5%_saleprice,,,2.6e-05,2.13435e-08,,,,
10%_saleprice,,,3.2e-05,1.417383e-08,,,,
1%_saleprice,,,,,55971.636768,1103367000.0,,
5%_saleprice,,,,,58478.242105,3139731000.0,,
10%_saleprice,,,,,61028.709911,3846675000.0,,
1%_saleprice_scaled,,,,,,,0.0,0.0


In [422]:
results1.to_csv('housing_data_saleprice_KNN_Mean_results.csv')