In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats.mstats import winsorize
import geopandas as gpd

In [2]:
# Load the raw household table. A forward slash is used as the separator:
# it is accepted on Windows and POSIX alike, and avoids the invalid "\s"
# escape sequence that a backslash path produces in a normal string literal.
df = pd.read_csv("Data/synthetic_2021_HH.csv")

In [12]:
# Drop rows whose dwelltype is "Other" or "Missing", then rows whose
# owndwell is "Something Else"; the filtered frame is displayed below.
excluded_dwelltypes = ["Other", "Missing"]
has_usable_dwelltype = ~df["dwelltype"].isin(excluded_dwelltypes)
df = df[has_usable_dwelltype]
has_usable_owndwell = df["owndwell"].ne("Something Else")
df = df[has_usable_owndwell]
df

Unnamed: 0,totalvehs,hhsize,dwelltype,owndwell,hhinc,SA1
0,2.0,6.0,Separate House,Being Purchased,3625.0,20301103401
1,1.0,7.0,Separate House,Being Purchased,1825.0,20301103401
2,2.0,7.0,Terrace/Townhouse,Being Purchased,1375.0,20301103401
3,2.0,7.0,Separate House,Being Purchased,100.0,20301103401
4,3.0,7.0,Separate House,Being Purchased,900.0,20301103401
...,...,...,...,...,...,...
1902678,0.0,1.0,Flat or Apartment,Being Purchased,1750.0,21402159223
1902680,2.0,1.0,Separate House,Fully Owned,3200.0,21402159223
1902681,1.0,1.0,Separate House,Being Purchased,2800.0,21402159223
1902682,0.0,1.0,Flat or Apartment,Being Rented,3000.0,21402159223


## Winsorizing Data

In [10]:
# Winsorize the numeric columns: the lowest 1% and highest 1% of values in
# each column are replaced by the nearest value that is kept, which limits
# the influence of outliers. Results are written to new upper-case columns;
# the original lower-case columns are left as they were.
df_winsorize = df.copy()
tail_limits = [0.01, 0.01]
for target, source in (('HHSIZE', 'hhsize'), ('TOTALVEHS', 'totalvehs'), ('INC', 'hhinc')):
    df_winsorize[target] = winsorize(df[source], limits=tail_limits)

## Normalise Data

In [6]:
# Rescale the winsorized columns into the [0, 1] range on a copy of the frame.
df_normalise = df_winsorize.copy()

# HHSIZE and TOTALVEHS: min-max scaling, (x - min) / (max - min).
# NOTE(review): a column with a single distinct value gives 0/0 = NaN here.
cols = ['HHSIZE', 'TOTALVEHS']
for col in cols:
    col_min = df_normalise[col].min()
    col_max = df_normalise[col].max()
    df_normalise[f'{col}_normalised'] = (df_normalise[col] - col_min) / (col_max - col_min)

# INC: quantile transform to a uniform distribution.
# The transformer is created once, after the loop, rather than on every
# iteration. random_state is fixed because QuantileTransformer estimates the
# quantiles from a random subsample when the input is large, so without a
# seed the output differs between runs.
qt = QuantileTransformer(output_distribution='uniform', random_state=0)
# fit_transform returns an (n_rows, 1) array; flatten it to a 1-D column.
df_normalise['INC_normalised'] = qt.fit_transform(df_normalise[['INC']]).ravel()

## One-Hot Encoding

In [7]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per category value. Encoding is done on a copy of the
# normalised frame.
categorical_cols = ['dwelltype', 'owndwell']
df_onehot = pd.get_dummies(df_normalise.copy(), columns=categorical_cols)

In [8]:
# Remove the original lower-case numeric columns; their winsorized and
# normalised counterparts remain in the frame.
raw_numeric_cols = ["totalvehs", "hhsize", "hhinc"]
df_onehot = df_onehot.drop(raw_numeric_cols, axis=1)


In [13]:
# Write the cleaned frame to disk without the index column. A forward slash
# is used as the separator: it is accepted on Windows and POSIX alike, and
# avoids the invalid "\c" escape sequence that a backslash path produces.
df_onehot.to_csv('Data/clean_data.csv', index=False)