In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

### Reading Data/Splitting

In [46]:
# Read the data
df = pd.read_csv('../Data/rawhousing.csv')

In [47]:
# Splitting
test = df.sample(frac = .15, random_state=100)
train = df.drop(index=test.index)

### Processing

One-hot encoding non-numeric features

In [48]:
def oneHot(df):
    data = pd.get_dummies(df)
    data = data.drop_duplicates()

    return data

Imputing missing values

In [49]:
def impute(df):
    steps = [('impute', SimpleImputer(missing_values=np.nan, strategy='mean'))]
    pipe = Pipeline(steps=steps)

    pipe.fit(df)
    transformed = pd.DataFrame(pipe.transform(df), columns = df.columns)
    return transformed

In [50]:
test = oneHot(test)
train = oneHot(train)
df = oneHot(train)

test = impute(test)
train = impute(train)
df = oneHot(test)


Scaling features with high variance between values

In [51]:
scaledCols = ["longitude","latitude","housing_median_age","total_rooms","total_bedrooms","population","households","median_income", "median_house_value"]

ct = ColumnTransformer([("scale", StandardScaler(), scaledCols)],remainder='passthrough')
ct.fit(train)
train =  pd.DataFrame(ct.transform(train), columns = train.columns)
test =  pd.DataFrame(ct.transform(test), columns = test.columns)

ct.fit(df)

df = pd.DataFrame(ct.transform(df), columns = df.columns)

### Display and Export

In [52]:
train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.321548,1.049203,0.979649,-0.806566,-0.973528,-0.967841,-0.975642,2.349835,2.127787,0.0,0.0,0.0,1.0,0.0
1,-1.316569,1.039856,-0.605879,2.03772,1.338418,0.84803,1.653329,2.337291,1.312093,0.0,0.0,0.0,1.0,0.0
2,-1.331505,1.035183,1.851689,-0.626368,-0.722692,-0.76171,-0.734047,0.936241,1.162997,0.0,0.0,0.0,1.0,0.0
3,-1.331505,1.035183,1.851689,-0.464922,-0.616205,-0.755596,-0.630135,-0.010811,1.170798,0.0,0.0,0.0,1.0,0.0
4,-1.331505,1.035183,1.851689,-0.788729,-0.774752,-0.888358,-0.80159,0.089644,0.542341,0.0,0.0,0.0,1.0,0.0


In [53]:
test.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.734588,-0.852819,0.583267,-0.700917,-0.843377,-0.861282,-0.843155,-0.086231,0.330832,1.0,0.0,0.0,0.0,0.0
1,-1.341462,1.021163,1.851689,-0.46355,-0.199724,-0.234156,-0.263846,-1.380449,-0.892276,0.0,0.0,0.0,1.0,0.0
2,0.226777,-0.123789,-1.24009,0.861405,0.408434,0.646267,0.440158,0.505907,-0.528204,0.0,1.0,0.0,0.0,0.0
3,-1.779573,1.717481,-0.764432,-0.739792,-0.793683,-0.87875,-0.908099,-0.518305,-0.458857,0.0,1.0,0.0,0.0,0.0
4,-1.465926,1.095935,0.186885,1.788919,2.438782,1.338027,2.500211,-0.514616,1.014767,0.0,0.0,0.0,1.0,0.0


In [34]:
train.to_csv("../Data/traincleaned.csv", index=False)
test.to_csv("../Data/testcleaned.csv", index=False)
df.to_csv("../Data/housingcleaned.csv", index=False)

In [35]:
df.count()

longitude                     464
latitude                      464
housing_median_age            464
total_rooms                   464
total_bedrooms                464
population                    464
households                    464
median_income                 464
median_house_value            464
ocean_proximity_<1H OCEAN     464
ocean_proximity_INLAND        464
ocean_proximity_NEAR BAY      464
ocean_proximity_NEAR OCEAN    464
dtype: int64