In [1]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

  import pandas.util.testing as tm


In [2]:
# Load the source CSV (path is relative to the working directory) into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='kc_house_data.csv')

### Deal with Null Values

**Replace null waterfront values with the median waterfront value for the zipcode**

In [3]:
# Per-row median of 'waterfront' within the row's zipcode (NaNs are skipped by the median).
zip_median_waterfront = df.groupby('zipcode')['waterfront'].transform('median')
# Only missing entries are filled; existing values are left as they are.
df['waterfront'] = df['waterfront'].fillna(zip_median_waterfront)

In [4]:
# Sanity check after the fill: dropna=False makes any remaining nulls show up
# as their own NaN row instead of being silently left out of the counts.
df['waterfront'].value_counts(dropna=False)

0.0    21451
1.0      146
Name: waterfront, dtype: int64

**Merge Year Built & Year Renovated into 1 Column**

In [5]:
# Year of the most recent update: the renovation year when one is recorded,
# otherwise the construction year.
# A row counts as renovated only when yr_renovated is present AND non-zero
# (both missing and 0 fall back to yr_built). The notna() check is required
# because NaN != 0 evaluates to True.
was_renovated = df['yr_renovated'].notna() & (df['yr_renovated'] != 0)

# Built in one vectorised step with `where` rather than fillna/replace with
# inplace=True on a column selection, so the result does not depend on
# in-place mutation of an intermediate Series.
df['yr_of_last_update'] = (
    df['yr_renovated']
    .where(was_renovated, df['yr_built'])
    .astype(int)
)

**Replace sqft_basement ?s with median by zipcode**

In [6]:
# '?' placeholders in sqft_basement are treated as missing: swap them for NaN,
# then cast the column to float (the cast is to float, not int).
df['sqft_basement'] = (
    df['sqft_basement']
    .replace('?', np.nan)
    .astype(float)
)

In [7]:
# Fill missing basement sizes with the median basement size of the same zipcode.
zip_median_basement = df.groupby('zipcode')['sqft_basement'].transform('median')
df['sqft_basement'] = df['sqft_basement'].fillna(zip_median_basement)

In [8]:
# Fill missing 'view' values with the median 'view' of the same zipcode.
zip_median_view = df.groupby('zipcode')['view'].transform('median')
df['view'] = df['view'].fillna(zip_median_view)

## Create Season Bins

In [9]:
# Using Northern Hemisphere meteorological seasons
# https://www.timeanddate.com/calendar/aboutseasons.html
df['date'] = pd.to_datetime(df['date'])

# Month number -> season label:
# Mar-May Spring, Jun-Aug Summer, Sep-Nov Fall, Dec-Feb Winter.
month_to_season = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}

# Vectorised lookup on the month component. A missing date (NaT) stays missing
# in season_sold rather than falling through to 'Winter'.
df['season_sold'] = df['date'].dt.month.map(month_to_season)


## Create Below/Above Ground Ratio (Basement sqft / Above-Ground sqft)

In [10]:
# Basement area expressed as a fraction of above-ground area (element-wise division).
df['below_ground_ratio'] = df['sqft_basement'].div(df['sqft_above'])

## Remove Date, Above, Basement, Year Built, Year Renovated

In [11]:
# These columns have already been folded into season_sold, below_ground_ratio
# and yr_of_last_update, so drop them from the working table.
df = df.drop(
    columns=['date', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']
)

## Remove Bedrooms Outlier

In [12]:
# Remove every row reporting 33 bedrooms, then display the remaining (rows, columns).
outlier_labels = df.loc[df['bedrooms'].eq(33)].index
df = df.drop(index=outlier_labels)
df.shape

(21596, 19)

## Remove Houses Over 2 Million or Below 100 Thousand

In [13]:
# Drop listings priced below 100,000 or above 2,000,000 (both bounds themselves
# are kept), then display the remaining (rows, columns).
too_cheap = df['price'] < 100000
too_expensive = df['price'] > 2000000
df = df.drop(index=df.loc[too_cheap | too_expensive].index)
df.shape

(21374, 19)

## Change Price Units to Hundreds of Thousands

In [14]:
# Rescale price so that one unit equals 100,000 of the original price units.
df['price_100k'] = df['price'].div(100000)

In [15]:
# 'price' is superseded by price_100k; the remaining three columns are
# removed from the exported table as well.
df = df.drop(columns=['price', 'sqft_living15', 'sqft_lot15', 'view'])

In [17]:
# Print the final column list with dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21374 entries, 0 to 21596
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  21374 non-null  int64  
 1   bedrooms            21374 non-null  int64  
 2   bathrooms           21374 non-null  float64
 3   sqft_living         21374 non-null  int64  
 4   sqft_lot            21374 non-null  int64  
 5   floors              21374 non-null  float64
 6   waterfront          21374 non-null  float64
 7   condition           21374 non-null  int64  
 8   grade               21374 non-null  int64  
 9   zipcode             21374 non-null  int64  
 10  lat                 21374 non-null  float64
 11  long                21374 non-null  float64
 12  yr_of_last_update   21374 non-null  int64  
 13  season_sold         21374 non-null  object 
 14  below_ground_ratio  21374 non-null  float64
 15  price_100k          21374 non-null  float64
dtypes: f

In [18]:
# Write the cleaned table to CSV. The DataFrame index is written as the first
# (unnamed) column because to_csv's default index=True is left unchanged.
df.to_csv('Master_Data_Table.csv')