In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

df = pd.read_csv('data/kc_house_data.csv')

# make subsets of the columns
num_cont = ['price', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_garage', 'sqft_patio']
num_disc = ['price', 'bedrooms', 'bathrooms', 'floors', 'yr_built', 'yr_renovated']
cat = ['price', 'waterfront', 'greenbelt', 'nuisance', 'view', 'condition', 'grade', 'heat_source']
ignore = ['id', 'date', 'lat', 'long', 'address']

# create sub-dfs and standardize numeric values
df_cont_std = df[num_cont].copy()
for col in df_cont_std:
    df_cont_std[col] = (df_cont_std[col] - df_cont_std[col].mean()) / df_cont_std[col].std()

df_disc_std = df[num_disc].copy()
for col in df_disc_std:
    df_disc_std[col] = (df_disc_std[col] - df_disc_std[col].mean()) / df_disc_std[col].std()

df_cat = df[cat].copy()

### check multicollinearity before modifying data

In [None]:
# save absolute value of correlation matrix as a data frame
# converts all values to absolute value
# stacks the row:column pairs into a multindex
# reset the index to set the multindex to seperate columns
# sort values. 0 is the column automatically generated by the stacking

df_mc=df.corr().abs().stack().reset_index().sort_values(0, ascending=False)

# zip the variable name columns (Which were only named level_0 and level_1 by default) in a new column named "pairs"
df_mc['pairs'] = list(zip(df_mc.level_0, df_mc.level_1))

# set index to pairs
df_mc.set_index(['pairs'], inplace = True)

# drop level columns
df_mc.drop(columns=['level_1', 'level_0'], inplace = True)

# rename correlation column as cc rather than 0
df_mc.columns = ['cc']

# drop duplicates. This could be dangerous if you have variables perfectly correlated with variables other than themse
# for the sake of exercise, kept it in.
df_mc.drop_duplicates(inplace=True)

In [None]:
df_mc[(df_mc.cc > .55) & (df_mc.cc < 1)]

### build the dataframe based on info from cleaning

In [None]:
# collect a list of outlier data past a certain threshold of standard deviations

threshold = 7.5
outliers = set()
for col in df_cont_std:
    outliers = outliers.union(set(df_cont_std[df_cont_std[col] > threshold].index))
    
len(outliers)

In [None]:
df['garage'] = df['sqft_garage'] > 0

In [None]:
df['renovated'] = df['yr_renovated'] > 0

In [None]:
df = df.drop(df[ignore], axis=1)

In [None]:
df = pd.get_dummies(df, columns=['waterfront', 'greenbelt', 'nuisance', 'view', 'condition', 'grade', 'heat_source'])
df = df.drop(['waterfront_NO', 'greenbelt_NO', 'nuisance_NO', 'view_NONE', 'condition_Average', 'grade_7 Average',
        'heat_source_Other'], axis=1)

### test for multicollinearity

In [None]:
# save absolute value of correlation matrix as a data frame
# converts all values to absolute value
# stacks the row:column pairs into a multindex
# reset the index to set the multindex to seperate columns
# sort values. 0 is the column automatically generated by the stacking

df_mc=df.corr().abs().stack().reset_index().sort_values(0, ascending=False)

# zip the variable name columns (Which were only named level_0 and level_1 by default) in a new column named "pairs"
df_mc['pairs'] = list(zip(df_mc.level_0, df_mc.level_1))

# set index to pairs
df_mc.set_index(['pairs'], inplace = True)

# drop level columns
df_mc.drop(columns=['level_1', 'level_0'], inplace = True)

# rename correlation column as cc rather than 0
df_mc.columns = ['cc']

# drop duplicates. This could be dangerous if you have variables perfectly correlated with variables other than themse
# for the sake of exercise, kept it in.
df_mc.drop_duplicates(inplace=True)

In [None]:
df_mc[(df_mc.cc > .55) & (df_mc.cc < 1)]

### notes on multicollinearity

RENOVATED: Not sure what's better, to keep the binary or the year (but the year is wrong because it has zeros?) Maybe make a YEARS SINCE RENOVATED column.

ABOVE v. LIVING: We see below that LIVING is strongly collinear with PRICE, which means we definitely want to keep LIVING. Probably this means we can delete ABOVE.

GARAGE: Keep the binary or the area?

BATHROOMS: Strongly collinear with LIVING, so we can probably delete BATHROOMS.

HEAT SOURCES: These are the two most popular ones. I guess if it's not one, it's the other? Not sure whether to delete either here.

BATHROOMS v. ABOVE: As discussed, probably drop both.

BEDROOMS v. LIVING: Number isn't as high here, so maybe keep BEDROOMS.

BATHROOMS v. BEDROOMS: Already going to kill BATHROOMS, so maybe keep BEDROOMS.

ABOVE v. GARAGE: (?) Number isn't high, but if we get rid of GARAGE AREA and keep the binary this would go away.

**FINAL WORD**:
- do *something* about RENOVATED
- keep the GARAGE binary
- drop ABOVE and BATHROOMS

### explore logs

In [None]:
y = df['price']
X = df_cont_std.drop(columns=['price'], axis=1)

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15,8), sharey=True)

for i, column in enumerate(X.columns):
    # Locate applicable axes
    row = i // 3
    col = i % 3
    ax = axes[row][col]

    # Plot feature vs. y and label axes
    ax.scatter(X[column], y, alpha=0.2)
    ax.set_xlabel(column)
    if col == 0:
        ax.set_ylabel("SalePrice")

fig.tight_layout()

In [None]:
candidates = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_garage', 'sqft_patio']

fig, axes = plt.subplots(ncols=2, nrows=len(candidates), figsize=(8,20))

for i, column in enumerate(candidates):
    # Plot raw version
    left_ax = axes[i][0]
    left_ax.scatter(df[column], y, alpha=0.5)
    left_ax.set_xlabel(column)
    left_ax.set_ylabel('price')
    
    # Plot log transformed version
    right_ax = axes[i][1]
    right_ax.scatter(np.log(df[column]), np.log(y), alpha=0.5)
    right_ax.set_xlabel(f"log({column})")
    right_ax.set_ylabel("log(price)")
    
fig.suptitle("Raw vs. Log Transformed")

fig.tight_layout();

In [2]:
df_log_cont = df.copy()

for col in num_cont:
    if col != 'price':
        df_log[col] = np.log(df_log[col])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [6]:
df_log_target = df.copy()

df_log_target['price'] = np.log(df_log_target['price'])

In [3]:
df.corr()['price']

id              -0.034184
price            1.000000
bedrooms         0.289204
bathrooms        0.480401
sqft_living      0.608521
sqft_lot         0.085730
floors           0.180576
sqft_above       0.538651
sqft_basement    0.245058
sqft_garage      0.264169
sqft_patio       0.313409
yr_built         0.096013
yr_renovated     0.084786
lat              0.063632
long            -0.022509
Name: price, dtype: float64

In [4]:
df_log.corr()['price']

id              -0.034184
price            1.000000
bedrooms         0.289204
bathrooms        0.480401
sqft_living      0.514532
sqft_lot         0.193742
floors           0.180576
sqft_above       0.466467
sqft_basement    0.271570
sqft_garage      0.312543
sqft_patio       0.237249
yr_built         0.096013
yr_renovated     0.084786
lat              0.063632
long            -0.022509
Name: price, dtype: float64

In [7]:
df_log_target.corr()['price']

id              -0.024276
price            1.000000
bedrooms         0.346268
bathrooms        0.516799
sqft_living      0.621576
sqft_lot         0.083969
floors           0.234028
sqft_above       0.547800
sqft_basement    0.250714
sqft_garage      0.285286
sqft_patio       0.309753
yr_built         0.120731
yr_renovated     0.076076
lat              0.074283
long            -0.018707
Name: price, dtype: float64

### notes on logs

Taking log(PRICE) improves correlation for just about everything. Separately, LOT, BASEMENT, and GARAGE seem to improve correlation when they are logged.