In [1]:
#Import Initial Libraries and Data Set
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Read the house sales CSV into the working DataFrame used by every later cell
data = pd.read_csv(filepath_or_buffer='data/kc_house_data.csv')

In [2]:
#Inspect column dtypes and non-null counts to spot columns with missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  float64
 9   view           21534 non-null  float64
 10  condition      21597 non-null  int64  
 11  grade          21597 non-null  int64  
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

In [3]:
#Convert NA values in 'view', 'waterfront' and 'yr_renovated' to 0
#(a yr_renovated of 0 is later read as "never renovated" when the
#'refurbished' flag is built from yr_renovated > 0)

na_to_zero_cols = ['view', 'waterfront', 'yr_renovated']

data[na_to_zero_cols] = data[na_to_zero_cols].fillna(0)

In [4]:
#Count the missing values remaining in each column
data.isna().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [5]:
#Show dtypes and non-null counts again now that the NA values have been filled
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     21597 non-null  float64
 9   view           21597 non-null  float64
 10  condition      21597 non-null  int64  
 11  grade          21597 non-null  int64  
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   21597 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

In [6]:
#Store zipcode as text in a new column so it is handled as a label, not a number

data['zipcode_str'] = data['zipcode'].map(str)

In [7]:
#Derive 'age' from yr_built: the number of years between construction and 2021

data['age'] = 2021 - data['yr_built']

In [8]:
#Create 'refurbished' to reflect whether or not a house has ever been refurbished:
#1 when yr_renovated > 0, otherwise 0

data['refurbished'] = (data['yr_renovated'] > 0).astype(int)

In [9]:
#Define the continuous and categorical column groups used for modelling.
#Note that 'price' (the target) is listed with the continuous columns.

cont_data = [
    'price', 'sqft_living', 'sqft_lot', 'grade',
    'sqft_above', 'sqft_living15', 'sqft_lot15', 'age',
]

cat_data = [
    'bedrooms', 'bathrooms', 'floors', 'waterfront',
    'condition', 'zipcode_str', 'refurbished', 'view',
]

#Full column selection: continuous columns first, then categorical ones
features = [*cont_data, *cat_data]

features

['price',
 'sqft_living',
 'sqft_lot',
 'grade',
 'sqft_above',
 'sqft_living15',
 'sqft_lot15',
 'age',
 'bedrooms',
 'bathrooms',
 'floors',
 'waterfront',
 'condition',
 'zipcode_str',
 'refurbished',
 'view']

In [63]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

#Build the modelling frame (one-hot encode, log-transform, standardise),
#then fit a linear regression of price on every other column

scaler = StandardScaler()

#one-hot encode the selected columns
#NOTE(review): get_dummies with default arguments only expands non-numeric
#columns, so numeric entries of cat_data are presumably left as-is - confirm

core_data = pd.get_dummies(data[features])

columns = core_data.columns

#normalize all cont data (natural log; cont_data includes the target 'price')

core_cont_data = np.log(core_data[cont_data])

#isolate rest of cat data: every column that was not log-transformed

core_cat_features = [item for item in core_data.columns
                     if item not in core_cont_data.columns]

#rejoin cont and cat data

features_df = pd.concat([core_cont_data, core_data[core_cat_features]], axis = 1)

#scale all data; keep the column names and the row index of the unscaled frame

features_df = pd.DataFrame(scaler.fit_transform(features_df),
                           columns = features_df.columns,
                           index = features_df.index)

X = features_df.drop('price', axis = 1)
y = features_df['price']

final_regression = LinearRegression()

final_regression.fit(X, y)

#map each predictor to its coefficient from the model fitted in this cell

coefs_dict = dict(zip(X.columns, final_regression.coef_))

#R^2 on the same rows the model was fitted on (not a held-out score)

final_regression.score(X, y)

0.881803663339833

In [65]:
#Train Test Split

from sklearn.model_selection import train_test_split

#Hold out 33% of the rows for testing; random_state fixes the split so reruns match

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

train_model = LinearRegression()

#fit on the training split produced above

train_model.fit(X_train, y_train)