In [38]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn_pandas.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from haversine import haversine
from sklearn.linear_model import LinearRegression

In [65]:
#load data
# Read the raw dataset from a CSV file located next to the notebook
# (relative path, so the kernel's working directory matters).
df = pd.read_csv("kc_house_data.csv")

In [78]:
#train-test split
# Target is the natural log of the sale price.
y = np.log(df['price'])

# Start from every column except the target and the identifier/date columns.
X = df.drop(columns=['price', 'id', 'date'])

# Engineered features.
# NOTE: dist_from_bill is defined in a cell further down this notebook; on a
# fresh kernel that cell has to be executed before this one.
X['log_sqft_living'] = np.log(X['sqft_living'].astype(np.float32))
X['distance_from_billgates'] = [
    dist_from_bill(lat, lon) for lat, lon in zip(X['lat'], X['long'])
]

# Drop the raw inputs of the engineered features together with the columns
# that are not used by the model.
X = X.drop(columns=['sqft_living', 'sqft_lot',
                    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
                    'lat', 'long', 'sqft_living15', 'sqft_lot15', 'zipcode'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=30)

In [79]:
# Summarise the training features (column names, non-null counts, dtypes);
# useful for spotting columns that still contain missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17277 entries, 11632 to 5925
Data columns (total 9 columns):
bedrooms                   17277 non-null int64
bathrooms                  17277 non-null float64
floors                     17277 non-null float64
waterfront                 15333 non-null float64
view                       17233 non-null float64
condition                  17277 non-null int64
grade                      17277 non-null int64
log_sqft_living            17277 non-null float32
distance_from_billgates    17277 non-null float64
dtypes: float32(1), float64(5), int64(3)
memory usage: 1.3 MB


In [80]:
# Create transformers

class CategoryCleaner(BaseEstimator, TransformerMixin):
    """Keep the category codes 1-4 and replace every other value with 0.

    Any entry that is not equal to 1, 2, 3 or 4 (NaN included) becomes 0.
    The result is an unsigned 8-bit integer array with the same shape as
    the input.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data.

        Returns ``self`` so the transformer follows the scikit-learn
        estimator contract (``fit_transform`` and call chaining rely on it).
        ``y`` is accepted for API compatibility and ignored.
        """
        return self

    def transform(self, X):
        """Return ``X`` with values outside {1, 2, 3, 4} set to 0, as uint8."""
        values = np.asarray(X)
        keep = np.isin(values, (1, 2, 3, 4))
        # Start from zeros and copy over only the accepted codes, so NaN or
        # other unexpected entries are never cast to an integer.
        cleaned = np.zeros(values.shape, dtype=np.uint8)
        cleaned[keep] = values[keep]
        return cleaned

class IntConverter(BaseEstimator, TransformerMixin):
    """Cast the selected values to unsigned 8-bit integers."""

    def fit(self, X, y=None):
        """Nothing is learned from the data.

        Returns ``self`` so the transformer follows the scikit-learn
        estimator contract (``fit_transform`` and call chaining rely on it).
        ``y`` is accepted for API compatibility and ignored.
        """
        return self

    def transform(self, X):
        """Return ``X`` converted with ``astype(np.uint8)``.

        Fractional parts are discarded by the integer cast; the values are
        not range-checked before conversion.
        """
        return X.astype(np.uint8)

In [81]:
# Column-wise preprocessing: the first five columns are standardised (one
# scaler instance per column), waterfront/view go through CategoryCleaner,
# and bedrooms/bathrooms are cast to small integers.
mapper = DataFrameMapper(
    [
        ([column], StandardScaler())
        for column in ('log_sqft_living', 'distance_from_billgates',
                       'condition', 'grade', 'floors')
    ]
    + [
        (['waterfront'], CategoryCleaner()),
        (['view'], CategoryCleaner()),
        (['bedrooms'], IntConverter()),
        (['bathrooms'], IntConverter()),
    ],
    df_out=True,
)

# Fit on the training split only, then apply the fitted mapper to both splits.
mapper.fit(X_train)
X_train_clean = mapper.transform(X_train)
X_test_clean = mapper.transform(X_test)

In [82]:
# Fit a linear regression on the cleaned training features and predict the
# (log) target for the held-out rows.
lr = LinearRegression()
y_pred = lr.fit(X_train_clean, y_train).predict(X_test_clean)

In [83]:
# R^2 of the fitted model on the training split (the target is log price).
lr.score(X_train_clean, y_train)

0.7711365753812796

In [84]:
# R^2 of the fitted model on the held-out test split (the target is log price).
lr.score(X_test_clean, y_test)

0.764018477563301

In [85]:
# Pair each fitted coefficient with its feature name and list them in
# ascending order of coefficient value.
coef = lr.coef_.tolist()
zip_coef = sorted(zip(coef, X_train_clean.columns))
zip_coef

[(-0.22936903113659712, 'distance_from_billgates'),
 (-0.04005897963199018, 'bedrooms'),
 (0.006982293549336568, 'floors'),
 (0.024835032368500234, 'bathrooms'),
 (0.04679076884408008, 'condition'),
 (0.07688427234907279, 'view'),
 (0.1754086539114694, 'grade'),
 (0.21376423748935275, 'log_sqft_living'),
 (0.40233928795086515, 'waterfront')]

In [None]:
# transform data

In [20]:
def dist_from_bill(lat, long):
    """Distance in km between (lat, long) and Bill and Melinda Gates' Medina residence.

    The distance is computed by ``haversine`` with its default unit.
    """
    gates_residence = (47.62774, -122.24194)  # (latitude, longitude)
    return haversine((lat, long), gates_residence)

In [23]:
# Quick sanity check of the helper with an arbitrary coordinate (lat=1, long=1).
dist_from_bill(1,1)

12329.845149186951