This notebook will apply a general linear regression model to the data

In [1]:
import numpy as np
import pandas as pd
import pickle
import plotly.express as px

In [2]:
raw_df = pd.read_pickle("dataset.pickle")

In [90]:
# preprocessing


raw_df = raw_df.reset_index(drop=True)
df = pd.DataFrame()
df['price'] = raw_df['price']
df['date'] = raw_df['date'].astype(int)/1e9
df['latitude'] = raw_df['latitude']
df['longitude'] = raw_df['longitude']
df['area'] = raw_df['area']
df['bedrooms'] = raw_df['bedrooms']
df['pets'] = raw_df['pets']
df['furnished'] = raw_df['furnished']
df['unit_type'] = raw_df['unit_type']

# categorical variable one-hot encoded
df['unit_type'] = pd.Categorical(df['unit_type'])
dfDummies = pd.get_dummies(df['unit_type'], prefix = 'unit_type_')
df = pd.concat([df, dfDummies], axis=1)
df = df.drop(columns='unit_type')

# one-hot encoding using sklearn
# from sklearn import preprocessing
# from sklearn.impute import KNNImputer
# enc = preprocessing.OneHotEncoder()
# nan = np.nan
# #df['unit_type'] = pd.Categorical(df['unit_type'])
# df['unit_type'] = df['unit_type'].replace({None:"None"})
# X = df['unit_type'].to_numpy().reshape(-1,1)
# X = np.concatenate((df.drop(columns='unit_type').to_numpy(),enc.fit_transform(X).toarray()),axis=1) # this has a bug
# imputer = KNNImputer(n_neighbors=10, weights="uniform")
# X = imputer.fit_transform(X)


X = df.drop(columns='price').to_numpy()
y = df['price'].to_numpy().reshape([-1,1])

print("X Columns: {}",format(df.columns))
print(X.shape)
print(y.shape)

X Columns: {} Index(['price', 'date', 'latitude', 'longitude', 'area', 'bedrooms', 'pets',
       'furnished', 'unit_type__apartment', 'unit_type__condo',
       'unit_type__house', 'unit_type__townhouse'],
      dtype='object')
(306812, 11)
(306812, 1)


In [91]:
# scale the dataset

In [92]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [93]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# # Load the diabetes dataset
# diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

Coefficients: 
 [[ 2.24153119e-06  2.38223509e+03 -1.53534785e+03  8.35046473e-01
   2.44765622e+02  2.00927935e+02  2.00927935e+02 -1.89584082e+01
   3.46131802e+02 -4.44499266e+02  1.17325873e+02]]
Mean squared error: 383712.94
Coefficient of determination: 0.65


In [95]:
# visualize results
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import pandas as pd
from sklearn import linear_model, datasets

# diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# regr = linear_model.LinearRegression()

## Train the model using the training sets
# regr.fit(diabetes_X, diabetes_y)

## Make predictions using the testing set
# y_pred = regr.predict(diabetes_X)

# plot_lin_reg(diabetes_X,diabetes_y,regr)

def plot_lin_reg (X,y,regr,x_columns=[],y_columns=[]):
    # put stuff into a df
    data = np.concatenate((X,y),axis=1)
    x_columns = x_columns or list(map(lambda x: "x_" + str(x),range(X.shape[1])))
    y_columns = y_columns or list(map(lambda x: "y_" + str(x),range(y.shape[1])))
    df = pd.DataFrame(data=data,columns=x_columns+y_columns)
    
    # make predictions
    y_pred = regr.predict(X)
    df['y_pred'] = y_pred
    
    num_columns = len(x_columns)
    grid_size = int(np.ceil(np.sqrt(num_columns)))
    assert grid_size <= 4, "Too many variables to plot"
    fig = make_subplots(rows=grid_size,cols=grid_size)
    row = 1
    col = 1
    y_name = y_columns[0]
    
    for i,col_name in enumerate(x_columns):
    
        # control the data for all the other variables than the one we are looking at
        X_control = np.copy(X)
        X_control[:,i] = np.zeros(X.shape[0]) # set the variable to zeros
        control = regr.predict(X_control)# - regr.intercept_
        y_control = y - control # subtract off the effects of the other variables
        df['y_control'] = y_control
    
        # plot the line for just the active variable
        X_regr = np.zeros(X.shape)
        X_regr[:,i] = X[:,i]
        y_regr = regr.predict(X_regr) - regr.intercept_[0]
        df['y_regr'] = y_regr 
    
        print("{}: y = {:3f}*{}".format(col_name,regr.coef_[0][i],col_name))
    
        # residuals
    
        residuals = y_regr - y_control
        df['residuals'] = residuals
    
        # finally, let's plot it all

        plot_data = df.sample(n=1000,axis=0) if len(df) > 1000 else df # if there are more than 1,000 points, just use a sample
        plot_data = plot_data.sort_values(by=col_name)
    
    
        #fig.add_trace(go.Scatter(x=plot_data[col_name],y=plot_data['price'],name='Real Price',mode='markers',visible='legendonly'),row=row,col=col)
        fig.add_trace(go.Scatter(x=plot_data[col_name],y=plot_data['y_control'],mode='markers',name='{} (controlled)'.format(y_name)),row=row,col=col)
        fig.add_trace(go.Scatter(x=plot_data[col_name],y=plot_data['y_regr'],mode='lines',name='Linear fit'),row=row,col=col)
        #fig.add_trace(go.Scatter(x=plot_data['area'],y=plot_data['residuals'],mode='markers',name='Residuals'),row=2,col=1)
        #fig.update_yaxes(title_text="Price ($)", row=row, col=col)
        fig.update_xaxes(title_text=col_name, row=row, col=col)
        
        #fig.update_yaxes(title_text="Price ($)", row=2, col=1)
        
        if row < grid_size:
            row = row + 1
        else:
            row = 1
            col = col + 1
            
    fig.update_layout(title='Linear Model Summary')
    fig.update_layout(showlegend=False)
    fig.show(renderer='browser')

col_names = list(df.columns)
col_names.remove('price')
plot_lin_reg(X_test,y_test,regr,x_columns=col_names,y_columns=['price'])

date: y = 0.000002*date
latitude: y = 2382.235085*latitude
longitude: y = -1535.347846*longitude
area: y = 0.835046*area
bedrooms: y = 244.765622*bedrooms
pets: y = 200.927935*pets
furnished: y = 200.927935*furnished
unit_type__apartment: y = -18.958408*unit_type__apartment
unit_type__condo: y = 346.131802*unit_type__condo
unit_type__house: y = -444.499266*unit_type__house
unit_type__townhouse: y = 117.325873*unit_type__townhouse
