Build a regression model.

In [14]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

import sqlite3
from sqlite3 import Error

def create_connection(path):
    """Open a SQLite connection to the database at ``path``.

    Prints a confirmation message when the connection is opened. If
    ``sqlite3`` raises an ``Error`` while connecting, the error is printed
    and ``None`` is returned instead of a connection object, so callers
    should check the result before using it.
    """
    try:
        db_connection = sqlite3.connect(path)
    except Error as e:
        # Best-effort: report the failure and signal it with None.
        print(f"The error '{e}' occurred")
        return None

    print("Connection to SQLite DB successful")
    return db_connection

# Open the project database; create_connection returns None if the connection fails.
cnx = create_connection('../db/data.db')

# Load every POI row together with the free_bikes value of the station it is joined to.
data_df = pd.read_sql_query(
    sql='SELECT poi.*, stations.free_bikes FROM poi INNER JOIN stations on stations.stationId = poi.stationId',
    con=cnx,
)
data_df.head()

Connection to SQLite DB successful


Unnamed: 0,pointId,review_count,distance,rating,price,name,stationId,free_bikes
0,0,4.0,350.299502,10.0,2.02277,Elefanté,642,7
1,1,11.0,231.18153,10.0,2.02277,Resto Keela,721,3
2,2,7.0,443.975597,10.0,2.02277,Mama Khan,736,11
3,3,1.0,1195.394136,10.0,2.02277,Marché Al Amine,328,11
4,4,1.0,656.954672,10.0,2.02277,Bar Fullum,219,7


In [104]:
# Collapse the POI rows to one row per station by averaging each selected column.
# The result is indexed by stationId; free_bikes (joined in from the stations table)
# is averaged along with the POI columns.
training = data_df.groupby('stationId')[
    ['review_count', 'distance', 'rating', 'price', 'free_bikes']
].mean()
training.head()

Unnamed: 0_level_0,review_count,distance,rating,price,free_bikes
stationId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,26.87387,766.990127,7.757885,2.074531,0.0
1,148.220371,290.300182,7.814286,2.08444,8.0
2,19.760318,917.950983,7.421037,1.94321,3.0
3,31.663494,125.035372,7.470137,1.87666,8.0
4,27.022635,445.110529,7.762293,2.01518,7.0


In [105]:
# Per-station number of rows with a non-null free_bikes value.
counts = data_df.groupby('stationId')['free_bikes'].count()

# Attach the counts as a `count` column.
# The series is named `count` before merging, so no suffix renaming is needed, and any
# `count` column left over from a previous run of this cell is dropped first. Without that,
# re-running the cell would merge a second `free_bikes` column and end up with two
# columns both called `count`.
training = (
    training
    .drop(columns='count', errors='ignore')
    .merge(counts.rename('count'), on='stationId', how='left')
)
training.head()

Unnamed: 0_level_0,review_count,distance,rating,price,free_bikes,count
stationId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.87387,766.990127,7.757885,2.074531,0.0,18
1,148.220371,290.300182,7.814286,2.08444,8.0,14
2,19.760318,917.950983,7.421037,1.94321,3.0,14
3,31.663494,125.035372,7.470137,1.87666,8.0,7
4,27.022635,445.110529,7.762293,2.01518,7.0,9


In [97]:
# Replace rating and review_count with their natural logs, then list the skewness
# of every column from most positive to most negative.
# NOTE(review): this overwrites `training` in place, so running the cell a second time
# takes the log of already-logged values. The feature table displayed before the model
# fit shows un-logged rating/review_count values - confirm whether the model is meant
# to use the transformed columns.
training[['rating', 'review_count']] = np.log(training[['rating', 'review_count']])
training.skew().sort_values(ascending=False)

free_bikes      1.787065
price           1.689884
count           1.108582
distance        0.628690
review_count   -0.494986
rating         -2.777659
dtype: float64

In [106]:
# Predictors for the regression, one row per station.
# Plain column selection raises KeyError when a feature is missing (for example `count`
# if the merge cell has not been run), whereas pd.DataFrame(training, columns=[...])
# would reindex and silently add the missing feature as an all-NaN column.
X = training[['review_count', 'distance', 'rating', 'price', 'count']].copy()

# Response: the per-station free_bikes value.
y = training['free_bikes'].copy()
X.head()

Unnamed: 0_level_0,review_count,distance,rating,price,count
stationId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,26.87387,766.990127,7.757885,2.074531,18
1,148.220371,290.300182,7.814286,2.08444,14
2,19.760318,917.950983,7.421037,1.94321,14
3,31.663494,125.035372,7.470137,1.87666,7
4,27.022635,445.110529,7.762293,2.01518,9


Provide model output and an interpretation of the results. 

In [109]:
# Ordinary least squares of free_bikes (y) on the station-level features (X).
# NOTE(review): no constant column is added to X, so the model is fitted without an
# intercept and the summary reports the uncentered R-squared. Confirm that regression
# through the origin is intended; otherwise fit on sm.add_constant(X).
lin_reg = sm.OLS(endog=y, exog=X)
model = lin_reg.fit()

# Keep the summary object around and print its text rendering.
print_model = model.summary()
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:             free_bikes   R-squared (uncentered):                   0.603
Model:                            OLS   Adj. R-squared (uncentered):              0.601
Method:                 Least Squares   F-statistic:                              211.5
Date:                Mon, 31 Jul 2023   Prob (F-statistic):                   6.13e-137
Time:                        13:44:04   Log-Likelihood:                         -2282.9
No. Observations:                 700   AIC:                                      4576.
Df Residuals:                     695   BIC:                                      4598.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------

# Stretch

How can you turn the regression model into a classification model?