Build a regression model.

In [59]:
from scipy import stats
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [45]:
# Load the aggregated POI information for the bike stations (per the file name)
# and print the column schema of the resulting frame.
poi_csv_path = "aggregated_poi_info_for_100_BikeStns.csv"
df_agg = pd.read_csv(poi_csv_path)
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   station_name                100 non-null    object 
 1   poi_distance_from_bike_stn  100 non-null    float64
 2   free_bikes                  100 non-null    float64
 3   review_count                100 non-null    float64
 4   rating                      100 non-null    float64
dtypes: float64(4), object(1)
memory usage: 4.0+ KB


Provide model output and an interpretation of the results. 

In [46]:
# Shapiro-Wilk normality check for the distance between bike stations and food outlets.
# A p-value below 0.05 means the distances do not follow a normal distribution.
distance_values = df_agg['poi_distance_from_bike_stn']
stat, p = stats.shapiro(distance_values)
p

1.506138005424873e-06

In [47]:
# Shapiro-Wilk normality check for the number of free bikes per station.
# A p-value below 0.05 means the free-bike counts do not follow a normal distribution.
free_bike_values = df_agg['free_bikes']
stat, p = stats.shapiro(free_bike_values)
p

1.815253312997811e-06

In [48]:
# Shapiro-Wilk normality check for the number of reviews of food outlets.
# A p-value below 0.05 means the review counts do not follow a normal distribution.
review_count_values = df_agg['review_count']
stat, p = stats.shapiro(review_count_values)
p

3.4687441807790265e-09

In [61]:
# Shapiro-Wilk normality check for the food-outlet ratings.
# A p-value below 0.05 means the ratings do not follow a normal distribution.
stat, p = stats.shapiro(df_agg['rating'])
# Display the p-value as the cell output (the previous unterminated print(f" was a SyntaxError).
p

6.896773265907541e-05

In [60]:
# Pairwise Pearson correlations between the numeric columns.
# The object-typed station_name column is excluded explicitly: pandas >= 2.0 no longer
# drops non-numeric columns silently in DataFrame.corr() and raises instead.
df_agg.select_dtypes(include="number").corr()

Unnamed: 0,poi_distance_from_bike_stn,free_bikes,review_count,rating
poi_distance_from_bike_stn,1.0,0.004724,-0.273079,0.162732
free_bikes,0.004724,1.0,0.198236,0.101862
review_count,-0.273079,0.198236,1.0,-0.013453
rating,0.162732,0.101862,-0.013453,1.0


In [69]:
import statsmodels.api as sm

In [74]:
# Re-print the column schema before modelling; station_name is the only non-numeric (object) column.
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   station_name                100 non-null    object 
 1   poi_distance_from_bike_stn  100 non-null    float64
 2   free_bikes                  100 non-null    float64
 3   review_count                100 non-null    float64
 4   rating                      100 non-null    float64
dtypes: float64(4), object(1)
memory usage: 4.0+ KB


In [88]:
# Set up simple linear regressions with the number of reviews as the response.
# Candidate predictors: distance from bike station, number of free bikes and restaurant rating.

# Response variable.
y = df_agg['review_count']

# Predictors: every column except the response and the station label.
indep = df_agg.drop(columns=['review_count', 'station_name'])

# One design matrix per predictor, each with an intercept (constant) column added,
# so that every predictor gets its own single-variable model.
X = [sm.add_constant(indep[predictor]) for predictor in indep.columns]

In [76]:
# Fit one OLS model per design matrix and collect the summary statistics.
Models = [sm.OLS(y, design) for design in X]          # unfitted OLS models
Results = [ols_model.fit() for ols_model in Models]   # fitted results
Adj_Rsquared = [fit.rsquared_adj for fit in Results]  # adjusted R-squared of each fit
Pval = [fit.pvalues for fit in Results]               # p-values of each fit's coefficients

In [77]:
# Report adjusted R-squared and coefficient p-values for each single-predictor model.
for predictor, adj_r2, pvalues in zip(indep.columns, Adj_Rsquared, Pval):
    print(f'adj_R2: {adj_r2:.3f}, P-values: {*pvalues,}, column: {predictor}')

adj_R2: 0.065, P-values: (3.121634850069516e-13, 0.0059794480505248535), column: poi_distance_from_bike_stn
adj_R2: 0.029, P-values: (8.223025728361785e-13, 0.04802958661641191), column: free_bikes
adj_R2: -0.010, P-values: (0.46728944870837175, 0.8943198927640976), column: rating


In [None]:
#A very low adjusted R square of 0.065 shows that there is very little relationship between the
#dependent variable 'review_count' and the other variables.

#Lets try with rating as dependent variable and others as independent variables.

In [89]:
# Set up simple linear regressions with the restaurant rating as the response.
# Candidate predictors: distance from bike station, number of free bikes and number of reviews.

# Response variable.
y = df_agg['rating']

# Predictors: every column except the response and the station label.
indep = df_agg.drop(columns=['rating', 'station_name'])

# One design matrix per predictor, each with an intercept (constant) column added,
# so that every predictor gets its own single-variable model.
X = [sm.add_constant(indep[predictor]) for predictor in indep.columns]

In [90]:
# Fit one OLS model per design matrix and collect the summary statistics.
Models = [sm.OLS(y, design) for design in X]          # unfitted OLS models
Results = [ols_model.fit() for ols_model in Models]   # fitted results
Adj_Rsquared = [fit.rsquared_adj for fit in Results]  # adjusted R-squared of each fit
Pval = [fit.pvalues for fit in Results]               # p-values of each fit's coefficients

In [91]:
# Report adjusted R-squared and coefficient p-values for each single-predictor model.
for predictor, adj_r2, pvalues in zip(indep.columns, Adj_Rsquared, Pval):
    print(f'adj_R2: {adj_r2:.3f}, P-values: {*pvalues,}, column: {predictor}')

adj_R2: 0.017, P-values: (3.1787592432447697e-97, 0.10573470932056468), column: poi_distance_from_bike_stn
adj_R2: 0.000, P-values: (5.6640304004255806e-120, 0.3132453669923799), column: free_bikes
adj_R2: -0.010, P-values: (1.2074840246801219e-113, 0.894319892764095), column: review_count


In [None]:
#A very low adjusted R square of 0.017 shows that there is very little relationship between the
#dependent variable 'rating' and the other variables.

#Hence it may be inferred that these variables have at most a weak linear relationship with each other,
#and that no useful linear predictive model can be constructed from them.

# Stretch

How can you turn the regression model into a classification model?