Build a regression model.

In [None]:
#number of cafes/bars ~ number of bike stations

# import numpy
import numpy as np
import pandas as pd

In [None]:
# import linear_model and datasets from sklearn
from sklearn import linear_model, datasets
#from sklearn.datasets import fetch_california_housing

# Read both input tables from CSV files given by relative path:
# "algira.csv" -> bike stations, "foursquare.csv" -> bars/cafes.
bikestations, bars_cafes = (
    pd.read_csv(csv_path) for csv_path in ("algira.csv", "foursquare.csv")
)

In [None]:
# Preview the first rows of each table to confirm the column names
# (the two tables name their coordinate columns differently).
for frame in (bikestations, bars_cafes):
    print(frame.head())

   Unnamed: 0                          name  longitude   latitude
0           0                    Cineteatro  -8.630230  39.209869
1           1           Pavilhão Desportivo  -8.624431  39.205402
2           2          Biblioteca Municipal  -8.621210  39.207352
3           3                 Parque Urbano  -8.622314  39.214693
4           4  Piscinas & Estádio Municipal  -8.617121  39.211938
   Unnamed: 0              name  category        lat       lng
0           0      Café Império       Bar  39.209374 -8.630684
1           1   Boutique do Chá  Tea Room  39.210251 -8.629609
2           2     Caffe Central      Café  39.209143 -8.629995
3           3  Casa de Chá Chic      Café  39.209287 -8.629253
4           4       3A É de Vez       Bar  39.209736 -8.631633


In [None]:
# Grid cell size in degrees. 0.01 deg of latitude is ~1.1 km; 0.01 deg of
# longitude is only ~0.86 km at this latitude (~39.2 N), so cells are
# somewhat narrower east-west than north-south.
lat_grid_size = 0.01
lon_grid_size = 0.01

# Integer-valued (float dtype) row/column index of the cell each point falls in.
# np.floor is used rather than truncation so negative longitudes bin correctly.
bikestations['grid_lat'] = np.floor(bikestations['latitude'] / lat_grid_size)
bikestations['grid_lon'] = np.floor(bikestations['longitude'] / lon_grid_size)
# grid_id is the key used by the groupby/merge in the counting cell, so it must
# be created here for the notebook to survive Restart & Run All.
bikestations['grid_id'] = bikestations['grid_lat'].astype(str) + "_" + bikestations['grid_lon'].astype(str)

# Same binning for the bars/cafes table (its coordinate columns are 'lat'/'lng').
bars_cafes['grid_lat'] = np.floor(bars_cafes['lat'] / lat_grid_size)
bars_cafes['grid_lon'] = np.floor(bars_cafes['lng'] / lon_grid_size)
bars_cafes['grid_id'] = bars_cafes['grid_lat'].astype(str) + "_" + bars_cafes['grid_lon'].astype(str)

# preview first rows
print(bikestations.head())
print(bars_cafes.head())

   Unnamed: 0                          name  longitude   latitude  grid_lat  \
0           0                    Cineteatro  -8.630230  39.209869    3920.0   
1           1           Pavilhão Desportivo  -8.624431  39.205402    3920.0   
2           2          Biblioteca Municipal  -8.621210  39.207352    3920.0   
3           3                 Parque Urbano  -8.622314  39.214693    3921.0   
4           4  Piscinas & Estádio Municipal  -8.617121  39.211938    3921.0   

   grid_lon        grid_id  
0    -864.0  3920.0_-864.0  
1    -863.0  3920.0_-863.0  
2    -863.0  3920.0_-863.0  
3    -863.0  3921.0_-863.0  
4    -862.0  3921.0_-862.0  
   Unnamed: 0              name  category        lat       lng  grid_lat  \
0           0      Café Império       Bar  39.209374 -8.630684    3920.0   
1           1   Boutique do Chá  Tea Room  39.210251 -8.629609    3921.0   
2           2     Caffe Central      Café  39.209143 -8.629995    3920.0   
3           3  Casa de Chá Chic      Café  39.2

In [89]:
# Number of bike stations and of bars/cafes in each grid cell.
bike_station_count = bikestations.groupby('grid_id').size().reset_index(name='bike_station_count')
bar_cafes_count = bars_cafes.groupby('grid_id').size().reset_index(name='bar_cafe_count')

# Outer merge keeps cells that appear in only one of the two tables; the missing
# side comes back as NaN (which forces a float column). Fill those with 0 and
# cast both count columns back to integers so they stay true counts.
grid_data = (
    pd.merge(bike_station_count, bar_cafes_count, on='grid_id', how='outer')
    .fillna(0)
    .astype({'bike_station_count': 'int64', 'bar_cafe_count': 'int64'})
)

# preview new DF
print(grid_data.head())

         grid_id  bike_station_count  bar_cafe_count
0  3920.0_-862.0                 0.0               5
1  3920.0_-863.0                 5.0              24
2  3920.0_-864.0                 2.0               9
3  3921.0_-862.0                 1.0               7
4  3921.0_-863.0                 1.0              15


In [90]:
import statsmodels.api as sm

In [91]:
# Sanity check: list grid_data's columns before the regression cell selects them by name.
print(grid_data.columns)

Index(['grid_id', 'bike_station_count', 'bar_cafe_count'], dtype='object')


In [None]:

# Response variable: number of bars/cafes per grid cell.
y = grid_data['bar_cafe_count']

# Design matrix: number of bike stations per grid cell, plus an intercept column.
X = sm.add_constant(grid_data[['bike_station_count']])

# Ordinary least squares fit of y on X, followed by the full results table.
ols_spec = sm.OLS(y, X)
reg_model = ols_spec.fit()
print(reg_model.summary())

                            OLS Regression Results                            
Dep. Variable:         bar_cafe_count   R-squared:                       0.796
Model:                            OLS   Adj. R-squared:                  0.745
Method:                 Least Squares   F-statistic:                     15.61
Date:                Wed, 16 Apr 2025   Prob (F-statistic):             0.0168
Time:                        18:55:10   Log-Likelihood:                -15.828
No. Observations:                   6   AIC:                             35.66
Df Residuals:                       4   BIC:                             35.24
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  4.2952      2

  warn("omni_normtest is not valid with less than 8 observations; %i "


Provide model output and an interpretation of the results. 

In [None]:
# Re-print the summary of the OLS model fitted in the regression cell.
# Reading of the saved output below: R-squared is 0.796 (adjusted 0.745) and the
# F-test p-value is 0.0168, so bike_station_count accounts for most of the
# variation in bar_cafe_count across grid cells in this sample.
# Caveat: the fit uses only 6 observations (4 residual degrees of freedom), so it
# is indicative at best; statsmodels also warns that its normality test is not
# valid with fewer than 8 observations.
print(reg_model.summary())

                            OLS Regression Results                            
Dep. Variable:         bar_cafe_count   R-squared:                       0.796
Model:                            OLS   Adj. R-squared:                  0.745
Method:                 Least Squares   F-statistic:                     15.61
Date:                Wed, 16 Apr 2025   Prob (F-statistic):             0.0168
Time:                        18:55:38   Log-Likelihood:                -15.828
No. Observations:                   6   AIC:                             35.66
Df Residuals:                       4   BIC:                             35.24
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  4.2952      2

  warn("omni_normtest is not valid with less than 8 observations; %i "


# Stretch

How can you turn the regression model into a classification model?

In [None]:
2.7 Regression vs. Classification
Classification and regression algorithms are both supervised learning methods: they are used for prediction in statistics and machine learning, and both are trained on labelled datasets.

The key distinction is the type of value being predicted: regression algorithms predict continuous values (such as price, income, or age), whereas classification algorithms predict discrete categories (such as true or false, or spam or not spam).

When to Apply a Regression Model
Regression algorithms are used to determine continuous values such as price, income, age, etc.

When to Apply a Classification Model
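Classification algorithms are used to forecast or classify distinct values such as True or False, Spam or Not Spam, etc. For the model in this notebook, that means replacing the continuous target with a categorical one — for example, labelling each grid cell as "high" or "low" bar/café density relative to the median count — and fitting a classifier such as logistic regression on those labels.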