In [230]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
import folium
import statsmodels.api as sm
import scipy.stats
import numpy as np
from math import sqrt
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler

In [231]:
#set up the metric calculations
def CalcRSqaured(observed, estimated):
    """Calculate the r^2 from a series of observed and estimated target values
    inputs:
    Observed: Series of actual observed values
    estimated: Series of predicted values"""
    
    r, p = scipy.stats.pearsonr(observed, estimated)
    R2 = r **2
    
    return R2

def CalcRMSE(observed, estimated):
    """Calculate Root Mean Square Error between a series of observed and estimated values
    inputs:
    Observed: Series of actual observed values
    estimated: Series of predicted values"""
    
    res = (observed -estimated)**2
    RMSE = round(sqrt(res.mean()), 3)
    
    return RMSE

In [232]:
#read in the cdatasub from the first week
cdatasub = pd.read_csv("data.csv")
#cdatasub.replace([np.inf, -np.inf], np.nan, inplace=True)
#cdatasub.dropna(inplace=True)
cdatasub = cdatasub[~cdatasub['station_destination'].str.contains('Battersea Park')]


In [214]:
cdatasub.head(10)

Unnamed: 0,station_origin,station_destination,flows,population,jobs,distance,unconstrainedEst1,log_population,log_jobs,log_distance,unconstrainedEst2,fitted
0,Abbey Road,Bank and Monument,0,599.1,78549.1,8131.625097,10,6.395429,11.271479,9.003516,50,49.622673
1,Abbey Road,Beckton,1,599.1,442.1,8510.221774,0,6.395429,6.091536,9.049023,1,0.937341
2,Abbey Road,Blackwall,3,599.1,665.1,3775.548872,0,6.395429,6.499937,8.236301,2,2.121621
3,Abbey Road,Canary Wharf,1,599.1,58772.1,5086.61422,18,6.395429,10.981423,8.534368,53,53.30206
4,Abbey Road,Canning Town,37,599.1,15428.1,2229.023167,25,6.395429,9.643946,7.709319,32,32.210949
5,Abbey Road,Crossharbour,1,599.1,1208.1,6686.57556,0,6.395429,7.096804,8.807857,2,2.340318
6,Abbey Road,Custom House,0,599.1,845.1,3824.95563,0,6.395429,6.739455,8.249302,3,2.525158
7,Abbey Road,Cutty Sark,2,599.1,1748.1,8503.998909,0,6.395429,7.466285,9.048292,3,2.668811
8,Abbey Road,Cyprus,7,599.1,850.1,6532.199618,0,6.395429,6.745354,8.784499,2,1.817492
9,Abbey Road,Devons Road,1,599.1,611.1,3958.424171,0,6.395429,6.415261,8.283601,2,1.931506


In [233]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
dbl_form = 'flows ~ station_origin + station_destination + log_jobs + log_distance-1'

doubSim = smf.glm(formula=dbl_form, data=cdatasub, family=sm.families.Poisson()).fit(method="lbfgs")

print(doubSim.summary())





                 Generalized Linear Model Regression Results                  
Dep. Variable:                  flows   No. Observations:                61413
Model:                            GLM   Df Residuals:                    60617
Model Family:                 Poisson   Df Model:                          795
Link Function:                    Log   Scale:                          1.0000
Method:                         lbfgs   Log-Likelihood:            -9.7129e+05
Date:                Wed, 24 Apr 2024   Deviance:                   1.7704e+06
Time:                        17:22:21   Pearson chi2:                 2.48e+06
No. Iterations:                   100   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                                                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------

And the various flows and goodness-of-fit statistics?

In [234]:
#get the estimates
cdatasub["doubsimfitted"] = np.round(doubSim.mu)
#here's the matrix
cdatasubmat7 = cdatasub.pivot_table(values ="doubsimfitted", index="station_origin", columns = "station_destination",
                                    aggfunc=np.sum, margins=True)
cdatasubmat7

  cdatasubmat7 = cdatasub.pivot_table(values ="doubsimfitted", index="station_origin", columns = "station_destination",
  cdatasubmat7 = cdatasub.pivot_table(values ="doubsimfitted", index="station_origin", columns = "station_destination",
  cdatasubmat7 = cdatasub.pivot_table(values ="doubsimfitted", index="station_origin", columns = "station_destination",


station_destination,Abbey Road,Acton Central,Acton Town,Aldgate,Aldgate East,All Saints,Alperton,Amersham,Anerley,Angel,...,Wimbledon,Wimbledon Park,Wood Green,Wood Lane,Wood Street,Woodford,Woodgrange Park,Woodside Park,Woolwich Arsenal,All
station_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey Road,,,,,,,,,,,...,,,,,,,,,26.0,591.0
Acton Central,,,,,,,,,,,...,,,,,,,2.0,,,1223.0
Acton Town,,,,15.0,15.0,,11.0,2.0,,17.0,...,31.0,3.0,5.0,12.0,,2.0,,2.0,,3757.0
Aldgate,,,2.0,,40.0,,,0.0,,19.0,...,7.0,,2.0,2.0,,1.0,,1.0,,2772.0
Aldgate East,,,2.0,49.0,,,1.0,0.0,,21.0,...,8.0,1.0,3.0,2.0,,1.0,,1.0,,3131.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodford,,,7.0,31.0,33.0,,,,,29.0,...,29.0,,10.0,,,,,,,4865.0
Woodgrange Park,,5.0,,,,,,,,,...,,,,,,,,,,513.0
Woodside Park,,,6.0,18.0,17.0,,3.0,,,22.0,...,22.0,,6.0,,,,,,,3132.0
Woolwich Arsenal,19.0,,,,,27.0,,,,,...,,,,,,,,,,7859.0


In [236]:
alpha_i = doubSim.params[0:397]
alpha_j = doubSim.params[398:793]
gamma = doubSim.params[794]
beta = -doubSim.params[795]

  gamma = doubSim.params[794]
  beta = -doubSim.params[795]


In [237]:
CalcRSqaured(cdatasub["flows"],cdatasub["doubsimfitted"])

0.4059713222925471

In [238]:
CalcRMSE(cdatasub["flows"],cdatasub["doubsimfitted"])

101.542