In [62]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
import scipy.stats as stats
from scipy.stats import pearsonr
from tqdm import tqdm
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
df = pd.read_parquet('C://Users/Isi/anaconda3/envs/FireBrigade/MAY24_BDS_INT_Fire_Brigade/data/dataframe_compressed.parquet')

In [5]:
df.head()

Unnamed: 0_level_0,DateTimeCall,TotalResponseTime,CellEastingNorthing100,CellEastingNorthing250,CellEastingNorthing500,CellEastingNorthing1000,CellEastingNorthing5000,CellEastingNorthing10000,DistanceStation,IncidentGroup_Fire,...,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,IsBankholiday,IsWeekend
IncidentNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
235138081,2009-01-01 00:02:27,319.0,528600-176800,528500-176750,528500-176500,528000-176000,525000-175000,520000-170000,1349.649309,0,...,0,0,0,0,0,0,0,0,1,0
2091,2009-01-01 00:04:09,308.0,533700-194400,533750-194250,533500-194000,533000-194000,530000-190000,530000-190000,635.731924,1,...,0,0,0,0,0,0,0,0,1,0
3091,2009-01-01 00:04:57,210.0,507700-182800,507750-182750,507500-182500,507000-182000,505000-180000,500000-180000,379.999487,1,...,0,0,0,0,0,0,0,0,1,0
5091,2009-01-01 00:06:04,233.0,531000-185300,531000-185250,531000-185000,531000-185000,530000-185000,530000-180000,1131.374466,1,...,0,0,0,0,0,0,0,0,1,0
6091,2009-01-01 00:06:30,172.0,529400-185200,529250-185250,529000-185000,529000-185000,525000-185000,520000-180000,534.359746,0,...,0,0,0,0,0,0,0,0,1,0


In [15]:
df_numeric = df.select_dtypes(include=[np.number])
df_numeric = df_numeric.drop(df_numeric.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]', 'timedelta64[ns]']).columns)

In [17]:
corr = df_numeric.corr()

In [27]:
def calculate_p_values(df):
    dfcols = pd.DataFrame(columns=df.columns)
    p_matrix = dfcols.transpose().join(dfcols, how='outer')
    for r in tqdm(df.columns, desc="Calculating p-values", leave=False):
        for c in df.columns:
            _, p = pearsonr(df[r], df[c])
            p_matrix[r][c] = round(p, 4)
    return p_matrix

p_values = calculate_p_values(df_numeric)
alpha = 0.05
significant_features = p_values.index[(p_values['TotalResponseTime'] < alpha) & (p_values['TotalResponseTime'] != np.nan)]
significant_corr = corr.loc[significant_features, 'TotalResponseTime']
print("Significant correlations with 'TotalResponseTime':")
print(significant_corr)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  p_matrix[r][c] = round(p, 4)
                                                                       

Significant correlations with 'TotalResponseTime':
AggregatedPropertyCategory_Outdoor        0.093477
AggregatedPropertyCategory_Residential   -0.062038
AggregatedPropertyCategory_Vehicle        0.064869
CellEastingNorthing2500_502500-175000     0.027118
CellEastingNorthing2500_502500-177500     0.010424
                                            ...   
Weekday_1                                 0.002692
Weekday_2                                 0.004970
Weekday_3                                 0.004540
Weekday_4                                 0.012747
Weekday_6                                -0.019970
Name: TotalResponseTime, Length: 346, dtype: float64




In [74]:
significant_corr = significant_corr[:-1]

In [87]:
names = significant_corr.index.tolist()

In [76]:
print(names)

['AggregatedPropertyCategory_Outdoor', 'AggregatedPropertyCategory_Vehicle', 'DistanceStation', 'IncidentGroup_Special Service', 'StopCodeDescription_Secondary Fire', 'StopCodeDescription_Special Service']


In [77]:
X = df_numeric[names]
y = df_numeric['TotalResponseTime']

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

In [79]:
model = LinearRegression()

In [80]:
model.fit(X_train, y_train)

In [81]:
y_pred = model.predict(X_test)

In [82]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [83]:
print(f"Mean squared error: {mse}")
print(f"R2 score: {r2}")

Mean squared error: 15291.540649471111
R2 score: 0.172409924958132


In [84]:
print(f"Mean absolute error: {mean_absolute_error(y_test, y_pred)}")
print(f"Root Mean squared error: {np.sqrt(mse)}")

Mean absolute error: 85.26575062563825
Root Mean squared error: 123.65896914284508


In [85]:
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)

Model Coefficients: [ 3.56613152e+01  1.15339770e+01  7.46132309e-02  1.64159744e+02
 -1.03470703e+01 -1.41270612e+02]
Model Intercept: 212.56501794680628


In [60]:
pd.crosstab(y_test, y_pred)

col_0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,...,1195.0,1196.0,1196.0,1197.0,1197.0,1197.0,1198.0,1200.0,1200.0,1200.0
TotalResponseTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60.0,1,1,1,1,1,1,1,2,1,1,...,0,0,0,0,0,0,0,0,0,0
61.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195.0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1196.0,0,0,0,0,0,0,0,0,0,0,...,0,1,2,0,0,0,0,0,0,0
1197.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,2,0,0,0,0
1198.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [86]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
print("Cross-validated MSE:", -scores.mean())

Cross-validated MSE: 15239.99760751142
