In [12]:
#Packages for general code running
import warnings #to ignore warning codes
warnings.filterwarnings('ignore')
import zipfile # to unzip zip files

# Packages for data processing and manipulation
import pandas as pd   # for data manipulation
import numpy as np    # for numerical processing
import glob           # to retrieve files matching a specified pattern
import geopandas as gpd #for handling geodataframes
from shapely.geometry import Point
from dateutil.relativedelta import relativedelta
from datetime import datetime
from datetime import date  # for date manipulation

# Packages for visualisation
import seaborn as sns  # for advance data visualisation
import matplotlib.pyplot as plt # for basic data visualisation

# Packages for analysis
from statsmodels.formula.api import ols # for did analysis
import scipy.stats as stats # for statistical analysis
from scipy.stats import ttest_ind # for t-test analysis
from statsmodels.iolib.summary2 import summary_col 

import re

import statsmodels.api as sm


In [13]:
temp = pd.read_csv('testing_temp/london0_23_weather_data.csv')
tube_temp = pd.read_csv('testing_temp/lu-average-monthly-temperatures.csv')

In [14]:
temp.columns

Index(['date', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt',
       'pres', 'tsun'],
      dtype='object')

In [15]:
temp.dtypes

date     object
tavg    float64
tmin    float64
tmax    float64
prcp    float64
snow    float64
wdir    float64
wspd    float64
wpgt    float64
pres    float64
tsun    float64
dtype: object

In [16]:
temp['date'] = pd.to_datetime(temp['date'], format='%d/%m/%Y')
# temp['formatted_date'] = temp['date'].dt.strftime('%A %d/%m/%Y')

# Extract year, month, day of week, weekend data
interest_years = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2022]  # define the years of interest
temp['year'] = temp['date'].dt.strftime('%Y').astype(int)
temp = temp[temp['year'].isin(interest_years)]
temp['month'] = temp['date'].dt.strftime('%m').astype(int)
months_to_keep = [6, 7, 8]
temp = temp[temp['month'].isin(months_to_keep)]

average_monthly_tavg = temp.groupby(['year', 'month'])['tavg'].mean().reset_index()
average_monthly_tmax = temp.groupby(['year', 'month'])['tmax'].mean().reset_index()

merged_df = pd.merge(average_monthly_tavg, average_monthly_tmax, on=['year', 'month'], suffixes=('_tavg', '_tmax'))

In [17]:
tube_temp

month_to_number = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

tube_temp['Month'] = tube_temp['Month'].map(month_to_number)


In [18]:
tube_temp

Unnamed: 0,Year,Month,Bakerloo,Central,Jubilee,Northern,Piccadilly,Victoria,Waterloo_and_City,Sub-surface_lines
0,2013,1,21.70,21.00,16.40,19.50,19.30,18.40,16.40,11.90
1,2013,2,21.50,21.00,17.90,19.80,19.00,18.30,17.10,11.30
2,2013,3,21.70,20.90,18.20,20.00,18.70,17.70,16.70,11.40
3,2013,4,23.40,22.80,20.30,21.40,20.30,19.60,18.40,15.30
4,2013,5,25.40,24.70,21.90,22.90,22.40,21.50,20.30,18.30
...,...,...,...,...,...,...,...,...,...,...
115,2022,8,30.40,31.32,26.86,29.15,28.39,31.18,24.96,27.39
116,2022,9,29.21,29.62,24.97,28.14,26.93,30.84,23.43,23.32
117,2022,10,27.58,27.97,23.38,26.93,25.33,29.89,22.21,21.14
118,2022,11,26.02,25.82,21.27,25.31,23.43,27.32,20.27,17.50


In [19]:
temp.drop(['tmin', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt','pres', 'tsun'], axis = 1, inplace = True)

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Merge the dataframes based on 'Year' and 'Month'
merged_data = pd.merge(tube_temp, temp, left_on=['Year', 'Month'], right_on=['year', 'month'])

# Get unique tube lines
tube_lines = ['Bakerloo', 'Central', 'Jubilee', 'Northern', 'Piccadilly', 'Victoria', 'Waterloo_and_City', 'Sub-surface_lines']

# Dictionary to store regression models and their evaluation metrics
reg_models = {}
metrics = {}

# Prepare data and fit regression model for each tube line
for tube_line in tube_lines:
    # Filter data for the current tube line
    data_for_tube_line = merged_data[merged_data[tube_line].notnull()]
    
    # Prepare data for regression
    X = data_for_tube_line.drop(['tavg', 'Year', 'Month', 'year', 'month', 'date', tube_line], axis=1)  # Features (tube temperatures)
    y = data_for_tube_line[tube_line]  # Target variable (tube temperature)
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create and train the regression model
    reg_model = LinearRegression()
    reg_model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = reg_model.predict(X_test)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store the regression model and metrics in the dictionary
    reg_models[tube_line] = reg_model
    metrics[tube_line] = {'Mean Squared Error': mse, 'R-squared': r2}

# Print the evaluation metrics for each tube line
for tube_line, metric_values in metrics.items():
    print(f"Tube Line: {tube_line}")
    print("Mean Squared Error:", metric_values['Mean Squared Error'])
    print("R-squared:", metric_values['R-squared'])
    print("-------------------------")

Tube Line: Bakerloo
Mean Squared Error: 0.0701472195877026
R-squared: 0.9451930058567349
-------------------------
Tube Line: Central
Mean Squared Error: 0.019012844039134624
R-squared: 0.9795068031671504
-------------------------
Tube Line: Jubilee
Mean Squared Error: 0.0369061648082325
R-squared: 0.9543217148810883
-------------------------
Tube Line: Northern
Mean Squared Error: 0.1275338515625483
R-squared: 0.9123102900878262
-------------------------
Tube Line: Piccadilly
Mean Squared Error: 0.038200229310888456
R-squared: 0.9628999649126017
-------------------------
Tube Line: Victoria
Mean Squared Error: 0.1735291470758057
R-squared: 0.9486198358676871
-------------------------
Tube Line: Waterloo_and_City
Mean Squared Error: 0.042511917228633504
R-squared: 0.9531697627768843
-------------------------
Tube Line: Sub-surface_lines
Mean Squared Error: 0.11272215144824532
R-squared: 0.9397691875157601
-------------------------


In [None]:
# #OVERALL MODEL TO PREDICT 

# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score

# temp = temp.drop(['formatted_date'], axis=1)

# # Merge the dataframes based on 'Year' and 'Month'
# merged_data = pd.merge(tube_temp, temp, left_on=['Year', 'Month'], right_on=['year', 'month'])

# # Check for missing values in 'X' and 'y'
# print("Missing values in 'X':", merged_data.drop(['tavg', 'Year', 'Month', 'year', 'month'], axis=1).isnull().sum())
# print("Missing values in 'y':", merged_data['tavg'].isnull().sum())

# # Prepare data for regression
# X = merged_data.drop(['tavg', 'Year', 'Month', 'year', 'month', 'date'], axis=1)  # Features (tube temperatures)
# y = merged_data['tavg']  # Target variable (average temperature)

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Create and train the regression model
# reg_model = LinearRegression()
# reg_model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = reg_model.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print("Mean Squared Error:", mse)
# print("R-squared:", r2)

In [None]:
# import matplotlib.pyplot as plt

# # Plot the actual average temperatures against the predicted average temperatures for the test set
# plt.scatter(y_test, y_pred, alpha=0.5)
# plt.xlabel('Actual Average Temperature')
# plt.ylabel('Predicted Average Temperature')
# plt.title('Actual vs. Predicted Average Temperature')
# plt.show()

In [None]:
# # Calculate the residuals (difference between actual and predicted values)
# residuals = y_test - y_pred

# # Plot a histogram of the residuals
# plt.hist(residuals, bins=30)
# plt.xlabel('Residuals')
# plt.ylabel('Frequency')
# plt.title('Histogram of Residuals')
# plt.show()

In [40]:
from sklearn.impute import SimpleImputer

# Define the range of temperatures from 15°C to 28°C
temperature_range = np.arange(15, 34)

# Dictionary to store regression models and their evaluation metrics
reg_models = {}
metrics = {}

# Prepare data and fit regression model for each tube line
for tube_line in tube_lines:
    # Filter data for the current tube line
    data_for_tube_line = merged_data[merged_data[tube_line].notnull()]
    
    # Prepare data for regression
    X = data_for_tube_line[['tmax']]  # Features (tube temperatures)
    y = data_for_tube_line[tube_line]  # Target variable (tube temperature)
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create and train the regression model
    reg_model = LinearRegression()
    reg_model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = reg_model.predict(X_test)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store the regression model and metrics in the dictionary
    reg_models[tube_line] = reg_model
    metrics[tube_line] = {'Mean Squared Error': mse, 'R-squared': r2}

# Print the evaluation metrics for each tube line
for tube_line, metric_values in metrics.items():
    print(f"Tube Line: {tube_line}")
    print("Mean Squared Error:", metric_values['Mean Squared Error'])
    print("R-squared:", metric_values['R-squared'])
    print("-------------------------")

# Dictionary to store estimated tube temperatures for each tube line
estimated_temperatures = {}

# Loop through each tube line and use the corresponding regression model to predict temperatures
for tube_line, reg_model in reg_models.items():
    # Prepare data for prediction by creating a DataFrame with the temperature range
    X_pred = pd.DataFrame({'tmax': temperature_range})
    
    # Make predictions using the regression model
    y_pred = reg_model.predict(X_pred)
    
    # Store the estimated tube temperatures in the dictionary
    estimated_temperatures[tube_line] = y_pred

# Print the estimated tube temperatures for each tube line
for tube_line, temps in estimated_temperatures.items():
    print(f"Tube Line: {tube_line}")
    for temp, estimated_temp in zip(temperature_range, temps):
        print(f"Temperature: {temp}°C, Estimated Tube Temperature: {estimated_temp:.2f}°C")
    print("-------------------------")


Tube Line: Bakerloo
Mean Squared Error: 1.2767884552951672
R-squared: 0.0024274974425615437
-------------------------
Tube Line: Central
Mean Squared Error: 0.9081621650343511
R-squared: 0.021127719457017413
-------------------------
Tube Line: Jubilee
Mean Squared Error: 0.7805991805341135
R-squared: 0.033862442296567985
-------------------------
Tube Line: Northern
Mean Squared Error: 1.391977816032876
R-squared: 0.04290406510431832
-------------------------
Tube Line: Piccadilly
Mean Squared Error: 1.0008525406468955
R-squared: 0.027972736155076117
-------------------------
Tube Line: Victoria
Mean Squared Error: 3.190193664057753
R-squared: 0.0554170475951955
-------------------------
Tube Line: Waterloo_and_City
Mean Squared Error: 0.931209784411831
R-squared: -0.025801185911235702
-------------------------
Tube Line: Sub-surface_lines
Mean Squared Error: 1.7450075751106302
R-squared: 0.06759032994217529
-------------------------
Tube Line: Bakerloo
Temperature: 15°C, Estimated Tu

In [39]:
estimated_temperatures = {}

# Loop through each tube line and use the corresponding regression model to predict temperatures
for tube_line, reg_model in reg_models.items():
    # Prepare data for prediction by creating a DataFrame with the temperature range
    X_pred = pd.DataFrame({'tmax': temperature_range})
    
    # Make predictions using the regression model
    y_pred = reg_model.predict(X_pred)
    
    # Store the estimated tube temperatures in the dictionary
    estimated_temperatures[tube_line] = y_pred

# Create a DataFrame to store the estimated tube temperatures for each tube line
estimated_temperatures_df = pd.DataFrame(estimated_temperatures, index=temperature_range)

# Print the estimated tube temperatures DataFrame
print(estimated_temperatures_df)

# Save the DataFrame to a CSV file
estimated_temperatures_df.to_csv('estimated_tube_temperatures.csv', index_label='Temperature (°C)')

     Bakerloo    Central    Jubilee   Northern  Piccadilly   Victoria  \
15  28.964549  28.712314  24.823401  26.609413   25.809086  26.639509   
16  29.080916  28.850385  24.954821  26.709746   25.937696  26.775989   
17  29.197283  28.988457  25.086240  26.810079   26.066306  26.912469   
18  29.313650  29.126529  25.217659  26.910411   26.194916  27.048949   
19  29.430017  29.264601  25.349079  27.010744   26.323526  27.185429   
20  29.546384  29.402673  25.480498  27.111076   26.452136  27.321909   
21  29.662751  29.540744  25.611917  27.211409   26.580747  27.458389   
22  29.779118  29.678816  25.743336  27.311742   26.709357  27.594869   
23  29.895485  29.816888  25.874756  27.412074   26.837967  27.731349   
24  30.011852  29.954960  26.006175  27.512407   26.966577  27.867829   
25  30.128219  30.093032  26.137594  27.612739   27.095187  28.004309   
26  30.244586  30.231103  26.269014  27.713072   27.223797  28.140788   
27  30.360953  30.369175  26.400433  27.813405   27