# start_pakistan_correlations
## HW_computeCorrelDailyTemps.ipynb
This script computes pairwise Pearson correlations between locations from the GFS daily 2-meter temperature values. The 2004-2022 dataset was provided by Ross Maidment (University of Reading).

In [72]:
from pathlib import Path
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import date, timedelta
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats.stats import pearsonr

  from scipy.stats.stats import pearsonr


In [2]:
# Root data folder: the per-city extracts are read from here and the
# correlation matrices are written back underneath it.
rootPath = Path(
    'C:/Users/alexa/Documents/02_work/02_start/02_deliv/05_pk_correlation/hw/data'
)

## Define the monitoring window
April to July inclusive

In [3]:
# Calendar month numbers of the monitoring window (the stop value 8 is exclusive)
firstMonth, stopMonth = 4, 8
months = [*range(firstMonth, stopMonth)]
months

[4, 5, 6, 7]

## Load the daily GFS data

In [4]:
# Each site has its own sub-folder under city_extracts_2023; the folder name is the site name
dataPath = rootPath/'city_extracts_2023'

# iterdir() yields entries in arbitrary order, so sort for a reproducible site order.
# Only directories are kept: the loader reads <site>/gfsanl_daily_<site>_allyears.csv,
# so a stray file in this folder could never be a valid site.
sites = sorted(item.stem for item in dataPath.iterdir() if item.is_dir())

In [33]:
# Build one dataframe of daily 2 m temperature ('t2m_cel') with one column per site,
# indexed by date, plus a 'month' column used later to select the monitoring window.

# Seed the shared daily index with the full date range
startDate = date(2004, 4, 1)
endDate = date(2022, 12, 31)
allDailyData = pd.DataFrame(index=pd.date_range(startDate, endDate, freq='d', name='time'))
keepCols = ['t2m_cel']

# Loop through sites
for site in sites:

    # Read the daily data and index it by timestamp
    siteFile = dataPath/'{0}/gfsanl_daily_{1}_allyears.csv'.format(site, site)
    siteData = pd.read_csv(siteFile)
    siteData.index = pd.to_datetime(siteData.time)

    # Name the temperature column after the site *before* merging. Merging several
    # frames that all carry the same column name makes pandas generate colliding
    # '_x'/'_y' suffixes (a warning in older pandas, an error in newer releases) and
    # forces a fragile positional relabel of the columns afterwards.
    siteData = siteData[keepCols].rename(columns={keepCols[0]: site})

    # Outer merge keeps every date present in either the seed range or the site file;
    # dates missing for a site are left as NaN.
    # Month filtering is deliberately not done here: it is applied when the
    # correlations are computed, so the all-months values stay available.
    allDailyData = allDailyData.merge(siteData, left_index=True, right_index=True, how='outer')

# Month number of each row
allDailyData['month'] = allDailyData.index.month

  allDailyData = allDailyData.merge(siteData, left_index=True, right_index=True, how='outer')
  allDailyData = allDailyData.merge(siteData, left_index=True, right_index=True, how='outer')


## Compute correlation of daily values for all city pairs

In [52]:
# Build every ordered (site1, site2) combination, self-pairs included, as two
# parallel lists: sites1 repeats each site len(sites) times in a row, while
# sites2 cycles through the full site list once per entry of sites.
sites1 = [outerSite for outerSite in sites for _ in sites]
sites2 = [innerSite for _ in sites for innerSite in sites]

In [85]:
# For every site pair, compute the Pearson correlation of the daily values twice:
# over all months, and over the monitoring months only. Two tables are produced:
#   correlsData       - coefficients from pandas Series.corr
#   correlsResultData - coefficients and 2-tailed p-values from scipy's pearsonr
correlsColumns = ['site1', 'site2', 'correlAllMonths', 'correlSubsetMonths']
correlsResultColumns = ['site1', 'site2', 'pearsonAllMonths', 'pAllMonths',
                        'pearsonSubsetMonths', 'pSubsetMonths']


def _pearsonWithP(seriesA, seriesB):
    """Return [Pearson's r, 2-tailed p-value] for two aligned series.

    Only rows where both series have a value are used. Series.corr already
    ignores incomplete pairs, whereas pearsonr does not, so masking here keeps
    the two calculations consistent when a site has missing days.
    """
    valid = seriesA.notna() & seriesB.notna()
    r, p = stats.pearsonr(seriesA[valid], seriesB[valid])
    return [r, p]


# The monitoring-month subset is the same for every pair, so select it once
windowData = allDailyData[allDailyData.month.isin(months)]

# Collect plain rows and build each dataframe once after the loop; concatenating
# inside the loop re-copies the growing table on every pair and leaves every row
# with the same index label.
correlRows = []
correlResultRows = []
for name1, name2 in zip(sites1, sites2):

    # Correlation over all months
    series1 = allDailyData[name1]
    series2 = allDailyData[name2]
    correl = series1.corr(series2)
    correlResult = _pearsonWithP(series1, series2)

    # Correlation again on only the monitoring months
    window1 = windowData[name1]
    window2 = windowData[name2]
    correlWindow = window1.corr(window2)
    correlResultWindow = _pearsonWithP(window1, window2)

    # Record the results
    correlRows.append([name1, name2, correl, correlWindow])
    correlResultRows.append([name1, name2] + correlResult + correlResultWindow)

correlsData = pd.DataFrame(correlRows, columns=correlsColumns)
correlsResultData = pd.DataFrame(correlResultRows, columns=correlsResultColumns)

In [69]:
# Reshape the long pair table into square site-by-site matrices:
# rows are site1, columns are site2, cells hold the chosen correlation.
pivotAxes = {'index': 'site1', 'columns': 'site2'}
correlsMatrixAllMonths = correlsData.pivot(values='correlAllMonths', **pivotAxes)
correlsMatrix = correlsData.pivot(values='correlSubsetMonths', **pivotAxes)

# Show the monitoring-months matrix
correlsMatrix

site2,Jacobabad,Karachi_Jinnah_Airport,Lahore,Multan,Nawabshah,Sibi
site1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Jacobabad,1.0,0.551218,0.873296,0.938699,0.757113,0.918346
Karachi_Jinnah_Airport,0.551218,1.0,0.483066,0.515729,0.671789,0.506875
Lahore,0.873296,0.483066,1.0,0.929401,0.606051,0.828083
Multan,0.938699,0.515729,0.929401,1.0,0.643561,0.894327
Nawabshah,0.757113,0.671789,0.606051,0.643561,1.0,0.687022
Sibi,0.918346,0.506875,0.828083,0.894327,0.687022,1.0


In [71]:
# Save both correlation matrices as csv files in a 'correlation' folder under the root path
outPath = rootPath/'correlation'
outPath.mkdir(exist_ok=True)

matricesToWrite = (
    ('correlsMatrix.csv', correlsMatrix),
    ('correlsMatrixAllMonths.csv', correlsMatrixAllMonths),
)
for csvName, matrix in matricesToWrite:
    matrix.to_csv(outPath/csvName)

### Check statistical significance of the correlations
All the p-values are essentially zero, so the correlations are highly statistically significant. This is because each series used to compute a correlation contains several thousand data points.

In [90]:
# Display the per-pair Pearson coefficients and p-values (all months and monitoring months)
correlsResultData

Unnamed: 0,site1,site2,pearsonAllMonths,pAllMonths,pearsonSubsetMonths,pSubsetMonths
0,Jacobabad,Jacobabad,1.0,0.0,1.0,0.0
0,Jacobabad,Karachi_Jinnah_Airport,0.901031,0.0,0.551218,0.0
0,Jacobabad,Lahore,0.971893,0.0,0.873296,0.0
0,Jacobabad,Multan,0.987831,0.0,0.938699,0.0
0,Jacobabad,Nawabshah,0.968995,0.0,0.757113,0.0
0,Jacobabad,Sibi,0.982571,0.0,0.918346,0.0
0,Karachi_Jinnah_Airport,Jacobabad,0.901031,0.0,0.551218,0.0
0,Karachi_Jinnah_Airport,Karachi_Jinnah_Airport,1.0,0.0,1.0,0.0
0,Karachi_Jinnah_Airport,Lahore,0.873734,0.0,0.483066,0.0
0,Karachi_Jinnah_Airport,Multan,0.882881,0.0,0.515729,0.0


### Note
These are Pearson r correlation values. The coefficient of determination (r-squared) can be obtained by squaring them.