In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

import networkx as nx
import copy
import random
import getpass
import psycopg2 as ps
import os
import re

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from pandas.plotting import register_matplotlib_converters
from statsmodels.tsa.stattools import grangercausalitytests
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import grangercausalitytests


In [2]:
# Connecting to database
df = pd.read_csv()

## Make sure to remove the empty source or target

## We will basically create two time series dfs where the first one is the network where with only one source, and the second is a network the rest of the sources

df_ts_1 = df[df['Source'] == "Enter any unique source"] 
df_ts_2 = df[df['Source'] != "Enter the same source"]

In [46]:
df_ts_2 = df_ts_2.dropna()
df_ts_1 = df_ts_1.groupby('Day').size().dropna()
day_count_rest = df_ts_2.groupby('Day').size().dropna()


In [None]:
plt.figure(figsize=(12, 6))
sns.set(style="whitegrid")  # Optional: Set the style of the plot
sns.lineplot(x=df_ts_1.index, y=df_ts_1.values, marker='o', color='b', label='Count of selected source ')
sns.lineplot(x=day_count_rest.index, y=day_count_rest.values, marker='o', color='y', label='Count of not selected source ')
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Time Series of Day Counts')
plt.xticks(rotation=45)  # Optional: Rotate x-axis labels for better visibility
plt.tight_layout()
plt.show()

In [48]:
scaler = MinMaxScaler()

# Reshape the data to fit the scaler (required for MinMaxScaler)
day_counts1 = df_ts_1.values.reshape(-1, 1)
day_counts2 = day_count_rest.values.reshape(-1, 1)

# Fit and transform the data with MinMaxScaler
normalized_counts1 = scaler.fit_transform(day_counts1)
normalized_counts2 = scaler.fit_transform(day_counts2)


In [49]:
normalized_ts1 = pd.DataFrame(normalized_counts1, index=df_ts_1.index, columns=['Normalized_Count_source'])
normalized_rest_df = pd.DataFrame(normalized_counts2, index=day_count_rest.index, columns=['Normalized_Count_Rest'])
normalized_ts1.dropna()
normalized_rest_df.dropna()

Unnamed: 0_level_0,Normalized_Count_Rest
Day,Unnamed: 1_level_1
2019-05-20,0.000000
2019-05-21,0.002234
2019-05-22,0.004148
2019-05-23,0.002872
2019-05-24,0.005424
...,...
2022-12-28,0.714422
2022-12-29,0.689853
2022-12-30,0.804403
2022-12-31,0.737077


In [None]:
plt.figure(figsize=(12, 6))
sns.set(style="whitegrid")  # Optional: Set the style of the plot
sns.lineplot(x=normalized_ts1.index, y=normalized_ts1['Normalized_Count_source'], marker='o', color='b', label='Count of selected source')
sns.lineplot(x=normalized_rest_df.index, y=normalized_rest_df['Normalized_Count_Rest'], marker='o', color='y', label='Count of non selected source')
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Normalized Time Series of Day Counts')
plt.xticks(rotation=45)  # Optional: Rotate x-axis labels for better visibility
plt.tight_layout()
plt.show()

In [76]:
data = pd.concat([ normalized_rest_df['Normalized_Count_Rest'], normalized_ts1['Normalized_Count_source']], axis=1)
data['Normalized_Count_source'] = data['Normalized_Count_source'].fillna(0)
# Perform Granger causality test
max_lag = 100  # Set the maximum lag for the test
results = grangercausalitytests(data, max_lag, verbose=True)

p_values, F_values = [],[]
for lag, result in results.items():
    p_value = result[0]["ssr_ftest"][1]
    f_value =result[0]["ssr_ftest"][0]
    p_values.append(p_value)
    F_values.append(f_value)




Granger Causality
number of lags (no zero) 1
ssr based F test:         F=5.3939  , p=0.0204  , df_denom=1319, df_num=1
ssr based chi2 test:   chi2=5.4061  , p=0.0201  , df=1
likelihood ratio test: chi2=5.3951  , p=0.0202  , df=1
parameter F test:         F=5.3939  , p=0.0204  , df_denom=1319, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.7416  , p=0.1756  , df_denom=1316, df_num=2
ssr based chi2 test:   chi2=3.4965  , p=0.1741  , df=2
likelihood ratio test: chi2=3.4918  , p=0.1745  , df=2
parameter F test:         F=1.7416  , p=0.1756  , df_denom=1316, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.7634  , p=0.5146  , df_denom=1313, df_num=3
ssr based chi2 test:   chi2=2.3025  , p=0.5121  , df=3
likelihood ratio test: chi2=2.3004  , p=0.5124  , df=3
parameter F test:         F=0.7634  , p=0.5146  , df_denom=1313, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.5302  , p=0.

In [77]:
results_df = pd.DataFrame({'Lag (days)': range(1, max_lag + 1), 'P-Value': p_values, "F-Value" : F_values})
results_df.head()

Unnamed: 0,Lag (days),P-Value,F-Value
0,1,0.020359,5.393876
1,2,0.175641,1.741615
2,3,0.514609,0.763414
3,4,0.713606,0.530151
4,5,0.709213,0.588016


In [78]:
print(results_df[results_df['P-Value'] <= 0.05])

   Lag (days)   P-Value   F-Value
0           1  0.020359  5.393876


In [None]:
df_ts_1.set_index('Day', inplace=True)
df_ts_2.set_index('Day', inplace=True)

In [109]:


# Group by 3-day intervals and count the occurrences
day_count_ts1_3d = df_ts_1.resample('3D').size().dropna() ## Change day length here to your liking
day_count_rest_3_d = df_ts_2.resample('3D').size().dropna()

In [110]:

# Reshape the data to fit the scaler (required for MinMaxScaler)
day_counts1_3d = day_count_ts1_3d.values.reshape(-1, 1)
day_counts2_3d = day_count_rest_3_d.values.reshape(-1, 1)

# Fit and transform the data with MinMaxScaler
normalized_counts1_3d = scaler.fit_transform(day_counts1_3d)
normalized_counts2_3d = scaler.fit_transform(day_counts2_3d)

normalized_source_df_3d = pd.DataFrame(normalized_counts1_3d, index=day_counts1_3d.index, columns=['Normalized_Count_source'])
normalized_rest_df_3d = pd.DataFrame(normalized_counts2_3d, index=day_count_rest_3_d.index, columns=['Normalized_Count_Rest'])


In [None]:
plt.figure(figsize=(12, 6))
sns.set(style="whitegrid")  # Optional: Set the style of the plot
sns.lineplot(x=normalized_source_df_3d.index, y=normalized_source_df_3d['Normalized_Count_source'], marker='o', color='b', label='Count of selected source ')
sns.lineplot(x=normalized_rest_df_3d.index, y=normalized_rest_df_3d['Normalized_Count_Rest'], marker='o', color='y', label='Count of non selected source')
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Normalized Time Series of Edge Counts grouped by 5days')
plt.xticks(rotation=45)  # Optional: Rotate x-axis labels for better visibility
plt.tight_layout()
plt.show()

In [112]:
data = pd.concat([ normalized_rest_df_3d['Normalized_Count_Rest'], normalized_source_df_3d['Normalized_Count_source']], axis=1)
data['Normalized_Count_source'] = data['Normalized_Count_source'].fillna(0)
data['Normalized_Count_Rest'] = data['Normalized_Count_Rest'].fillna(0)

# Perform Granger causality test
max_lag = 100  # Set the maximum lag for the test
results = grangercausalitytests(data, max_lag, verbose=True)

p_values, F_values = [],[]
for lag, result in results.items():
    p_value = result[0]["ssr_ftest"][1]
    f_value =result[0]["ssr_ftest"][0]
    p_values.append(p_value)
    F_values.append(f_value)




Granger Causality
number of lags (no zero) 1
ssr based F test:         F=449.5813, p=0.0000  , df_denom=524, df_num=1
ssr based chi2 test:   chi2=452.1552, p=0.0000  , df=1
likelihood ratio test: chi2=326.4710, p=0.0000  , df=1
parameter F test:         F=449.5813, p=0.0000  , df_denom=524, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=3.3409  , p=0.0362  , df_denom=521, df_num=2
ssr based chi2 test:   chi2=6.7460  , p=0.0343  , df=2
likelihood ratio test: chi2=6.7031  , p=0.0350  , df=2
parameter F test:         F=3.3409  , p=0.0362  , df_denom=521, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=2.3016  , p=0.0763  , df_denom=518, df_num=3
ssr based chi2 test:   chi2=6.9981  , p=0.0720  , df=3
likelihood ratio test: chi2=6.9519  , p=0.0734  , df=3
parameter F test:         F=2.3016  , p=0.0763  , df_denom=518, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.7859  , p=0.1303  

In [113]:
results_df = pd.DataFrame({'Lag (days)': range(1, max_lag + 1), 'P-Value': p_values, "F-Value" : F_values})
print(results_df[results_df['P-Value'] <= 0.05])

    Lag (days)       P-Value     F-Value
0            1  1.660239e-72  449.581266
1            2  3.616419e-02    3.340928
6            7  6.122400e-03    2.863693
7            8  1.629052e-02    2.372849
8            9  2.555835e-02    2.131677
..         ...           ...         ...
84          85  3.777523e-02    1.349587
85          86  4.738618e-02    1.324844
97          98  1.008656e-02    1.466809
98          99  1.542559e-02    1.427640
99         100  2.207489e-02    1.393564

[66 rows x 3 columns]
