In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the source workbook into a DataFrame.
data = pd.read_excel('/Data/Sameday.xlsx')

# Keep only complete rows: a row is removed if any of its cells is missing.
df = data.dropna(axis=0, how='any')

# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Date,Sin(Day of Year),Cos(day of Year),E.coli,Water Temperature,Turbidity,Specific Conductance,Streamflow,Gage Height,Upstream Tributary Streamflow,...,contagion,area_mn,perimeter_mn,shape_index_mn,fractal_dimension_mn,euclidean_nearest_neighbor_mn,time_diff_days_global,month,season,Study_Area
0,05/14/2007,0.741222,-0.67126,35,14.1,2.8,54,23.757795,0.585216,0.972682,...,49.381391,6.454418,1291.116919,1.434223,1.06448,112.752503,4,5,2,1
1,05/16/2007,0.717677,-0.696376,30,11.6,3.6,51,30.865312,0.71628,1.046306,...,49.381391,6.454418,1291.116919,1.434223,1.06448,112.752503,2,5,2,1
2,05/17/2007,0.705584,-0.708627,77,11.8,5.2,51,31.714816,0.73152,0.988823,...,49.381391,6.454418,1291.116919,1.434223,1.06448,112.752503,1,5,2,1
3,05/30/2007,0.53073,-0.847541,150,11.9,8.5,47,65.694976,1.197864,0.678754,...,49.381391,6.454418,1291.116919,1.434223,1.06448,112.752503,7,5,2,1
4,05/31/2007,0.516062,-0.856551,80,11.7,7.2,47,68.526656,1.240536,0.69631,...,49.381391,6.454418,1291.116919,1.434223,1.06448,112.752503,1,5,2,1


In [2]:
# Define target variable
y = df["E.coli"]

# Build the predictor matrix: remove the target and the raw date column, plus
# any "Unnamed: N" column. pandas gives that name to a column whose header
# cell is empty (typically a row index written out by an earlier export); the
# preview of `df` shows one, and a row counter must not be ranked as a
# candidate feature.
index_artifact_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]
X = df.drop(columns=["E.coli", "Date"] + index_artifact_cols)

In [12]:
#  1. Spearman's Correlation
def spearman_pvalue_selection(X, y, correlation_threshold=0.3, p_threshold=0.01,
                              output_path="spearman_feature_selection.csv"):
    """Select columns of ``X`` by Spearman rank correlation with ``y``.

    A column is selected when the absolute value of its Spearman coefficient
    is strictly greater than ``correlation_threshold`` and its p-value is
    strictly less than ``p_threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Target values, passed to ``scipy.stats.spearmanr`` together with each
        column of ``X``.
    correlation_threshold : float, default 0.3
        Exclusive lower bound on the absolute correlation.
    p_threshold : float, default 0.01
        Exclusive upper bound on the p-value.
    output_path : str or path-like or None, default "spearman_feature_selection.csv"
        Where the full table (every column of ``X``, not only the selected
        ones) is written as CSV. Pass ``None`` to skip writing the file.

    Returns
    -------
    list
        Names of the selected columns, in the column order of ``X``.
    """
    # Unpack each result into (coefficient, p-value) explicitly instead of
    # relying on how the scipy result object is expanded into DataFrame columns.
    correlations = {}
    for col in X.columns:
        rho, p_value = spearmanr(X[col], y)
        correlations[col] = (rho, p_value)

    spearman_df = pd.DataFrame.from_dict(
        correlations, orient='index', columns=['Spearman Correlation', 'p-value']
    )
    spearman_df["Abs_Spearman"] = spearman_df["Spearman Correlation"].abs()

    # NaN coefficients or p-values compare False on both tests, so such
    # columns are never selected.
    is_strong = spearman_df["Abs_Spearman"] > correlation_threshold
    is_significant = spearman_df["p-value"] < p_threshold
    selected_features = spearman_df[is_strong & is_significant].index.tolist()

    if output_path is not None:
        spearman_df.to_csv(output_path)
    return selected_features

selected_features_spearman = spearman_pvalue_selection(X, y)

pd.DataFrame(selected_features_spearman, columns=['Selected_Features']).to_csv('spearman_selected_features.csv', index=False)

# spearman_pvalue_selection() keeps its correlation table local and only
# persists it to CSV, so load that file here. Without this the print below
# raises NameError on a fresh kernel, because `spearman_df` is otherwise first
# assigned in a later cell.
spearman_df = pd.read_csv("spearman_feature_selection.csv", index_col=0)

# Show correlation + p-values for selected features
print(" Spearman Correlation and p-values for Selected Features:")
print(spearman_df.loc[selected_features_spearman])

📊 Spearman Correlation and p-values for Selected Features:
                               Spearman Correlation       p-value  \
Water Temperature                          0.453690  1.110625e-52   
Turbidity                                  0.572761  1.529431e-89   
Upstream Tributary Streamflow              0.490050  1.978507e-62   
Relative Humidity                          0.333267  9.442984e-28   
Total Area                                -0.344885  1.004749e-29   
Agricultural Area                         -0.310309  4.281190e-24   
Imperviousness                             0.328536  5.684005e-27   
Elevation                                 -0.380905  2.156787e-36   
Slope                                      0.380905  2.156787e-36   
Topographic Wetness Index                 -0.380905  2.156787e-36   
entropy                                   -0.328730  5.282766e-27   
shannon_diversity_index                   -0.328730  5.282766e-27   
mutual_information                        -0

In [4]:
# Reload the full correlation table that spearman_pvalue_selection() wrote to
# disk; the first CSV column (the feature names) becomes the index.
spearman_df = pd.read_csv("spearman_feature_selection.csv", index_col=0)

In [None]:
# 2. Add domain knowledge features
domain_features = [
    'Cos(day of Year)', 'Gage Height', 'Precipitation',
    'Sin(Day of Year)', 'Solar Radiation', 'Streamflow',
    'Wind Speed', 'month', 'season', 'time_diff_days_global'
]  # Domain-knowledge features should only be added when they are considered relevant.

# Correlation values + p-values for the domain features present in the table
domain_corr = spearman_df.loc[spearman_df.index.intersection(domain_features)]
print("Domain Knowledge Feature Correlations:")
print(domain_corr)

# Merge Spearman-selected + domain knowledge features.
# dict.fromkeys() removes duplicates while keeping first-seen order. The
# previous list(set(...)) could yield a different column order in every kernel
# session (string hashing is randomized per process), so the saved CSV layout
# was not reproducible.
final_features = list(dict.fromkeys(selected_features_spearman + domain_features))

# Keep essential columns
columns_to_keep = final_features + ['Date', 'E.coli']

# Create final dataset
selected_data = df[columns_to_keep]
selected_data.to_csv('selected_dataset.csv', index=False)

print("✅ Final dataset saved with Spearman + Domain Knowledge features.")



Domain Knowledge Feature Correlations:
                       Spearman Correlation       p-value  Abs_Spearman
Sin(Day of Year)                  -0.181939  5.281542e-09      0.181939
Cos(day of Year)                  -0.257158  8.571473e-17      0.257158
Streamflow                         0.192989  5.673724e-10      0.192989
Gage Height                       -0.241719  5.832673e-15      0.241719
Precipitation                      0.131885  2.498081e-05      0.131885
Solar Radiation                    0.097517  1.868348e-03      0.097517
Wind Speed                        -0.114801  2.472645e-04      0.114801
time_diff_days_global              0.081571  9.324890e-03      0.081571
month                              0.129486  3.508671e-05      0.129486
season                             0.169034  6.038089e-08      0.169034
✅ Final dataset saved with Spearman + Domain Knowledge features.


In [8]:
import pandas as pd
from IPython.display import display

# Columns of the final dataset: Spearman-selected + domain-knowledge features, plus 'Date' and 'E.coli'
print(selected_data.columns.tolist())


['Imperviousness', 'mutual_information', 'Agricultural Area', 'Topographic Wetness Index', 'euclidean_nearest_neighbor_mn', 'Elevation', 'Wind Speed', 'Sin(Day of Year)', 'shannon_diversity_index', 'Gage Height', 'Water Temperature', 'Precipitation', 'relative_mutual_information', 'Slope', 'Study_Area', 'entropy', 'Streamflow', 'Solar Radiation', 'month', 'Total Area', 'Cos(day of Year)', 'season', 'time_diff_days_global', 'Relative Humidity', 'Turbidity', 'Upstream Tributary Streamflow', 'Date', 'E.coli']
