In [None]:
!pip install pymannkendall
import pymannkendall as mk
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, "../src")
from Nonstationary_MK import *

# Specifications

In [None]:
# Variable to analyse and the suffix used for it; 'default' is a sentinel that is
# resolved at the bottom of this cell.
var = 'default'
name = ''

# Switches read by later cells: whether to write results, and the minimum number
# of unique data points required per county.
save = True
min_data = 10

# Clustering parameters; together they spell the name of the cluster-label column.
space_thres = 3  # Spatial threshold
time_thres = 5   # Temporal threshold
num_thres = 7    # Minimum number of points to form a cluster
cluster = '_'.join(['st_cluster', str(space_thres), str(time_thres), str(num_thres)])

# Resolve the 'default' sentinel to the MSWEP 1-day return-period column.
var, name = ('returnPeriod_MSWEP_1d', '_MSWEP') if var == 'default' else (var, name)

# Load Data

In [None]:
# Read the combined, filtered claims table.
processed_claims_file = "../../no_percentile_filter/final_filtered_claims.csv"
processed_claims_df = pd.read_csv(processed_claims_file)

# County codes: cast through int to str, then left-pad with zeros to 5 characters.
county_code_text = processed_claims_df['countyCode'].astype(int).astype(str)
processed_claims_df['countyCode'] = county_code_text.apply(lambda code: code.zfill(5))

# Parse the loss date once, store it back, and derive an ordinal day number from
# it to serve as the numeric time axis.
loss_dates = pd.to_datetime(processed_claims_df['dateOfLoss'])
processed_claims_df['dateOfLoss'] = loss_dates
processed_claims_df['time'] = loss_dates.map(pd.Timestamp.toordinal)

In [None]:
def _has_enough_points(county_rows):
    """Return True when a county has at least `min_data` unique times and unique `var` values."""
    enough_times = county_rows['time'].nunique() >= min_data
    return enough_times and county_rows[var].nunique() >= min_data


# Keep only the counties that pass the minimum-data check.
claims = processed_claims_df.groupby('countyCode').filter(_has_enough_points)

In [None]:
# Per-county Mann-Kendall trend test on `var`, with each county's observations
# ordered by time. `results` maps countyCode -> dict of summary fields.
results = {}

# Fields copied from each county's test output into the results table. Both code
# paths below (random sampling and the direct test) must supply all of them.
result_fields = (
    "trend", "p-value", "slope", "significant",
    "sample_size", "test_statistic", "effect_size",
)

for county, group in claims.groupby('countyCode'):
    print(f"Processing county: {county} for {name}...")

    group = group.sort_values('time')  # Ensure sorted by time
    time_values = group['time'].values
    var_values = group[var].values

    # Skip counties with insufficient data
    if len(time_values) < 2 or len(np.unique(time_values)) < 2:
        print(f"Skipping county: {county} for {name} (insufficient data)")
        continue

    if len(var_values) > 75000:
        # Random sampling for large datasets. `random_sampling_mk` comes from the
        # star import of Nonstationary_MK and must return a mapping that contains
        # every key in `result_fields`.
        print(f"County {county} has {len(var_values)} records. Using random sampling...")
        mk_results = random_sampling_mk(var_values)
    else:
        # Only the test call itself is guarded, so an unrelated error is not
        # mistaken for a failed test.
        try:
            result = mk.original_test(var_values)
        except ZeroDivisionError:
            print(f"Skipping county: {county} for {name} (Mann-Kendall Test failed due to zero-division)")
            continue

        # The former "slope +/- 1.96 * intercept" interval and the "degrees of
        # freedom" value were removed: the intercept is not a standard error, and
        # neither value was ever written to `results`.
        mk_results = {
            "trend": result.trend,
            "p-value": result.p,
            "slope": result.slope,  # Sen's slope
            "significant": result.p < 0.05,
            "sample_size": len(var_values),
            "test_statistic": result.Tau,      # Kendall's Tau
            "effect_size": abs(result.Tau),    # Absolute Tau
        }

    results[county] = {field: mk_results[field] for field in result_fields}

    print(f"Finished processing county: {county} for {name}")

# Convert unified results to a DataFrame with countyCode as a regular column.
results_df = (
    pd.DataFrame.from_dict(results, orient="index")
    .reset_index()
    .rename(columns={"index": "countyCode"})
)

In [None]:
# Write the county-level trend table only when saving is enabled.
if save:
    claim_trends_path = "Nonstationary_MK/claim_trends" + name + ".csv"
    results_df.to_csv(claim_trends_path)

# Load Clusters

In [None]:
# Read the clustered claims and normalise county codes: cast through int to str,
# then left-pad with zeros to 5 characters.
claims_clusters = pd.read_csv('Clusters/no_percentile_filter/clustered_claims_sensitivity.csv')
cluster_county_text = claims_clusters['countyCode'].astype(int).astype(str)
claims_clusters['countyCode'] = cluster_county_text.apply(lambda code: code.zfill(5))

In [None]:
# Parse the loss dates so a median date can be taken per cluster.
claims_clusters['dateOfLoss'] = pd.to_datetime(claims_clusters['dateOfLoss'])

# One output row per cluster label: median loss date, mean of `var`, and row count.
summary_spec = {
    'median_dateOfLoss': pd.NamedAgg(column='dateOfLoss', aggfunc='median'),
    'mean_returnPeriod': pd.NamedAgg(column=var, aggfunc='mean'),
    'cluster_size': pd.NamedAgg(column='dateOfLoss', aggfunc='size'),
}
claims_by_cluster = claims_clusters.groupby(cluster)
cluster_summary = claims_by_cluster.agg(**summary_spec).reset_index()

In [None]:
# Convert each cluster's median loss date to an ordinal day number for numeric use.
cluster_summary['ordinal_date'] = cluster_summary['median_dateOfLoss'].map(pd.Timestamp.toordinal)

# Order clusters chronologically; the three series extracted below share this order.
cluster_summary = cluster_summary.sort_values('ordinal_date')

# Extract time series
dates = cluster_summary['ordinal_date']
sizes = cluster_summary['cluster_size']
returns = cluster_summary['mean_returnPeriod']

# Perform the Mann-Kendall Test
# NOTE(review): `dates` was sorted ascending just above, so a trend test on the
# date values themselves can only confirm that ordering - confirm this is the
# intended series.
mk_result_dates = mk.original_test(dates)

# Compute additional details
sample_size = len(dates)
test_statistic = mk_result_dates.Tau  # Kendall's Tau
effect_size = abs(mk_result_dates.Tau)  # Effect size in Kendall's Tau (absolute value)

# Approximate 95% confidence interval for Sen's slope, using the rank-based
# method: sort all pairwise slopes and read the order statistics at ranks
# (N - C) / 2 and (N + C) / 2 + 1, where N is the number of slopes and
# C = 1.96 * sqrt(Var(S)). This replaces "slope +/- 1.96 * intercept", which is
# not a confidence interval (the intercept is not a standard error).
# Memory grows with the square of the series length.
date_values = np.asarray(dates, dtype=float)
date_values = date_values[~np.isnan(date_values)]  # use only non-missing observations
if len(date_values) >= 2:
    date_slopes = np.sort(np.concatenate([
        (date_values[i + 1:] - date_values[i]) / np.arange(1, len(date_values) - i)
        for i in range(len(date_values) - 1)
    ]))
    slope_ranks = np.arange(1, len(date_slopes) + 1)
    rank_offset = 1.96 * np.sqrt(mk_result_dates.var_s)
    # np.interp clamps to the smallest/largest slope when a rank falls outside 1..N.
    confidence_interval = (
        float(np.interp((len(date_slopes) - rank_offset) / 2, slope_ranks, date_slopes)),
        float(np.interp((len(date_slopes) + rank_offset) / 2 + 1, slope_ranks, date_slopes)),
    )
else:
    confidence_interval = (float('nan'), float('nan'))
dof = sample_size - 1  # Degrees of Freedom (approximation)

# Print results
print("Mann-Kendall Test for Median Date:")
print(f"Trend: {mk_result_dates.trend}")
print(f"P-value: {mk_result_dates.p}")
print(f"Significance: {'Significant' if mk_result_dates.p < 0.05 else 'Not Significant'}")
print(f"Sen's Slope for Median Date: {mk_result_dates.slope}")
print(f"Sample Size: {sample_size}")
print(f"Test Statistic (Tau): {test_statistic}")
print(f"Effect Size: {effect_size}")
print(f"Confidence Interval (Approximate 95% CI): {confidence_interval}")
print(f"Degrees of Freedom (DOF): {dof}")

In [None]:
# Perform the Mann-Kendall Test on cluster sizes (clusters in chronological order).
mk_result_sizes = mk.original_test(sizes)

# Compute additional details
sample_size = len(sizes)
test_statistic = mk_result_sizes.Tau  # Kendall's Tau
effect_size = abs(mk_result_sizes.Tau)  # Effect size in Kendall's Tau (absolute value)

# Approximate 95% confidence interval for Sen's slope, using the rank-based
# method: sort all pairwise slopes and read the order statistics at ranks
# (N - C) / 2 and (N + C) / 2 + 1, where N is the number of slopes and
# C = 1.96 * sqrt(Var(S)). This replaces "slope +/- 1.96 * intercept", which is
# not a confidence interval (the intercept is not a standard error).
# Memory grows with the square of the series length.
size_values = np.asarray(sizes, dtype=float)
size_values = size_values[~np.isnan(size_values)]  # use only non-missing observations
if len(size_values) >= 2:
    size_slopes = np.sort(np.concatenate([
        (size_values[i + 1:] - size_values[i]) / np.arange(1, len(size_values) - i)
        for i in range(len(size_values) - 1)
    ]))
    slope_ranks = np.arange(1, len(size_slopes) + 1)
    rank_offset = 1.96 * np.sqrt(mk_result_sizes.var_s)
    # np.interp clamps to the smallest/largest slope when a rank falls outside 1..N.
    confidence_interval = (
        float(np.interp((len(size_slopes) - rank_offset) / 2, slope_ranks, size_slopes)),
        float(np.interp((len(size_slopes) + rank_offset) / 2 + 1, slope_ranks, size_slopes)),
    )
else:
    confidence_interval = (float('nan'), float('nan'))
dof = sample_size - 1  # Degrees of Freedom (approximation)

# Print results
print("Mann-Kendall Test for Cluster Size:")
print(f"Trend: {mk_result_sizes.trend}")
print(f"P-value: {mk_result_sizes.p}")
print(f"Significance: {'Significant' if mk_result_sizes.p < 0.05 else 'Not Significant'}")
print(f"Sen's Slope for Cluster Size: {mk_result_sizes.slope}")
print(f"Sample Size: {sample_size}")
print(f"Test Statistic (Tau): {test_statistic}")
print(f"Effect Size: {effect_size}")
print(f"Confidence Interval (Approximate 95% CI): {confidence_interval}")
print(f"Degrees of Freedom (DOF): {dof}")

In [None]:
# Perform the Mann-Kendall Test on the per-cluster mean of `var`
# (clusters in chronological order).
mk_result_return = mk.original_test(returns)

# Compute additional details
sample_size = len(returns)
test_statistic = mk_result_return.Tau  # Kendall's Tau correlation coefficient
effect_size = abs(mk_result_return.Tau)  # Effect size in Kendall's Tau (absolute value)

# Approximate 95% confidence interval for Sen's slope, using the rank-based
# method: sort all pairwise slopes and read the order statistics at ranks
# (N - C) / 2 and (N + C) / 2 + 1, where N is the number of slopes and
# C = 1.96 * sqrt(Var(S)). This replaces "slope +/- 1.96 * intercept", which is
# not a confidence interval (the intercept is not a standard error).
# Memory grows with the square of the series length.
return_values = np.asarray(returns, dtype=float)
return_values = return_values[~np.isnan(return_values)]  # use only non-missing observations
if len(return_values) >= 2:
    return_slopes = np.sort(np.concatenate([
        (return_values[i + 1:] - return_values[i]) / np.arange(1, len(return_values) - i)
        for i in range(len(return_values) - 1)
    ]))
    slope_ranks = np.arange(1, len(return_slopes) + 1)
    rank_offset = 1.96 * np.sqrt(mk_result_return.var_s)
    # np.interp clamps to the smallest/largest slope when a rank falls outside 1..N.
    confidence_interval = (
        float(np.interp((len(return_slopes) - rank_offset) / 2, slope_ranks, return_slopes)),
        float(np.interp((len(return_slopes) + rank_offset) / 2 + 1, slope_ranks, return_slopes)),
    )
else:
    confidence_interval = (float('nan'), float('nan'))
dof = sample_size - 1  # Degrees of Freedom (approximation)

# Print results
print("Mann-Kendall Test for Return Period:")
print(f"Trend: {mk_result_return.trend}")
print(f"P-value: {mk_result_return.p}")
print(f"Significance: {'Significant' if mk_result_return.p < 0.05 else 'Not Significant'}")
print(f"Sen's Slope for Return Period: {mk_result_return.slope}")
print(f"Sample Size: {sample_size}")
print(f"Test Statistic (Tau): {test_statistic}")
print(f"Effect Size: {effect_size}")
print(f"Confidence Interval (Approximate 95% CI): {confidence_interval}")
print(f"Degrees of Freedom (DOF): {dof}")

In [None]:
# Compile the cluster-level Mann-Kendall results into a single-row table.
# The return-period test is included alongside the other two; it was computed
# earlier in the notebook but previously left out of this summary.
cluster_tests = {
    "median_date": mk_result_dates,
    "cluster_size": mk_result_sizes,
    "return_period": mk_result_return,
}

trend_results = {}
for prefix, test_result in cluster_tests.items():
    trend_results[f"{prefix}_trend"] = test_result.trend
    trend_results[f"{prefix}_p-value"] = test_result.p
    trend_results[f"{prefix}_slope"] = test_result.slope
    trend_results[f"{prefix}_significant"] = test_result.p < 0.05

# Convert results to DataFrame.
# NOTE: this rebinds `results_df`, which earlier in the notebook held the
# per-county trend table.
results_df = pd.DataFrame([trend_results])