In [None]:
import os
import numpy as np
import pandas as pd
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.clustering import TimeSeriesKMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from tslearn.metrics import dtw

In [None]:
# Font configuration for English display: matplotlib tries the listed
# families in order and uses the first one it can find.
plt.rcParams["font.family"] = ["DejaVu Sans", "Arial", "sans-serif"]
plt.rcParams['axes.unicode_minus'] = False  # Ensure minus sign displays correctly

# Data folder path (one "<year>.csv" file per year is read from it).
# The location can be overridden without editing the notebook by setting the
# COUNTRY_DATA_DIR environment variable; the original path stays the default.
data_folder = os.environ.get("COUNTRY_DATA_DIR", r"D:\country_datatable\2003-2022")

In [None]:
def load_country_data(folder_path):
    """Load per-year CSV files and assemble one time series per country.

    Every ``*.csv`` file in ``folder_path`` must be named ``<year>.csv`` (the
    text before the first dot is parsed as an integer year). Each file needs a
    ``COUNTRY`` and an ``ISO`` column; the first two columns are treated as
    identifiers and every column from the third onward as an indicator.

    The country list, the indicator names and the ISO codes are taken from the
    earliest year's file only, so a country that first appears in a later file
    is not included. Rows of the first file without a ``COUNTRY`` value are
    skipped. If a country occurs several times in one file, its first row wins.

    Args:
        folder_path: Directory containing the yearly CSV files.

    Returns:
        Tuple ``(country_data, years, indicator_names, country_iso_map)``:
        ``country_data`` maps country -> float array of shape
        ``(n_years, n_indicators)`` with NaN where a country has no row for a
        year; ``years`` is the sorted list of years; ``indicator_names`` is the
        list of indicator column names; ``country_iso_map`` maps country -> ISO.

    Raises:
        FileNotFoundError: If the folder contains no CSV file.
        ValueError: If a CSV file name does not start with an integer year.
    """
    # Get all CSV files (sorted by year)
    csv_files = sorted(
        (f for f in os.listdir(folder_path) if f.endswith('.csv')),
        key=lambda name: int(name.split('.')[0])
    )

    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in folder {folder_path}")

    # Extract year information
    years = [int(f.split('.')[0]) for f in csv_files]
    print(f"Years found in data: {years}")

    # Parse every file exactly once. Re-reading each file for every country
    # costs (countries x files) parses instead of (files).
    yearly_frames = [pd.read_csv(os.path.join(folder_path, f)) for f in csv_files]

    # The first (earliest) file defines the country list and indicator names
    df_first = yearly_frames[0]
    all_countries = df_first['COUNTRY'].dropna().unique()
    indicator_names = df_first.columns[2:].tolist()  # Columns from 3rd are indicators
    print(f"Total {len(all_countries)} countries/regions, {len(indicator_names)} indicators")

    # ISO code of each country (assumed to be the same for all years)
    first_rows = df_first.dropna(subset=['COUNTRY']).drop_duplicates(subset='COUNTRY')
    country_iso_map = dict(zip(first_rows['COUNTRY'], first_rows['ISO']))

    # One (years x indicators) matrix per country, NaN until a value is seen
    country_data = {
        country: np.full((len(years), len(indicator_names)), np.nan)
        for country in all_countries
    }

    for year_idx, df in enumerate(yearly_frames):
        # Keep only the first row of each country, matching a `.iloc[0]` lookup
        rows = df.drop_duplicates(subset='COUNTRY')
        values = rows.iloc[:, 2:].to_numpy()
        for country, row in zip(rows['COUNTRY'], values):
            series = country_data.get(country)
            if series is not None:  # ignore countries absent from the first file
                series[year_idx] = row

    return country_data, years, indicator_names, country_iso_map


def preprocess_data(country_data):
    """Stack the per-country series into one array, replacing NaN with 0.

    Args:
        country_data: Mapping of country -> 2-D array (time points x indicators).

    Returns:
        Tuple ``(data, countries)`` where ``data`` stacks the zero-filled series
        in the mapping's iteration order and ``countries`` lists the keys in
        that same order.
    """
    # Keys and stacked rows share the dict's iteration order
    countries = list(country_data.keys())

    # nan_to_num returns a copy, so the caller's arrays are left untouched
    zero_filled = [np.nan_to_num(country_data[name], nan=0.0) for name in countries]

    print(f"Number of countries after processing: {len(countries)}")
    return np.array(zero_filled), countries


def find_optimal_clusters(data, k_values=(2, 3, 4, 5, 6)):
    """Compute DTW-based silhouette scores for several cluster counts.

    For every ``k`` a ``TimeSeriesKMeans`` model (DTW metric, fixed random
    state) is fitted on ``data`` and scored with the silhouette coefficient on
    a precomputed pairwise DTW distance matrix. The scores are also plotted
    and saved to ``silhouette_scores.png`` in the current working directory.

    NOTE(review): ``data`` is clustered as passed in. ``perform_clustering``
    standardizes its input before clustering, so scores computed here on
    unscaled data are not directly comparable with the scores it prints.

    Args:
        data: Sequence of time series; ``data[i]`` is handed to ``dtw`` as is.
        k_values: Cluster counts to evaluate. Defaults to 2 through 6.

    Returns:
        Tuple ``(k_values, silhouette_scores)`` of two equally long lists.
    """
    k_values = list(k_values)
    n_samples = len(data)

    # Calculate DTW distance matrix (for silhouette score). It depends only on
    # the data, not on k, so it is built once instead of once per k.
    distance_matrix = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            dist = dtw(data[i], data[j])
            # DTW is symmetric: fill both triangles from one computation
            distance_matrix[i, j] = dist
            distance_matrix[j, i] = dist

    silhouette_scores = []
    for k in k_values:
        kmeans = TimeSeriesKMeans(
            n_clusters=k,
            metric="dtw",
            max_iter=100,
            random_state=42
        )
        labels = kmeans.fit_predict(data)

        # Calculate silhouette score
        score = silhouette_score(distance_matrix, labels, metric="precomputed")
        silhouette_scores.append(score)
        print(f"Number of clusters k={k}, Silhouette Score: {score:.4f}")

    # Plot silhouette scores (x axis follows k_values rather than a second
    # hard-coded list, so both always agree)
    plt.figure(figsize=(10, 6))
    plt.plot(k_values, silhouette_scores, 'o-')
    plt.xlabel('Number of clusters k')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Scores for Different Cluster Numbers')
    plt.grid(True)
    plt.savefig('silhouette_scores.png')
    plt.close()

    return k_values, silhouette_scores


def perform_clustering(data, n_clusters):
    """Standardize the series, cluster them with DTW k-means and report quality.

    Args:
        data: Time series dataset passed to ``TimeSeriesScalerMeanVariance``.
        n_clusters: Number of clusters to fit.

    Returns:
        Tuple ``(kmeans, scaled_data, labels)``: the fitted model, the
        standardized data it was fitted on, and the label of each series.
    """
    # Standardize first, then cluster the standardized series
    scaled_data = TimeSeriesScalerMeanVariance().fit_transform(data)

    # DTW k-means; n_jobs=-1 spreads the work over all CPU cores
    kmeans = TimeSeriesKMeans(
        n_clusters=n_clusters,
        metric="dtw",
        max_iter=100,
        random_state=42,
        n_jobs=-1
    )
    labels = kmeans.fit_predict(scaled_data)

    # Pairwise DTW distances for the silhouette score: visit each unordered
    # pair (row < col) once and mirror the value across the diagonal
    n_series = len(scaled_data)
    distance_matrix = np.zeros((n_series, n_series))
    for row, col in zip(*np.triu_indices(n_series, k=1)):
        pair_distance = dtw(scaled_data[row], scaled_data[col])
        distance_matrix[row, col] = pair_distance
        distance_matrix[col, row] = pair_distance

    silhouette_avg = silhouette_score(distance_matrix, labels, metric="precomputed")
    print(f"Final clustering results (k={n_clusters}): Silhouette Score = {silhouette_avg:.4f}")

    return kmeans, scaled_data, labels


def _plot_cluster_scatter(data, labels, countries, n_clusters, k):
    """Save a 2-D t-SNE scatter plot of all samples, one colour per cluster."""
    # Flatten each (time steps x features) series into one feature vector
    n_samples, n_timesteps, n_features = data.shape
    data_2d = data.reshape(n_samples, n_timesteps * n_features)

    # scikit-learn rejects a perplexity that is not smaller than the number of
    # samples, so cap it for small datasets (unchanged for more than 10 samples)
    perplexity = min(10, max(1, n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    data_tsne = tsne.fit_transform(data_2d)

    plt.figure(figsize=(12, 10))
    for i in range(n_clusters):
        cluster_indices = np.where(labels == i)[0]
        plt.scatter(
            data_tsne[cluster_indices, 0],
            data_tsne[cluster_indices, 1],
            label=f'Cluster {i} ({len(cluster_indices)} countries/regions)',
            alpha=0.7, s=80
        )

        # Add labels for some example countries in each cluster (first three)
        for idx in cluster_indices[:3]:
            plt.annotate(
                countries[idx],
                (data_tsne[idx, 0], data_tsne[idx, 1]),
                fontsize=9,
                alpha=0.8
            )

    plt.title(f'Country Clustering Results Based on All Indicators (k={k}, t-SNE Dimensionality Reduction)')
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.legend()
    plt.grid(True)
    plt.savefig(f'country_clusters_k{k}.png', dpi=300, bbox_inches='tight')
    plt.close()


def _plot_cluster_trends(kmeans, years, indicator_names, n_clusters, k):
    """Save one subplot per cluster centre showing its most variable indicators."""
    plt.figure(figsize=(15, 5 * n_clusters))

    for i in range(n_clusters):
        plt.subplot(n_clusters, 1, i + 1)
        cluster_center = kmeans.cluster_centers_[i]

        # Select the (up to) 3 indicators whose centre values vary most over time
        variances = np.var(cluster_center, axis=0)
        top_indices = np.argsort(variances)[-3:]

        for j in top_indices:
            plt.plot(years, cluster_center[:, j],
                     label=indicator_names[j], linewidth=2)

        plt.title(f'Key Indicator Trends for Cluster {i}')
        plt.xlabel('Year')
        plt.ylabel('Standardized Value')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.savefig(f'cluster_trends_k{k}.png', dpi=300, bbox_inches='tight')
    plt.close()


def _save_cluster_tables(labels, countries, country_iso_map, n_clusters, k):
    """Write the cluster assignment as a CSV table and as a per-cluster text list."""
    cluster_results = pd.DataFrame({
        'COUNTRY': countries,
        'ISO': [country_iso_map[country] for country in countries],
        'CLUSTER': labels
    })

    csv_path = f'country_clusters_k{k}.csv'
    cluster_results.to_csv(csv_path, index=False, encoding='utf-8-sig')
    print(f"Clustering results saved to {csv_path}")

    # Group country names by cluster for the human-readable summary
    cluster_groups = {
        i: cluster_results.loc[cluster_results['CLUSTER'] == i, 'COUNTRY'].tolist()
        for i in range(n_clusters)
    }

    with open(f'cluster_results_k{k}.txt', 'w', encoding='utf-8') as f:
        f.write(f"Clustering Results (k={k}):\n\n")
        for i, countries_list in cluster_groups.items():
            f.write(f"Cluster {i} ({len(countries_list)} countries/regions):\n")
            # str() keeps the join working if a country name is not a string
            f.write(", ".join(map(str, countries_list)) + "\n\n")

    print(f"Clustering details saved to cluster_results_k{k}.txt")


def visualize_clusters(kmeans, data, labels, countries, years, indicator_names, country_iso_map, k):
    """Visualize clustering results and save clustering results to CSV.

    Writes four files into the current working directory:
    ``country_clusters_k{k}.png`` (t-SNE scatter plot),
    ``cluster_trends_k{k}.png`` (cluster-centre indicator trends),
    ``country_clusters_k{k}.csv`` (COUNTRY / ISO / CLUSTER table) and
    ``cluster_results_k{k}.txt`` (country names listed per cluster).

    Args:
        kmeans: Fitted model exposing ``cluster_centers_``; each centre is
            indexed as ``[time step, indicator]``.
        data: 3-D array of shape (samples, time steps, features) that was clustered.
        labels: Cluster label of each sample, aligned with ``countries``.
        countries: Country names, one per sample.
        years: X-axis values for the trend plots, one per time step.
        indicator_names: Indicator names, one per feature.
        country_iso_map: Mapping of country -> ISO code.
        k: Cluster count used in titles and output file names.
    """
    n_clusters = len(kmeans.cluster_centers_)
    # Element-wise `labels == i` comparisons need an array, not a plain list
    labels = np.asarray(labels)

    _plot_cluster_scatter(data, labels, countries, n_clusters, k)
    _plot_cluster_trends(kmeans, years, indicator_names, n_clusters, k)
    _save_cluster_tables(labels, countries, country_iso_map, n_clusters, k)

    print(f"Visualization charts saved to country_clusters_k{k}.png and cluster_trends_k{k}.png")


def main():
    """Run the whole pipeline: load, preprocess, score each k, cluster and export.

    Any exception raised along the way is caught, reported with its stack
    trace, and not re-raised.
    """
    try:
        # Step 1: read the yearly CSV files from the configured folder
        print("Loading data...")
        loaded = load_country_data(data_folder)
        country_data, years, indicator_names, country_iso_map = loaded

        # Step 2: fill missing values with 0 and stack into one array
        print("Preprocessing data...")
        processed_data, valid_countries = preprocess_data(country_data)
        print(f"Processed data shape: {processed_data.shape}")

        # Step 3: silhouette score for every candidate cluster count
        print("Calculating silhouette scores for different cluster numbers...")
        k_values, silhouette_scores = find_optimal_clusters(processed_data)

        # Step 4: cluster, plot and export once per candidate cluster count
        for n_clusters in k_values:
            print(f"\n===== Starting clustering analysis (k={n_clusters}) =====")
            model, scaled_data, labels = perform_clustering(processed_data, n_clusters)
            visualize_clusters(
                model, scaled_data, labels, valid_countries,
                years, indicator_names, country_iso_map, n_clusters,
            )

        print("\nAll clustering analyses completed!")
        print("Silhouette Score Summary:")
        for n_clusters, score in zip(k_values, silhouette_scores):
            print(f"k={n_clusters}: Silhouette Score = {score:.4f}")

    except Exception as exc:
        # Imported lazily: only needed on the failure path
        import traceback

        print(f"Error during program execution: {exc}")
        # Print detailed error stack trace for debugging
        traceback.print_exc()


# Entry point: run the full pipeline only when this code is executed directly
# (as a script or in a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()