In [22]:
import mlflow
import pandas as pd

mlflow.set_experiment("Country_Clustering")

# Ensure any previously active run is ended
mlflow.end_run()

with mlflow.start_run(run_name="Load_Dataset"):
    df = pd.read_csv("Country-data.csv")

    mlflow.log_param("rows", df.shape[0])
    mlflow.log_param("columns", df.shape[1])

    # Save and log the dataset
    df.to_csv("country_data_logged.csv", index=False)
    mlflow.log_artifact("country_data_logged.csv")



In [None]:
import pandas as pd
import mlflow

# Load dataset
df = pd.read_csv("Country-data.csv")

# Start MLflow run
mlflow.set_experiment("Country_Clustering")

with mlflow.start_run(run_name="Data_Exploration"):

    # ✅ Log simple params
    mlflow.log_param("rows", df.shape[0])
    mlflow.log_param("columns", df.shape[1])
    mlflow.log_param("missing_values", df.isnull().sum().sum())
    mlflow.log_param("duplicates", df.duplicated().sum())

    # ✅ Save df.info() to a text file
    with open("df_info.txt", "w") as f:
        df.info(buf=f)

    # ✅ Save df.describe() to a CSV file
    df.describe().to_csv("df_describe.csv")

    # ✅ Log files as artifacts
    mlflow.log_artifact("df_info.txt")
    mlflow.log_artifact("df_describe.csv")



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import mlflow

mlflow.set_experiment("Country_Clustering")

with mlflow.start_run(run_name="EDA_Plots"):

    top_gdpp = df[['country','gdpp']].sort_values(by='gdpp', ascending=False).head(10)
    plt.figure(figsize=(10,6))
    plt.barh(top_gdpp['country'], top_gdpp['gdpp'], color='teal')
    plt.gca().invert_yaxis()
    plt.title("Top 10 Countries by GDP per Capita")
    plt.xlabel("GDP per Capita")
    plt.savefig("top_gdpp.png")  
    mlflow.log_artifact("top_gdpp.png")  
    plt.close()


    top_income = df[['country','income']].sort_values(by='income', ascending=False).head(10)
    plt.figure(figsize=(10,6))
    plt.barh(top_income['country'], top_income['income'], color='orange')
    plt.gca().invert_yaxis()
    plt.title("Top 10 Countries by Income")
    plt.xlabel("Income")
    plt.savefig("top_income.png")
    mlflow.log_artifact("top_income.png")
    plt.close()

    low_life = df[['country','life_expec']].sort_values(by='life_expec').head(10)
    plt.figure(figsize=(10,6))
    plt.barh(low_life['country'], low_life['life_expec'], color='red')
    plt.title("Bottom 10 Countries by Life Expectancy")
    plt.xlabel("Life Expectancy")
    plt.savefig("low_life.png")
    mlflow.log_artifact("low_life.png")
    plt.close()


    top_child_mort = df[['country','child_mort']].sort_values(by='child_mort', ascending=False).head(10)
    plt.figure(figsize=(10,6))
    plt.barh(top_child_mort['country'], top_child_mort['child_mort'], color='purple')
    plt.title("Top 10 Countries by Child Mortality")
    plt.xlabel("Child Mortality")
    plt.savefig("top_child_mort.png")
    mlflow.log_artifact("top_child_mort.png")
    plt.close()


In [28]:
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow

with mlflow.start_run(run_name="Correlation_Heatmap"):

    num_df = df.select_dtypes(include=['float64', 'int64'])

    plt.figure(figsize=(8,6))
    sns.heatmap(num_df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap (Numeric Columns)")

    heatmap_file = "correlation_heatmap.png"
    plt.savefig(heatmap_file)
    plt.close()  

    mlflow.log_artifact(heatmap_file)


In [None]:
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

mlflow.set_experiment("Country_Clustering")

# End any active run to avoid MLflow active run errors
mlflow.end_run()

with mlflow.start_run(run_name="Skewness_Analysis"):

    # Load dataset
    df = pd.read_csv("Country-data.csv")
    mlflow.log_param("rows", df.shape[0])
    mlflow.log_param("columns", df.shape[1])

    # Select numeric columns
    numeric_cols = df.select_dtypes(include=['float64','int64'])

    # Compute skewness before transformation
    skewness_before = numeric_cols.skew().sort_values(ascending=False)
    print("Skewness before log transform:\n", skewness_before)
    
    # Log skewness as metrics (optional, numeric only)
    for col, val in skewness_before.items():
        mlflow.log_metric(f"{col}_skew_before", val)

    # Columns to log-transform
    skewed_cols = ['inflation','exports','income','gdpp','imports','child_mort']
    df[skewed_cols] = df[skewed_cols].apply(lambda x: np.log1p(x))

    # Compute skewness after transformation
    skewness_after = df[skewed_cols].skew()
    print("\nSkewness after log transform:\n", skewness_after)

    # Log skewness after transformation
    for col, val in skewness_after.items():
        mlflow.log_metric(f"{col}_skew_after", val)

    # Plot histograms of numeric columns
    num_cols = df.select_dtypes(include=['float64','int64']).columns
    plt.figure(figsize=(15, 12))
    for i, col in enumerate(num_cols, 1):
        plt.subplot(4, 4, i)  # 4x4 grid for 13 columns
        sns.histplot(df[col], kde=True, bins=30, color="lightgreen")
        plt.title(f"{col}\nSkew={df[col].skew():.2f}")

    plt.tight_layout()
    
    # Save histogram figure and log to MLflow
    hist_file = "histograms.png"
    plt.savefig(hist_file)
    plt.close()
    mlflow.log_artifact(hist_file)



Skewness before log transform:
 inflation     5.154049
exports       2.445824
income        2.231480
gdpp          2.218051
imports       1.905276
child_mort    1.450774
total_fer     0.967092
health        0.705746
life_expec   -0.970996
dtype: float64


  result = getattr(ufunc, method)(*inputs, **kwargs)



Skewness after log transform:
 inflation    -1.300086
exports      -1.088961
income       -0.235823
gdpp          0.006548
imports      -1.822794
child_mort    0.066160
dtype: float64
✅ Skewness analysis and histograms logged to MLflow


In [41]:
from sklearn.preprocessing import StandardScaler
mlflow.end_run()

with mlflow.start_run(run_name="Scaling_Data"):

    # Scale numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[numeric_cols])
    df_scaled = pd.DataFrame(scaled_data, columns=numeric_cols)

    # Save and log scaled dataset
    scaled_file = "new_scaled.csv"
    df_scaled.to_csv(scaled_file, index=False)
    mlflow.log_artifact(scaled_file)


In [61]:
import mlflow
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# End any active MLflow run
mlflow.end_run()

with mlflow.start_run(run_name="PCA_Explained_Variance"):

    # Fit PCA on scaled data
    pca = PCA()
    pca.fit(scaled_data)
    cum_var = np.cumsum(pca.explained_variance_ratio_)

    # Plot cumulative explained variance
    plt.figure(figsize=(10,6))
    plt.plot(range(1, len(cum_var)+1), cum_var, marker='o', linestyle='-')
    plt.axhline(y=0.9, color='r', linestyle='--', label='90% Variance Threshold')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('PCA - Explained Variance vs Components')
    plt.legend()
    plt.grid(True)

    # Save plot and log as MLflow artifact
    pca_plot_file = "pca_cumulative_variance.png"
    plt.savefig(pca_plot_file)
    plt.close()
    mlflow.log_artifact(pca_plot_file)


In [62]:
import mlflow
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np

# End any active MLflow run
mlflow.end_run()

with mlflow.start_run(run_name="PCA_5_Loadings"):

    # Perform PCA with 5 components
    pca5 = PCA(n_components=5)
    X_pca5 = pca5.fit_transform(scaled_data)

    # Create loadings DataFrame
    loadings = pd.DataFrame(
        pca5.components_.T,
        columns=[f'PC{i+1}' for i in range(5)],
        index=df.drop(columns=['country']).columns
    )

    # Print loadings
    print(loadings)

    # Save loadings as CSV and log as artifact
    loadings_file = "pca5_loadings.csv"
    loadings.to_csv(loadings_file, index=True)
    mlflow.log_artifact(loadings_file)


                 PC1       PC2       PC3       PC4       PC5
child_mort  0.436729  0.079424 -0.042111 -0.011543  0.172663
exports    -0.244853  0.590777 -0.282626  0.112337  0.264484
health     -0.144216 -0.070811  0.733609  0.611466  0.066185
imports    -0.093832  0.751405  0.144931  0.093922 -0.307174
income     -0.422480 -0.097914 -0.221052  0.086442  0.328513
inflation   0.212637 -0.117902 -0.541510  0.755003 -0.256602
life_expec -0.402127 -0.174776 -0.073659 -0.027508 -0.275990
total_fer   0.392296  0.104261  0.053114  0.112371  0.603032
gdpp       -0.424742 -0.100723 -0.093904  0.116830  0.433492


🔹 Cluster 1 → Developed, High GDP & Health

High income, GDP, life expectancy

Low child mortality, low fertility

Example: USA, Germany, Japan

🔹 Cluster 2 → Poor, High Mortality & Fertility

Low GDP, low income, low life expectancy

High child mortality, high fertility

Example: Sub-Saharan African nations

🔹 Cluster 3 → Trade-Oriented Economies

High imports & exports (big contribution from PC2)

GDP may vary, but trade dominates economy

Example: Singapore, UAE, Netherlands

🔹 Cluster 4 → Middle-Income / Emerging Economies

Moderate GDP & income

Improving life expectancy

Still some child mortality & fertility issues

Example: India, Brazil, South Africa

🔹 Cluster 5 → High Inflation / Economic Instability

Strong loading from PC3 & PC5 (inflation factor)

Economy unstable → despite income, poor health balance

Example: Argentina, Venezuela

In [63]:
# Assuming you're still inside the MLflow run
num_components = X_pca5.shape[1]
print("Number of PCA components used:", num_components)



Number of PCA components used: 5


In [64]:
# End any active MLflow run
mlflow.end_run()

with mlflow.start_run(run_name="PCA_Top_Features"):

    # Perform PCA with 5 components (if not already done)
    from sklearn.decomposition import PCA
    import pandas as pd
    import numpy as np

    pca5 = PCA(n_components=5)
    X_pca5 = pca5.fit_transform(scaled_data)

    # Create loadings dataframe
    loadings = pd.DataFrame(
        pca5.components_.T,
        columns=[f'PC{i+1}' for i in range(5)],
        index=df.drop(columns=['country']).columns
    )

    # Function to get top features per PC
    def top_features_per_pc(loadings, n=3):
        top_features_dict = {}
        for pc in loadings.columns:
            top_feats = loadings[pc].abs().sort_values(ascending=False).head(n).index.tolist()
            top_features_dict[pc] = top_feats
            print(f"{pc}: {top_feats}")
        return top_features_dict

    # Show and log top 3 features for each PC
    top_feats_dict = top_features_per_pc(loadings, n=3)

    # Save top features as CSV and log as artifact
    top_feats_df = pd.DataFrame(top_feats_dict)
    top_feats_file = "pca_top_features.csv"
    top_feats_df.to_csv(top_feats_file, index=False)
    mlflow.log_artifact(top_feats_file)


PC1: ['child_mort', 'gdpp', 'income']
PC2: ['imports', 'exports', 'life_expec']
PC3: ['health', 'inflation', 'exports']
PC4: ['inflation', 'health', 'gdpp']
PC5: ['total_fer', 'gdpp', 'income']


In [65]:
import mlflow
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# End any active MLflow run
mlflow.end_run()

with mlflow.start_run(run_name="KMeans_Elbow_Plot"):

    # Compute SSE for different k
    sse = []
    K = range(2, 11)  # test clusters from 2 to 10
    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_pca5)
        sse.append(kmeans.inertia_)
        # Optionally log SSE for each k
        mlflow.log_metric(f"sse_k_{k}", kmeans.inertia_)

    # Plot elbow curve
    plt.figure(figsize=(8,5))
    plt.plot(K, sse, 'o-', color='blue')
    plt.xlabel("Number of clusters (k)")
    plt.ylabel("SSE (Inertia)")
    plt.title("Elbow Method for Optimal k")
    plt.grid(True)

    # Save and log plot as artifact
    elbow_plot_file = "kmeans_elbow_plot.png"
    plt.savefig(elbow_plot_file)
    plt.close()
    mlflow.log_artifact(elbow_plot_file)


In [67]:
import mlflow
from sklearn.neighbors import NearestNeighbors
import numpy as np
import matplotlib.pyplot as plt

# End any active MLflow run
mlflow.end_run()

with mlflow.start_run(run_name="KDistance_Plot"):

    # Fit NearestNeighbors
    neigh = NearestNeighbors(n_neighbors=5)
    nbrs = neigh.fit(scaled_data)
    distances, indices = nbrs.kneighbors(scaled_data)

    # Sort distances for 4th neighbor (k-distance)
    distances = np.sort(distances[:, 4])

    # Plot k-distance graph
    plt.figure(figsize=(8,5))
    plt.plot(distances, marker='o', linestyle='-')
    plt.xlabel("Points sorted by distance")
    plt.ylabel("k-distance")
    plt.title("k-distance Graph for DBSCAN")
    plt.grid(True)

    # Save and log plot as artifact
    kdist_plot_file = "k_distance_plot.png"
    plt.savefig(kdist_plot_file)
    plt.close()
    mlflow.log_artifact(kdist_plot_file)


In [68]:
import mlflow
from sklearn.cluster import DBSCAN
import numpy as np

# End any active MLflow run
mlflow.end_run()

with mlflow.start_run(run_name="DBSCAN_Cluster_Counts"):

    eps_values = [0.5, 1.0, 2.0, 3.0, 5.0]

    for eps in eps_values:
        db = DBSCAN(eps=eps, min_samples=5).fit(scaled_data)
        labels = db.labels_
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

        # Print for reference
        print(f"eps={eps} → clusters={n_clusters}")

        # Log number of clusters as MLflow metric
        mlflow.log_metric(f"dbscan_clusters_eps_{eps}", n_clusters)


eps=0.5 → clusters=0
eps=1.0 → clusters=2
eps=2.0 → clusters=1
eps=3.0 → clusters=1
eps=5.0 → clusters=1


In [69]:
import mlflow
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd

# End any active MLflow run
mlflow.end_run()

with mlflow.start_run(run_name="Clustering_Comparison"):

    # --- Define clustering algorithms ---
    kmeans = KMeans(n_clusters=5, random_state=42)
    hc = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='ward')
    dbscan = DBSCAN(eps=1.0, min_samples=5)

    # --- Fit & Predict ---
    labels = {
        "KMeans": kmeans.fit_predict(scaled_data),
        "Hierarchical": hc.fit_predict(scaled_data),
        "DBSCAN": dbscan.fit_predict(scaled_data),
    }

    # --- Reduce dimensions for visualization ---
    pca = PCA(n_components=2, random_state=42)
    data_2d = pca.fit_transform(scaled_data)

    # --- Plot clusters for each algorithm ---
    for algo_name, algo_labels in labels.items():
        plt.figure(figsize=(8,6))
        unique_labels = set(algo_labels)
        for lbl in unique_labels:
            cluster_points = data_2d[algo_labels == lbl]
            plt.scatter(cluster_points[:,0], cluster_points[:,1], label=f'Cluster {lbl}' if lbl != -1 else 'Noise', s=50)
        plt.title(f"{algo_name} Clustering (2D PCA)")
        plt.xlabel("PC1")
        plt.ylabel("PC2")
        plt.legend()
        plt.grid(True)

        # Save and log each cluster plot as MLflow artifact
        plot_file = f"{algo_name}_2D_clusters.png"
        plt.savefig(plot_file)
        plt.close()
        mlflow.log_artifact(plot_file)


In [70]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, adjusted_rand_score, davies_bouldin_score

# End any active MLflow run
mlflow.end_run()

with mlflow.start_run(run_name="Clustering_Comparison"):

    # --- Scale numeric data and handle NaNs ---
    numeric_cols = df.drop(columns=['country']).select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[numeric_cols])

    # Replace NaNs with column mean
    col_means = np.nanmean(scaled_data, axis=0)
    inds = np.where(np.isnan(scaled_data))
    scaled_data[inds] = np.take(col_means, inds[1])

    # --- KMeans ---
    kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
    kmeans_labels = kmeans.fit_predict(scaled_data)
    kmeans_sil = silhouette_score(scaled_data, kmeans_labels)
    kmeans_dbi = davies_bouldin_score(scaled_data, kmeans_labels)
    kmeans_clusters = len(set(kmeans_labels))
    
    mlflow.log_metric("KMeans_silhouette", kmeans_sil)
    mlflow.log_metric("KMeans_davies_bouldin", kmeans_dbi)

    # --- Hierarchical ---
    hc = AgglomerativeClustering(n_clusters=5, metric="euclidean", linkage="ward")
    hc_labels = hc.fit_predict(scaled_data)
    hc_sil = silhouette_score(scaled_data, hc_labels)
    hc_dbi = davies_bouldin_score(scaled_data, hc_labels)
    hc_clusters = len(set(hc_labels))
    hc_ari = adjusted_rand_score(kmeans_labels, hc_labels)

    mlflow.log_metric("Hierarchical_silhouette", hc_sil)
    mlflow.log_metric("Hierarchical_davies_bouldin", hc_dbi)
    mlflow.log_metric("Hierarchical_ARI_vs_KMeans", hc_ari)

    # --- DBSCAN ---
    dbscan = DBSCAN(eps=1.0, min_samples=5)
    db_labels = dbscan.fit_predict(scaled_data)
    db_clusters = len(set(db_labels)) - (1 if -1 in db_labels else 0)
    
    # Silhouette score cannot be computed if only 1 cluster or all noise
    if db_clusters > 1:
        db_sil = silhouette_score(scaled_data, db_labels)
        db_dbi = davies_bouldin_score(scaled_data, db_labels)
    else:
        db_sil = np.nan
        db_dbi = np.nan

    mlflow.log_metric("DBSCAN_silhouette", db_sil if not np.isnan(db_sil) else -1)
    mlflow.log_metric("DBSCAN_davies_bouldin", db_dbi if not np.isnan(db_dbi) else -1)

    # --- Compile results ---
    results = pd.DataFrame({
        "Method": ["KMeans", "Hierarchical", "DBSCAN"],
        "Clusters Found": [kmeans_clusters, hc_clusters, db_clusters],
        "Silhouette Score": [round(kmeans_sil, 3), round(hc_sil, 3), round(db_sil, 3) if not np.isnan(db_sil) else None],
        "Davies-Bouldin": [round(kmeans_dbi, 3), round(hc_dbi, 3), round(db_dbi, 3) if not np.isnan(db_dbi) else None],
        "ARI_vs_KMeans": [1.0, round(hc_ari, 3), None]
    })

    # Log results as CSV artifact
    results_file = "clustering_results.csv"
    results.to_csv(results_file, index=False)
    mlflow.log_artifact(results_file)

    print(results)


         Method  Clusters Found  Silhouette Score  Davies-Bouldin  \
0        KMeans               5             0.232           1.133   
1  Hierarchical               5             0.199           1.225   
2        DBSCAN               2            -0.014           1.461   

   ARI_vs_KMeans  
0          1.000  
1          0.491  
2            NaN  


Rich & healthy

Poor & unhealthy

Trade-heavy

Emerging middle

Inflation/unstable