# In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.utils import resample
from pandas.plotting import parallel_coordinates

# Load data
# os.path.abspath('') resolves to the current working directory; the CSV is
# expected in a "data" sub-folder of it.
notebook_dir = os.path.abspath('')
csv_path = os.path.join(notebook_dir, "data", "WVS_Cross-National_Wave_7_csv_v6_0.csv")
# low_memory=False parses the file in one pass so that each column gets a
# single inferred dtype instead of chunk-by-chunk guesses.
df = pd.read_csv(csv_path, low_memory=False)

# Data cleaning
# Drop every column in which more than 50% of the rows are missing.
missing_percentage = df.isnull().sum() / len(df) * 100
columns_to_drop = missing_percentage[missing_percentage > 50].index
df = df.drop(columns=columns_to_drop)

numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# NOTE(review): only NaN counts as "missing" in the steps below. If this
# export encodes non-response with sentinel values (e.g. negative codes),
# those are treated as real answers by the imputation and scaling — confirm
# against the dataset's codebook.

# Impute numeric columns with their median.
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Impute text columns with their most frequent value. mode() yields an empty
# frame when there is nothing to summarise, and .iloc[0] on it would raise
# IndexError, so the step is skipped in that case.
if not categorical_cols.empty:
    category_modes = df[categorical_cols].mode()
    if not category_modes.empty:
        df[categorical_cols] = df[categorical_cols].fillna(category_modes.iloc[0])

# One-hot encode the text columns; drop_first=True omits the first level of
# each column so the indicator columns are not redundant.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Scale numerical columns
# Standardize each numeric column to zero mean and unit variance.
# StandardScaler rejects an input with zero columns, hence the guard.
scaler = StandardScaler()
if not numerical_cols.empty:
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Segmentation
# Survey items used as clustering features.
# NOTE(review): presumably these are numeric columns and were therefore
# standardized by the scaling step — confirm.
relevant_columns = ['Q46', 'Q47', 'Q48', 'Q49', 'Q50', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5']
df_segmentation = df[relevant_columns].copy()  # Explicitly create a copy to avoid the warning

# Frozen copy of the features. Label columns are appended to df_segmentation
# below, and every model must be fit on the survey items only.
feature_matrix = df_segmentation[relevant_columns]

# Clustering
# Score k = 2..10 on a fixed-size sample so the silhouette computation stays
# affordable. resample() draws with replacement by default.
df_sample = resample(df_segmentation, n_samples=10000, random_state=42)
k_values = range(2, 11)
inertia = []
silhouette_scores = []

for k in k_values:
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=42, n_init=1)
    kmeans.fit(df_sample)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(df_sample, kmeans.labels_))

# Fixed choice: the inertia / silhouette curves collected above are not
# consulted when setting this value.
optimal_k = 4
kmeans = MiniBatchKMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(feature_matrix)
df_segmentation['Cluster'] = kmeans.labels_

# Fit DBSCAN on the features only. Fitting on df_segmentation at this point
# would feed the K-Means label in as an extra feature and distort distances.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(feature_matrix)
df_segmentation['DBSCAN_Cluster'] = dbscan_labels

# Evaluation
# Score both labelings on the feature columns only (label columns removed).
evaluation_features = df_segmentation.drop(columns=['Cluster', 'DBSCAN_Cluster'])

kmeans_silhouette = silhouette_score(evaluation_features, df_segmentation['Cluster'])
kmeans_db = davies_bouldin_score(evaluation_features, df_segmentation['Cluster'])

# Both metrics require 2 <= n_labels <= n_samples - 1 and raise ValueError
# otherwise. DBSCAN gives no such guarantee (it can put every point in one
# group), so fall back to NaN instead of crashing and always define both names.
# NOTE(review): rows labelled -1 by DBSCAN are scored as one more group here;
# consider excluding them if noise should not count as a cluster.
dbscan_label_count = df_segmentation['DBSCAN_Cluster'].nunique()
if 2 <= dbscan_label_count < len(df_segmentation):
    dbscan_silhouette = silhouette_score(evaluation_features, df_segmentation['DBSCAN_Cluster'])
    dbscan_db = davies_bouldin_score(evaluation_features, df_segmentation['DBSCAN_Cluster'])
else:
    dbscan_silhouette = np.nan
    dbscan_db = np.nan

# Visualization
# Build the embedding input once, before any projection columns are written
# back. Dropping only the label columns after PCA1/PCA2 have been added would
# feed the PCA coordinates into t-SNE as extra features. errors='ignore'
# keeps the step idempotent when this section is re-run on a frame that
# already carries embedding columns.
derived_columns = ['Cluster', 'DBSCAN_Cluster', 'PCA1', 'PCA2', 'TSNE1', 'TSNE2']
embedding_inputs = df_segmentation.drop(columns=derived_columns, errors='ignore')

# Linear 2-D projection.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(embedding_inputs)
df_segmentation['PCA1'] = pca_result[:, 0]
df_segmentation['PCA2'] = pca_result[:, 1]

# Non-linear 2-D embedding of the same inputs; seeded for reproducibility.
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(embedding_inputs)
df_segmentation['TSNE1'] = tsne_result[:, 0]
df_segmentation['TSNE2'] = tsne_result[:, 1]

# Save results
clustered_output_path = 'clustered_wvs_data.csv'
summary_output_path = 'cluster_summary.csv'

# Row-level export: every column currently in df_segmentation, without the
# DataFrame index.
df_segmentation.to_csv(clustered_output_path, index=False)

# Per-cluster export: mean of every other column for each K-Means label; the
# label becomes the index and is written as the first CSV column.
rows_by_cluster = df_segmentation.groupby('Cluster')
cluster_summary = rows_by_cluster.mean()
cluster_summary.to_csv(summary_output_path)

# Plotting
# K-Means labels drawn in each of the two 2-D embeddings.
embedding_views = (
    ('PCA1', 'PCA2', 'K-Means Clusters (PCA)'),
    ('TSNE1', 'TSNE2', 'K-Means Clusters (t-SNE)'),
)
for x_column, y_column, view_title in embedding_views:
    _, scatter_ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(
        data=df_segmentation,
        x=x_column,
        y=y_column,
        hue='Cluster',
        palette='viridis',
        ax=scatter_ax,
    )
    scatter_ax.set_title(view_title)
    plt.show()

# Parallel-coordinates view, one line per row, coloured by K-Means label.
# NOTE(review): apart from Q1, the axes are embedding coordinates and the
# DBSCAN label rather than survey items, which does not match the
# "Top Features" title — confirm this column choice is intended.
parallel_axes = ['TSNE1', 'TSNE2', 'DBSCAN_Cluster', 'Q1', 'PCA1']
_, parallel_ax = plt.subplots(figsize=(12, 8))
parallel_coordinates(
    df_segmentation[parallel_axes + ['Cluster']],
    'Cluster',
    colormap='viridis',
    ax=parallel_ax,
)
parallel_ax.set_title('Parallel Coordinates Plot of Top Features Across Clusters')
plt.show()

# Number of rows assigned to each K-Means cluster.
_, count_ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    data=df_segmentation,
    x='Cluster',
    hue='Cluster',
    palette='viridis',
    legend=False,
    ax=count_ax,
)
count_ax.set_title('Distribution of Individuals Across Clusters')
plt.show()

# Sociological insights
# Hand-assigned interpretation per K-Means label:
# (description, urban/rural, religiosity).
# NOTE(review): these labels are hard-coded, not derived from the data, and
# the 'Income' / 'Education Level' figures are simply the cluster means of
# Q46 / Q47 — confirm against the codebook that those items measure income
# and education, and that the labels match the fitted clusters.
segment_labels = {
    0: ('Rural Poor', 'Rural', 'High'),
    1: ('Urban Middle Class', 'Urban', 'Moderate'),
    2: ('Elite', 'Urban', 'Low'),
}
# Used for any cluster without a hand-assigned label, so every cluster in the
# summary is reported instead of being silently skipped.
unlabelled_segment = ('Unlabelled Segment', 'Unknown', 'Unknown')

# Build one entry per cluster actually present in cluster_summary. This
# covers all fitted clusters and avoids a KeyError for an absent label.
sociological_insights = {}
for cluster_id in cluster_summary.index:
    description, settlement, religiosity = segment_labels.get(cluster_id, unlabelled_segment)
    sociological_insights[cluster_id] = {
        'description': description,
        'characteristics': {
            'Income': cluster_summary.loc[cluster_id, 'Q46'],
            'Education Level': cluster_summary.loc[cluster_id, 'Q47'],
            'Urban/Rural': settlement,
            'Religiosity': religiosity,
        },
    }

for cluster_id, insights in sociological_insights.items():
    print(f"Cluster {cluster_id}: {insights['description']}")
    for key, value in insights['characteristics'].items():
        print(f" - {key}: {value}")
    print()

# Global benchmarks
# Reference values hard-coded here (not loaded from any data source).
global_benchmarks = pd.DataFrame({
    'Segment': ['Low Income', 'Middle Class', 'Upper Class'],
    'Avg_Income': [5000, 20000, 50000],
    'Avg_Education_Level': [2, 4, 6]
})

# Per-cluster mean of Q46 as a small tidy frame for plotting.
cluster_income = pd.DataFrame({
    'Cluster': cluster_summary.index.astype(str),
    'Q46': cluster_summary['Q46'].to_numpy(),
})

# The two series have different categories and different value scales, so
# they are drawn in side-by-side panels with independent y axes. Overlaying
# them on one axes stacks unrelated bars at the same x positions, and a
# labels-only legend binds both labels to bars of the first series.
# NOTE(review): Q46 in cluster_summary is averaged after the scaling step, so
# it is presumably in standardized units rather than currency; and no
# Kazakhstan-only filter is visible in this script — confirm both.
fig, (benchmark_ax, cluster_ax) = plt.subplots(1, 2, figsize=(10, 6))

sns.barplot(x='Segment', y='Avg_Income', data=global_benchmarks, color='blue', alpha=0.5, ax=benchmark_ax)
benchmark_ax.set_title('Global Benchmarks')
benchmark_ax.set_xlabel('Segment')
benchmark_ax.set_ylabel('Average Income')

sns.barplot(x='Cluster', y='Q46', data=cluster_income, color='orange', alpha=0.7, ax=cluster_ax)
cluster_ax.set_title('Kazakhstan Clusters')
cluster_ax.set_xlabel('Cluster')
cluster_ax.set_ylabel('Mean Q46')

fig.suptitle('Comparison of Income Levels: Kazakhstan Clusters vs Global Benchmarks')
fig.tight_layout()
plt.show()