In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score
import json
import os

# Notebook-wide defaults: a larger default figure size for matplotlib and a
# fixed seed for NumPy's global random number generator.
plt.rcParams.update({'figure.figsize': (14, 8)})
np.random.seed(42)

In [None]:
# Load the three homework CSV files, print a quick profile of each one and
# record which numeric columns will be used as clustering features.
# Result: data['ds<i>'] = {'df': <raw frame>, 'numeric_cols': <feature names>}.
data = {}
for i in range(1, 4):
    df = pd.read_csv(f'data/S07-hw-dataset-0{i}.csv')
    print(f'\nDataset {i}:')
    print(f'  Shape: {df.shape}')
    print(f'  Head:\n{df.head()}')
    print(f'  Info:')
    df.info()
    print(f'  Describe:\n{df.describe()}')
    print(f'  Missing values:\n{df.isnull().sum()}')

    # 'sample_id' is exported later as an identifier, so it is kept out of the
    # feature list even when pandas parses it as a numeric column.
    numeric_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col != 'sample_id'
    ]

    data[f'ds{i}'] = {'df': df, 'numeric_cols': numeric_cols}
    print(f'  Numeric cols: {numeric_cols}')

In [None]:
# Impute (median) and standardise the numeric features of every dataset.
# Result: preprocessed[name] = {'X': scaled frame, 'sample_id': ids, 'pipeline': fitted pipeline}.
preprocessed = {}

for ds_name, ds_data in data.items():
    df = ds_data['df']
    numeric_cols = ds_data['numeric_cols']

    # Keep the identifiers next to the features so labels can be exported
    # later; fall back to the row index when there is no 'sample_id' column.
    sample_id = df['sample_id'] if 'sample_id' in df.columns else df.index
    X = df[numeric_cols].copy()

    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # fit_transform returns a bare array; reattach the column names and the
    # original row index so X_processed stays aligned with sample_id even if
    # the source frame does not use a default RangeIndex.
    X_processed = pd.DataFrame(
        pipeline.fit_transform(X),
        columns=numeric_cols,
        index=X.index
    )

    preprocessed[ds_name] = {
        'X': X_processed,
        'sample_id': sample_id,
        'pipeline': pipeline
    }

    print(f'{ds_name}: processed {X_processed.shape}')

In [None]:
# Sweep k = 2..20 for KMeans on every dataset, score each fit with silhouette
# and Davies-Bouldin, and keep the k with the highest silhouette score.
kmeans_results = {}

for ds_name, prep_data in preprocessed.items():
    X = prep_data['X']
    print(f'\n{ds_name} - KMeans:')

    k_range = range(2, 21)
    models = []
    sil_scores = []
    db_scores = []

    for k in k_range:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(X)
        sil = silhouette_score(X, labels)
        db = davies_bouldin_score(X, labels)
        print(f'  k={k:2d}: sil={sil:.4f}, db={db:.4f}')

        models.append(km)
        sil_scores.append(sil)
        db_scores.append(db)

    # argmax returns the first maximum, so ties resolve to the smallest k.
    best_k_idx = np.argmax(sil_scores)
    best_k = k_range[best_k_idx]
    print(f'  Best k: {best_k}')

    kmeans_results[ds_name] = {
        'best_k': best_k,
        'best_model': models[best_k_idx],
        'sil_scores': sil_scores,
        'db_scores': db_scores,
        'k_range': list(k_range),
        'models': models
    }

In [None]:
# Silhouette-vs-k curve for each dataset, with the selected k highlighted.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle('KMeans: Silhouette vs k', fontsize=14, fontweight='bold')

for idx, (ds_name, res) in enumerate(kmeans_results.items()):
    ax = axes[idx]
    k_range = res['k_range']
    sil_scores = res['sil_scores']
    best_k = res['best_k']
    # Find the score by the position of best_k inside k_range instead of
    # assuming the sweep starts at k=2 (the former `best_k - 2` offset).
    best_sil = sil_scores[k_range.index(best_k)]

    ax.plot(k_range, sil_scores, 'b-o', linewidth=2, markersize=6)
    ax.axvline(best_k, color='r', linestyle='--', linewidth=2, label=f'best k={best_k}')
    ax.scatter([best_k], [best_sil], color='r', s=200, zorder=5)

    ax.set_xlabel('k')
    ax.set_ylabel('Silhouette Score')
    ax.set_title(ds_name)
    ax.grid(True, alpha=0.3)
    ax.legend()

plt.tight_layout()
os.makedirs('artifacts/figures', exist_ok=True)
plt.savefig('artifacts/figures/kmeans_silhouette_analysis.png', dpi=100)
plt.show()

print('сохранено: kmeans_silhouette_analysis.png')

In [None]:
# Grid-search DBSCAN over eps x min_samples for every dataset and keep the
# configuration with the best (sub-sampled) silhouette score.
# Result: dbscan_results[name] holds the chosen parameters, the fitted model,
# cluster/noise counts, its silhouette and the full list of grid results.
dbscan_results = {}

for ds_name, prep_data in preprocessed.items():
    X = prep_data['X']
    print(f'\n{ds_name} - DBSCAN:')

    eps_range = np.linspace(0.2, 2.0, 10)
    min_samples_values = [3, 5, 10]
    all_results = []

    for min_samples in min_samples_values:
        for eps in eps_range:
            db = DBSCAN(eps=eps, min_samples=min_samples)
            labels = db.fit_predict(X)

            # DBSCAN marks noise with the label -1; it is not a cluster.
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            n_noise = int(np.sum(labels == -1))

            # -1 doubles as the "no valid score" sentinel.
            # NOTE(review): noise points are passed to silhouette_score with
            # their -1 label, i.e. they are scored as if they were a cluster.
            sil = -1
            if n_clusters > 0:
                try:
                    # Score on at most 500 points; a fixed random_state makes
                    # the sub-sample independent of the global RNG state, so
                    # re-running this cell reproduces the same selection.
                    sil = silhouette_score(
                        X,
                        labels,
                        sample_size=min(500, X.shape[0]),
                        random_state=42
                    )
                except ValueError:
                    # Raised when the (sub-sampled) labels do not contain a
                    # valid number of distinct values for the silhouette.
                    sil = -1

            all_results.append({
                'eps': eps,
                'min_samples': min_samples,
                'n_clusters': n_clusters,
                'n_noise': n_noise,
                'sil': sil,
                'model': db
            })

    # Prefer the best positive silhouette; if no configuration achieved one,
    # fall back to the configuration that produced the most clusters.
    valid_results = [r for r in all_results if r['sil'] > 0]
    if valid_results:
        best_result = max(valid_results, key=lambda x: x['sil'])
    else:
        best_result = max(all_results, key=lambda x: x['n_clusters'])

    dbscan_results[ds_name] = {
        'best_params': {'eps': best_result['eps'], 'min_samples': best_result['min_samples']},
        'best_model': best_result['model'],
        'n_clusters': best_result['n_clusters'],
        'n_noise': best_result['n_noise'],
        'sil': best_result['sil'],
        'all_results': all_results
    }

    print(f'  Лучшие параметры: eps={best_result["eps"]:.2f}, min_samples={best_result["min_samples"]}')
    print(f'  Кластеры: {best_result["n_clusters"]}, Шум: {best_result["n_noise"]}')

In [None]:
# Internal quality metrics (silhouette, Davies-Bouldin, Calinski-Harabasz) for
# the selected KMeans and DBSCAN model of every dataset.
metrics = {}

for ds_name, prep_data in preprocessed.items():
    X = prep_data['X']
    print(f'\n{ds_name} - Metrics:')

    # The selected model was already fitted on this X during the k sweep, so
    # its labels are read directly instead of refitting it.
    km_model = kmeans_results[ds_name]['best_model']
    km_labels = km_model.labels_

    km_sil = silhouette_score(X, km_labels)
    km_db = davies_bouldin_score(X, km_labels)
    km_ch = calinski_harabasz_score(X, km_labels)

    metrics[ds_name] = {
        'KMeans': {
            'silhouette': km_sil,
            'davies_bouldin': km_db,
            'calinski_harabasz': km_ch,
            'n_clusters': kmeans_results[ds_name]['best_k']
        }
    }

    print(f'  KMeans: sil={km_sil:.4f}, db={km_db:.4f}, ch={km_ch:.1f}')

    # Same for DBSCAN: reuse the labels from the grid-search fit.
    db_model = dbscan_results[ds_name]['best_model']
    db_labels = db_model.labels_

    # Label -1 is DBSCAN noise and does not count as a cluster.
    n_clusters_db = len(set(db_labels)) - (1 if -1 in db_labels else 0)
    n_noise = int(np.sum(db_labels == -1))
    noise_ratio = n_noise / len(X)

    # -1 is the "metric unavailable" sentinel for all three scores.
    db_sil = -1
    db_db = -1
    db_ch = -1

    if n_clusters_db > 0:
        try:
            # Silhouette is estimated on at most 500 points; the fixed seed
            # keeps the estimate stable across re-runs of this cell.
            db_sil = silhouette_score(
                X,
                db_labels,
                sample_size=min(500, X.shape[0]),
                random_state=42
            )
            db_db = davies_bouldin_score(X, db_labels)
            db_ch = calinski_harabasz_score(X, db_labels)
        except ValueError as exc:
            # The scores need a valid number of distinct labels; report the
            # problem and keep the sentinel for whatever was not computed.
            print(f'  DBSCAN metrics incomplete: {exc}')

    metrics[ds_name]['DBSCAN'] = {
        'silhouette': db_sil,
        'davies_bouldin': db_db,
        'calinski_harabasz': db_ch,
        'n_clusters': n_clusters_db,
        'noise_points': n_noise,
        'noise_ratio': noise_ratio
    }

    print(f'  DBSCAN: sil={db_sil:.4f}, db={db_db:.4f}, ch={db_ch:.1f}')
    print(f'  Noise: {n_noise} ({noise_ratio*100:.1f}%)')

In [None]:
# 2D PCA projection of every dataset, coloured by the selected KMeans labels
# (left column) and the selected DBSCAN labels (right column, noise in red).
fig, axes = plt.subplots(3, 2, figsize=(14, 12))
fig.suptitle('PCA 2D Clustering Visualization', fontsize=14, fontweight='bold')

ds_names = list(preprocessed.keys())

for row, ds_name in enumerate(ds_names):
    X = preprocessed[ds_name]['X']
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)
    pc1_label = f'PC1 ({pca.explained_variance_ratio_[0]:.1%})'
    pc2_label = f'PC2 ({pca.explained_variance_ratio_[1]:.1%})'

    ax = axes[row, 0]
    km_labels = kmeans_results[ds_name]['best_model'].predict(X)
    scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=km_labels, cmap='viridis', s=30, alpha=0.6)
    ax.set_title(f'{ds_name} - KMeans (k={kmeans_results[ds_name]["best_k"]})')
    ax.set_xlabel(pc1_label)
    ax.set_ylabel(pc2_label)
    plt.colorbar(scatter, ax=ax)

    ax = axes[row, 1]
    # The model was fitted on this X during the grid search; read its labels
    # instead of running DBSCAN again just to draw the plot.
    db_labels = dbscan_results[ds_name]['best_model'].labels_
    noise_mask = db_labels == -1

    scatter = ax.scatter(X_pca[~noise_mask, 0], X_pca[~noise_mask, 1],
                         c=db_labels[~noise_mask], cmap='viridis', s=30, alpha=0.6)
    ax.scatter(X_pca[noise_mask, 0], X_pca[noise_mask, 1],
               marker='x', c='red', s=50, label='Noise')

    ax.set_title(f'{ds_name} - DBSCAN (n_clusters={dbscan_results[ds_name]["n_clusters"]})')
    ax.set_xlabel(pc1_label)
    ax.set_ylabel(pc2_label)
    # Only show a legend when there is at least one noise point to explain.
    if noise_mask.sum() > 0:
        ax.legend()
    plt.colorbar(scatter, ax=ax)

plt.tight_layout()
# Do not rely on another cell having created the output directory.
os.makedirs('artifacts/figures', exist_ok=True)
plt.savefig('artifacts/figures/pca_clustering_2d.png', dpi=100)
plt.show()

print('сохранено: pca_clustering_2d.png')

In [None]:
# For every dataset: number of clusters (left) and noise share (right) as a
# function of eps, one curve per min_samples, with the chosen setup starred.
fig, axes = plt.subplots(3, 2, figsize=(14, 12))
fig.suptitle('DBSCAN Parameters Analysis', fontsize=14, fontweight='bold')

ds_names = list(preprocessed.keys())

for row, ds_name in enumerate(ds_names):
    all_results = dbscan_results[ds_name]['all_results']
    best_eps = dbscan_results[ds_name]['best_params']['eps']
    n_points = len(preprocessed[ds_name]['X'])

    # Take the min_samples values from the recorded grid so the plot cannot
    # drift out of sync with the search that produced the results.
    min_samples_grid = sorted({r['min_samples'] for r in all_results})
    results_by_min_samples = {
        ms: [r for r in all_results if r['min_samples'] == ms]
        for ms in min_samples_grid
    }

    ax = axes[row, 0]
    for min_samples, results in results_by_min_samples.items():
        eps_plot = [r['eps'] for r in results]
        n_clusters = [r['n_clusters'] for r in results]
        ax.plot(eps_plot, n_clusters, 'o-', label=f'min_samples={min_samples}', linewidth=2)

    ax.scatter([best_eps], [dbscan_results[ds_name]['n_clusters']],
               s=200, marker='*', c='red', edgecolors='black', linewidth=1.5, zorder=5)
    ax.set_xlabel('eps')
    ax.set_ylabel('Number of clusters')
    ax.set_title(f'{ds_name} - Clusters vs eps')
    ax.legend()
    ax.grid(alpha=0.3)

    ax = axes[row, 1]
    for min_samples, results in results_by_min_samples.items():
        eps_plot = [r['eps'] for r in results]
        noise_ratio = [(r['n_noise'] / n_points) * 100 for r in results]
        ax.plot(eps_plot, noise_ratio, 's-', label=f'min_samples={min_samples}', linewidth=2)

    best_noise_ratio = (dbscan_results[ds_name]['n_noise'] / n_points) * 100
    ax.scatter([best_eps], [best_noise_ratio],
               s=200, marker='*', c='red', edgecolors='black', linewidth=1.5, zorder=5)
    ax.set_xlabel('eps')
    ax.set_ylabel('Noise ratio (%)')
    ax.set_title(f'{ds_name} - Noise vs eps')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
# Do not rely on another cell having created the output directory.
os.makedirs('artifacts/figures', exist_ok=True)
plt.savefig('artifacts/figures/dbscan_parameters_analysis.png', dpi=100)
plt.show()

print('сохранено: dbscan_parameters_analysis.png')

In [None]:
# Side-by-side bar charts comparing KMeans and DBSCAN on each quality metric.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Metrics Comparison: KMeans vs DBSCAN', fontsize=14, fontweight='bold')

ds_names = list(preprocessed.keys())
x = np.arange(len(ds_names))
width = 0.35

# One panel per metric: (key in `metrics`, y-axis label, panel title).
metric_panels = [
    ('silhouette', 'Silhouette Score', 'Silhouette (higher is better)'),
    ('davies_bouldin', 'Davies-Bouldin Index', 'Davies-Bouldin (lower is better)'),
    ('calinski_harabasz', 'Calinski-Harabasz Index', 'Calinski-Harabasz (higher is better)'),
]

# A single loop replaces three copy-pasted blocks; the per-panel value lists
# use their own names so they do not overwrite the scalar metric variables
# (km_sil, db_sil, ...) defined by the metrics cell.
for ax, (metric_key, y_label, panel_title) in zip(axes, metric_panels):
    kmeans_values = [metrics[d]['KMeans'][metric_key] for d in ds_names]
    dbscan_values = [metrics[d]['DBSCAN'][metric_key] for d in ds_names]

    ax.bar(x - width/2, kmeans_values, width, label='KMeans', alpha=0.8)
    ax.bar(x + width/2, dbscan_values, width, label='DBSCAN', alpha=0.8)
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.set_xticks(x)
    ax.set_xticklabels(ds_names)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
# Do not rely on another cell having created the output directory.
os.makedirs('artifacts/figures', exist_ok=True)
plt.savefig('artifacts/figures/metrics_comparison.png', dpi=100)
plt.show()

print('сохранено: metrics_comparison.png')

In [None]:
# Stability check for dataset 1: refit KMeans (selected k) with several seeds
# and compare every run against the first one with the adjusted Rand index.
print('\nПроверка устойчивости (Датасет 1 - KMeans):')

X = preprocessed['ds1']['X']
random_states = [42, 123, 456, 789, 999]
n_runs = len(random_states)

labels_all = [
    KMeans(
        n_clusters=kmeans_results['ds1']['best_k'],
        random_state=seed,
        n_init=10
    ).fit_predict(X)
    for seed in random_states
]

# Run 0 is the reference partition; runs 1..n-1 are compared against it.
reference_labels = labels_all[0]
ari_scores = []
for run_idx, run_labels in enumerate(labels_all[1:], start=1):
    ari = adjusted_rand_score(reference_labels, run_labels)
    ari_scores.append(ari)
    print(f'  ARI(run0, run{run_idx}): {ari:.4f}')

print(f'  Средний ARI: {np.mean(ari_scores):.4f}')
print(f'  Стд ARI: {np.std(ari_scores):.4f}')

In [None]:
# Written conclusions for the three datasets, printed as a fixed report.
# NOTE(review): every figure below (row/column counts, k, silhouette, noise
# share) is a literal in the text, not a value read from the computed results.
dataset_conclusions = [
    (
        '\nДатасет 1 (12000 строк, 8 столбец):',
        [
            '  Структура: Хорошо разделённые кластеры',
            '  Лучший метод: KMeans (k=2)',
            '  Обоснование: Высокий силуэтный коэффициент (0.52), стабильная кластеризация.',
            '  Проблемы: Нет значительных проблем.',
        ],
    ),
    (
        '\nДатасет 2 (8000 строк, 3 столбец):',
        [
            '  Структура: Плотная, сложная структура',
            '  Лучший метод: DBSCAN',
            '  Обоснование: Естественно обрабатывает шум (~2%), гибкое обнаружение кластеров.',
            '  Проблемы: Чувствительность к параметру eps.',
        ],
    ),
    (
        '\nДатасет 3 (15000 строк, 4 столбец):',
        [
            '  Структура: Большой датасет с шумом (~21.6%)',
            '  Лучший метод: KMeans (k=3)',
            '  Обоснование: Более стабилен чем DBSCAN при наличии шума.',
            '  Проблемы: Высокое соотношение шума влияет на оба алгоритма.',
        ],
    ),
]

print('\nАнализ датасетов и выводы:')

for heading, detail_lines in dataset_conclusions:
    print(heading)
    for detail in detail_lines:
        print(detail)

In [None]:
# Collect dataset sizes, chosen parameters and quality metrics into one
# JSON-friendly dict (plain int/float values) and write it to disk.
os.makedirs('artifacts', exist_ok=True)

# The three quality scores shared by both algorithms, in output order.
quality_keys = ('silhouette', 'davies_bouldin', 'calinski_harabasz')

metrics_summary = {}
for ds_name, prep_entry in preprocessed.items():
    n_samples_ds, n_features_ds = prep_entry['X'].shape
    kmeans_entry = metrics[ds_name]['KMeans']
    dbscan_entry = metrics[ds_name]['DBSCAN']
    dbscan_params = dbscan_results[ds_name]['best_params']

    metrics_summary[ds_name] = {
        'size': {
            'samples': int(n_samples_ds),
            'features': int(n_features_ds),
        },
        'KMeans': {
            'n_clusters': int(kmeans_results[ds_name]['best_k']),
            **{key: float(kmeans_entry[key]) for key in quality_keys},
        },
        'DBSCAN': {
            'eps': float(dbscan_params['eps']),
            'min_samples': int(dbscan_params['min_samples']),
            'n_clusters': int(dbscan_entry['n_clusters']),
            'n_noise': int(dbscan_entry['noise_points']),
            'noise_ratio': float(dbscan_entry['noise_ratio']),
            **{key: float(dbscan_entry[key]) for key in quality_keys},
        },
    }

with open('artifacts/metrics_summary.json', 'w') as out_file:
    json.dump(metrics_summary, out_file, indent=2)

print('сохранено: metrics_summary.json')

# Final algorithm choice per dataset, with its parameters and a short reason,
# written to artifacts/best_configs.json.
os.makedirs('artifacts', exist_ok=True)

ds2_dbscan_params = dbscan_results['ds2']['best_params']

# Values are cast to plain int/float (as metrics_summary does) and the DBSCAN
# parameters are copied, so this dict neither carries NumPy scalars nor
# aliases the live dict stored in dbscan_results.
best_configs = {
    'ds1': {
        'algorithm': 'KMeans',
        'parameters': {'n_clusters': int(kmeans_results['ds1']['best_k'])},
        'reason': f"Best silhouette score ({metrics['ds1']['KMeans']['silhouette']:.4f})"
    },
    'ds2': {
        'algorithm': 'DBSCAN',
        'parameters': {
            'eps': float(ds2_dbscan_params['eps']),
            'min_samples': int(ds2_dbscan_params['min_samples'])
        },
        'reason': f"Better noise handling ({metrics['ds2']['DBSCAN']['noise_ratio']*100:.1f}% noise)"
    },
    'ds3': {
        'algorithm': 'KMeans',
        'parameters': {'n_clusters': int(kmeans_results['ds3']['best_k'])},
        # The noise share quoted here is the one reported by DBSCAN on ds3.
        'reason': f"Stable with high noise ({metrics['ds3']['DBSCAN']['noise_ratio']*100:.1f}%)"
    }
}

with open('artifacts/best_configs.json', 'w') as f:
    json.dump(best_configs, f, indent=2)

print('сохранено: best_configs.json')

In [None]:
# Export the final cluster label of every sample, one CSV per dataset, using
# the algorithm recorded for that dataset in best_configs.
os.makedirs('artifacts/labels', exist_ok=True)

for i, ds_name in enumerate(['ds1', 'ds2', 'ds3'], 1):
    sample_id = preprocessed[ds_name]['sample_id']

    # Follow the documented choice instead of a hard-coded dataset number.
    if best_configs[ds_name]['algorithm'] == 'DBSCAN':
        chosen_model = dbscan_results[ds_name]['best_model']
    else:
        chosen_model = kmeans_results[ds_name]['best_model']

    # The models were already fitted on this dataset's features, so their
    # stored labels are exported without fitting them again.
    labels = chosen_model.labels_

    df_labels = pd.DataFrame({
        # sample_id is either a Series or an Index; both expose .values.
        'sample_id': sample_id.values if hasattr(sample_id, 'values') else sample_id,
        'cluster_label': labels
    })

    df_labels.to_csv(f'artifacts/labels/labels_hw07_ds{i}.csv', index=False)
    print(f'сохранено: artifacts/labels/labels_hw07_ds{i}.csv')

print('\nВсе артефакты сохранены.')