In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import os
NUM_THREADS = "1"
os.environ["OMP_NUM_THREADS"] = NUM_THREADS
os.environ["OPENBLAS_NUM_THREADS"] = NUM_THREADS
os.environ["MKL_NUM_THREADS"] = NUM_THREADS
os.environ["VECLIB_MAXIMUM_THREADS"] = NUM_THREADS
os.environ["NUMEXPR_NUM_THREADS"] = NUM_THREADS
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
import numpy as np
from sklearn.datasets import make_blobs, make_moons, load_digits, load_sample_image
from sklearn.metrics import pairwise_distances_argmin, accuracy_score, confusion_matrix
from sklearn.cluster import SpectralClustering, KMeans, MiniBatchKMeans
from sklearn.manifold import TSNE
from scipy.stats import mode
import warnings; warnings.simplefilter('ignore')

# Load the dataset from the CSV file next to the notebook.
data = pd.read_csv('Country-data.csv')

print("Перевірка загальної інформації про дані:\n")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
data.info()
print("")
print("Перевірка на пропущені значення:\n")
print(data.isnull().sum())

# Keep only the float64/int64 columns; this numeric view feeds the
# correlation matrix further down.
numeric_data = data.select_dtypes(include=['float64', 'int64'])

# Univariate analysis: for every column after the first one, draw a histogram
# with a KDE overlay followed by a box plot.
print("Однофакторний аналіз:\n")
for column in data.columns[1:]:
    hist_fig, hist_ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data[column], bins=20, kde=True, ax=hist_ax)
    hist_ax.set_title(f'{column} distribution')
    hist_ax.set_xlabel(column)
    plt.show()

    box_fig, box_ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x=column, data=data, ax=box_ax)
    box_ax.set_title(f'{column} boxplot')
    box_ax.set_xlabel(column)
    plt.show()

# Bivariate analysis: scatter plot of 'gdpp' against 'child_mort'.
print("Двофакторний аналіз:\n")
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='gdpp', y='child_mort', data=data, ax=scatter_ax)
scatter_ax.set_title('GDP per Capita vs Child Mortality Rate')
scatter_ax.set_xlabel('GDP per Capita')
scatter_ax.set_ylabel('Child Mortality Rate')
plt.show()

# Correlation matrix of the numeric columns, annotated with two decimals.
print("Кореляційна матриця:\n")
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
correlations = numeric_data.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
plt.show()

# K-means clustering on three selected columns.
from sklearn.preprocessing import StandardScaler

features = ['gdpp', 'child_mort', 'income']

# K-means relies on Euclidean distances, so the columns are standardized
# first; otherwise the column with the largest numeric range would dominate
# the cluster assignment.
feature_scaler = StandardScaler()
features_scaled = feature_scaler.fit_transform(data[features])

# Elbow method: record the inertia for k = 1..10.
k_values = range(1, 11)
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(features_scaled)
    inertia.append(kmeans.inertia_)
print("Кластеризація методом k-means:\n")
plt.figure(figsize=(8, 6))
plt.plot(k_values, inertia, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

# Final model with three clusters; the labels are stored next to the raw data.
kmeans = KMeans(n_clusters=3, random_state=42)
data['Cluster'] = kmeans.fit_predict(features_scaled)

# Visualize the clusters in the original (unscaled) units.
print("Візуалізація результатів кластеризації:\n")
plt.figure(figsize=(8, 6))
sns.scatterplot(x='gdpp', y='child_mort', data=data, hue='Cluster', palette='viridis')
plt.title('Clusters based on GDP per Capita and Child Mortality Rate')
plt.xlabel('GDP per Capita')
plt.ylabel('Child Mortality Rate')
plt.legend(title='Cluster')
plt.show()

print("Summary k-means:\n")
# The model was fitted on standardized values, so the centroids are mapped
# back to the original units before being reported.
cluster_centers = feature_scaler.inverse_transform(kmeans.cluster_centers_)
cluster_summary = pd.DataFrame(cluster_centers, columns=features)
print(cluster_summary)

In [None]:
# Re-read the CSV so this cell works on an unmodified frame.
data = pd.read_csv('Country-data.csv')
# Columns that are scanned for outliers below.
columns_with_outliers = [
    'child_mort',
    'exports',
    'health',
    'imports',
    'income',
    'inflation',
    'life_expec',
    'total_fer',
    'gdpp',
]

# Outlier detection based on the inter-quartile range (IQR).
def detect_outliers(df: pd.DataFrame, column: str, whisker: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - whisker * IQR``
    or strictly above ``Q3 + whisker * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the column to test.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the two bounds. The
        default reproduces the original fixed 1.5 rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) flagged as outliers.
        Rows where the value is missing are never flagged, because both
        comparisons evaluate to False for NaN.
    """
    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    margin = whisker * (q3 - q1)
    is_outlier = (values < q1 - margin) | (values > q3 + margin)
    return df[is_outlier]

# Look for outliers in every listed column; columns without any are skipped.
outliers_info = {}
for col in columns_with_outliers:
    outliers = detect_outliers(data, col)
    if outliers.empty:
        continue
    outliers_info[col] = outliers

# Print the flagged rows column by column.
for col in outliers_info:
    outliers = outliers_info[col]
    print(f"Викиди для '{col}':")
    print(outliers)
    print('\n')

In [None]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

data = pd.read_csv('Country-data.csv')
# Work on an explicit copy: depending on the pandas version,
# pd.DataFrame(data) may share its underlying storage with `data`, so the
# cluster columns added below could leak into the raw frame.
df = data.copy()
# Every column except 'country' is used as a clustering feature.
features = df.drop('country', axis=1)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Number of clusters for k-means.
k = 3

# K-means model fitted on the standardized features.
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(scaled_features)
df['kmeans_cluster'] = kmeans.labels_
kmeans_silhouette = silhouette_score(scaled_features, kmeans.labels_)

# DBSCAN model fitted on the same standardized features.
# NOTE(review): eps=0.5 is measured in the standardized feature space; confirm
# it is not so small that most points end up labelled as noise (-1).
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(scaled_features)
df['dbscan_cluster'] = dbscan.labels_

# Plot the k-means result: the first two standardized columns, with one fixed
# color per cluster label.
fte_colors = {0: "#008fd5", 1: "#fc4f30", 2: "#e5ae38"}
cluster_colors = [fte_colors[cluster_id] for cluster_id in df['kmeans_cluster'].tolist()]
kmeans_fig, kmeans_ax = plt.subplots()
kmeans_ax.scatter(scaled_features[:, 0], scaled_features[:, 1], c=cluster_colors)
kmeans_ax.set_xlabel('Feature 1')
kmeans_ax.set_ylabel('Feature 2')
kmeans_ax.set_title(f'K-means Clustering (k={k}) - Silhouette: {kmeans_silhouette:.2f}')
plt.show()

# Plot the DBSCAN result: the first two standardized columns, one color per
# label, with noise points (label -1) drawn in black.
unique_labels = sorted(set(dbscan.labels_))
n_labels = len(unique_labels)
# A colormap called with integers indexes its lookup table directly, which
# makes 0, 1, 2, ... almost the same color. Sampling it with floats spread
# over [0, 1] gives each label a clearly distinct color.
colors = [plt.cm.Spectral(i / max(n_labels - 1, 1)) for i in range(n_labels)]
# The loop variable is deliberately not called `k`, so the cluster count
# defined earlier in this cell is not overwritten.
for label, color in zip(unique_labels, colors):
    if label == -1:
        color = [0, 0, 0, 1]
    class_member_mask = (dbscan.labels_ == label)
    xy = scaled_features[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(color), markeredgecolor='k', markersize=14)
plt.title('DBSCAN Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from kneed import KneeLocator
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Reload the data and standardize every column except 'country'.
data = pd.read_csv('Country-data.csv')
features = data.drop(columns='country')
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Collect the k-means inertia for k = 1..10, then let kneed locate the elbow
# of the resulting convex, decreasing curve.
k_values = range(1, 11)
sse = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_features)
    sse.append(kmeans.inertia_)

kl = KneeLocator(k_values, sse, curve="convex", direction="decreasing")
optimal_k = kl.elbow

elbow_fig, elbow_ax = plt.subplots(figsize=(8, 6))
elbow_ax.plot(k_values, sse, marker='o')
elbow_ax.set_xlabel('Number of clusters (k)')
elbow_ax.set_ylabel('Sum of squared distances')
elbow_ax.set_title('Elbow Method for Optimal k')
# Dashed marker at the selected k, spanning the current y-range of the plot.
y_low, y_high = elbow_ax.get_ylim()
elbow_ax.vlines(optimal_k, y_low, y_high, linestyles='dashed', colors='red', label=f'Optimal k: {optimal_k}')
elbow_ax.legend()
plt.show()

print(f'Optimal k is: {optimal_k}')

In [None]:

# Cluster the standardized features with the k selected by the elbow search.
kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(scaled_features)
# NOTE(review): `df` is not created in this cell; it presumably comes from an
# earlier cell and must have one row per sample in `scaled_features` - confirm.
df['cluster'] = kmeans.labels_

# Samples and centroids projected on the first two standardized columns.
centroids = kmeans.cluster_centers_
result_fig, result_ax = plt.subplots(figsize=(8, 6))
result_ax.scatter(scaled_features[:, 0], scaled_features[:, 1], c=kmeans.labels_, cmap='viridis', s=50, alpha=0.5)
result_ax.scatter(centroids[:, 0], centroids[:, 1], s=200, c='red', marker='*', label='Centroids')
result_ax.set_title(f'Clustering Results (k={optimal_k})')
result_ax.set_xlabel('Feature 1')
result_ax.set_ylabel('Feature 2')
result_ax.legend()
plt.show()

print(f'Optimal k for clustering: {optimal_k}')

In [None]:
from tabulate import tabulate

# Every column after the first one is summarized.
all_features = data.columns[1:]

# Mean / min / max per feature, filled in one row at a time.
summary_all = pd.DataFrame(index=all_features, columns=['Mean', 'Min', 'Max'])
for feature in all_features:
    feature_values = data[feature]
    summary_all.loc[feature] = [feature_values.mean(), feature_values.min(), feature_values.max()]

print("Summary для всіх факторів:\n")
print(tabulate(summary_all, headers='keys', tablefmt='pretty'))