In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import time

# Загрузка данных
df = datasets.load_digits()
X = df.data
y = df.target

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [2]:
# Обучение моделей на исходных данных
models = {
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Decision Tree": DecisionTreeClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

SVM Accuracy: 0.9861
Logistic Regression Accuracy: 0.9722
Decision Tree Accuracy: 0.8306


In [3]:
# Снижение размерности с помощью PCA
pca = PCA(n_components=0.9)
start_time = time.time()
X_train_pca = pca.fit_transform(X_train)
pca_train_time = time.time() - start_time
start_time = time.time()
X_test_pca = pca.transform(X_test)
pca_test_time = time.time() - start_time

print(f"pca train time: {pca_train_time:.2f} seconds")
print(f"pca test time: {pca_test_time:.2f} seconds")
print(f"PCA number of components to describe 90% variance: {pca.n_components_}")

# Снижение размерности с помощью t-SNE
tsne = TSNE(n_components=2, random_state=42)
start_time = time.time()
X_tsne = tsne.fit_transform(df.data)
tsne_train_time = time.time() - start_time
X_train_tsne, X_test_tsne, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"t-SNE train time: {tsne_train_time:.2f} seconds")


pca train time: 0.03 seconds
pca test time: 0.00 seconds
PCA number of components to describe 90% variance: 21


found 0 physical cores < 1
  File "C:\Users\iarik\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


t-SNE train time: 8.65 seconds


In [4]:
# Обучение моделей на данных с пониженной размерностью (PCA)
for name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy with PCA: {accuracy:.4f}")

# Обучение моделей на данных с пониженной размерностью (t-SNE)
for name, model in models.items():
    model.fit(X_train_tsne, y_train)
    y_pred = model.predict(X_test_tsne)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy with t-SNE: {accuracy:.4f}")

SVM Accuracy with PCA: 0.9861
Logistic Regression Accuracy with PCA: 0.9583
Decision Tree Accuracy with PCA: 0.8472
SVM Accuracy with t-SNE: 0.9861
Logistic Regression Accuracy with t-SNE: 0.9722
Decision Tree Accuracy with t-SNE: 0.8500


<b>Вывод</b>
- Для SVM прироста/снижения метрики при снижении размерности не наблюдается
- Для Logistic Regression PCA повлиял негативно, метрика снизилась, TSNE никак не повлиял на результаты.
- Для Decision Tree PCA повлиял положительно, метрика выросла, а TSNE негативно, метрика снизилась

Для сохранения 90% дисперсии PCA необходима 21 компонента.

- PCA работал 0.04 секунды
- TSNE работал 3.36 секунды