# PCA + Logistic Regression

#### Александр Широков ПМ-1701

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from time import time
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [2]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## MNIST Download

In [3]:
from sklearn.datasets import fetch_openml
# Download MNIST (70000 x 784) from OpenML.
# as_frame=False pins the return type to NumPy arrays: the default changed to
# a pandas DataFrame/Series in newer scikit-learn releases, while the cells
# below slice X and y positionally and hand them to array-based code.
X, y = fetch_openml('mnist_784', return_X_y=True, as_frame=False)
# OpenML serves the digit labels as strings; convert them to integers.
y = y.astype(int)

## Tests

In [11]:
result = {'Variance Retained' : [], 
          'Number of Components' : [], 
          'Time (seconds)' : [], 
          'Accuracy' : []}
step = 1
for var in [0.75, 0.8, 0.85, 0.9, 0.95, 1]:
    if var==1:
        print(f'{var}, step={step}')
        n = 60000
        X_train, y_train, X_test, y_test = X[:n], y[:n], X[n:], y[n:]
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        cls = LogisticRegression(penalty='l2')
        t = %timeit -n1 -r 1 -o -q cls.fit(X_train, y_train)
        y_pred = cls.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        result['Variance Retained'].append(1)
        result['Number of Components'].append(784)
        result['Time (seconds)'].append(t.average)
        result['Accuracy'].append(accuracy)
        step +=1
    else:   
        print(f'{var}, step={step}')
        pca = PCA(n_components=var, svd_solver = 'full')
        pca_X = pca.fit_transform(X)
        n = 60000
        X_train, y_train, X_test, y_test = pca_X[:n], y[:n], pca_X[n:], y[n:]
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        cls = LogisticRegression(penalty='l2')
        t = %timeit -n1 -r 1 -o -q cls.fit(X_train, y_train)
        y_pred = cls.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        result['Variance Retained'].append(var)
        result['Number of Components'].append(pca.n_components_)
        result['Time (seconds)'].append(t.average)
        result['Accuracy'].append(accuracy)
        step +=1

0.75, step=1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8, step=2
0.85, step=3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9, step=4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.95, step=5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


1, step=6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Results

In [12]:
# Turn the per-experiment lists collected above into a table (one row per
# experiment) and label the row index.
result = pd.DataFrame(result).rename_axis('№ of Experiement')
result

Unnamed: 0_level_0,Variance Retained,Number of Components,Time (seconds),Accuracy
№ of Experiement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.75,33,6.298766,0.9014
1,0.8,43,6.358118,0.9101
2,0.85,59,7.501682,0.9166
3,0.9,87,8.393642,0.9196
4,0.95,154,11.260587,0.9235
5,1.0,784,43.895359,0.9246


## Check KNN Classification

In [13]:
from knn import KNNClassifier, BatchedKNNClassifier
from knn.distances import euclidean_distance, cosine_distance
from knn.model_selection import knn_cross_val_score
from knn.classification import BatchedMixin
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, BaseCrossValidator

In [15]:
# Cross-validate the KNN classifier on the full data set for each distance
# metric, recording both the scores and the total wall-clock time per metric.
best_method = 'brute'
metrics = ['euclidean', 'cosine']
k_list = np.arange(1, 11)

# Placeholders, overwritten below with the measured values for every metric.
cross_validation_result_time = dict.fromkeys(metrics, 'mean_time')
cross_validation_result_accuracy = dict.fromkeys(metrics, 0)

for metric_name in metrics:
    print(f'Metric: {metric_name}')
    started_at = time()
    # Presumably returns a mapping k -> per-fold scores (the next cell iterates
    # `.items()` and averages each value) - confirm against the knn package.
    scores = knn_cross_val_score(
        X, y,
        k_list=k_list,
        scoring='accuracy',
        cv=KFold(n_splits=3),
        batch_size=1000,
        algorithm=best_method,
        metric=metric_name,
    )
    elapsed = time() - started_at
    cross_validation_result_time[metric_name] = elapsed
    cross_validation_result_accuracy[metric_name] = scores

Metric: euclidean
Metric: cosine


In [16]:
# Collapse the per-fold scores into one mean accuracy per (metric, k) pair.
cross_validation_result_accuracy_mean = {}
for metric_name, scores_by_k in cross_validation_result_accuracy.items():
    cross_validation_result_accuracy_mean[metric_name] = {
        k: np.mean(fold_scores) for k, fold_scores in scores_by_k.items()
    }

In [17]:
# Single-row table: total cross-validation time per metric, in seconds.
df_cross_validation_result_time = pd.DataFrame(
    [cross_validation_result_time],
    index=['time'],
)
df_cross_validation_result_time

Unnamed: 0,euclidean,cosine
time,273.449266,290.424443


In [18]:
# Mean accuracy table: one row per k, one column per distance metric.
df_cross_validation_result_accuracy_mean = pd.DataFrame.from_dict(
    cross_validation_result_accuracy_mean,
    orient='columns',
)
df_cross_validation_result_accuracy_mean

Unnamed: 0,euclidean,cosine
1,0.967871,0.9721
2,0.962029,0.969014
3,0.968786,0.9731
4,0.967214,0.972314
5,0.967357,0.9724
6,0.9664,0.972014
7,0.966729,0.970929
8,0.965671,0.970714
9,0.964814,0.969886
10,0.964543,0.969729


**Вывод**: действительно, `PCA` позволяет дать выигрыш в памяти (по сравнению с `Batched KNN`) — почти в `6-7` раз, но проседает по точности. Что выбирать — остаётся за нами.