In [1]:
import numpy as np
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reading the Data

In [2]:
df = pd.read_csv('../data/MNISTonly0_1.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Separating X and y

In [3]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

# Standardizing the data 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.2)
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Without Pipeline 

In [5]:
pca = PCA(n_components=.90, random_state=42)
pca.fit(X_train)

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform (X_test)

clf = LogisticRegression(random_state=42)
clf.fit(X_train_pca, y_train)

print(classification_report(y_true=y_train, y_pred=clf.predict(X_train_pca)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1622
           1       1.00      1.00      1.00      1578

    accuracy                           1.00      3200
   macro avg       1.00      1.00      1.00      3200
weighted avg       1.00      1.00      1.00      3200



In [6]:
pipe = Pipeline([('scaler', StandardScaler()),
('PCA', PCA(n_components=.90, random_state=42)),
('Logistic',LogisticRegression(random_state=42))])

pipe.fit(X_train, y_train)

print(f'Accuracy of pipeline on test set is {pipe.score(X_test, y_test):.3%}')

train_pred_pipe = pipe.predict(X_train)
test_pred_pipe = pipe.predict(X_test)

print('Classification report for train set using pipeline')
print(classification_report(y_true=y_train,y_pred=train_pred_pipe))

print('Classification report for test set using pipeline')
print(classification_report(y_true=y_test, y_pred=test_pred_pipe))

Accuracy of pipeline on test set is 99.875%
Classification report for train set using pipeline
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1622
           1       1.00      1.00      1.00      1578

    accuracy                           1.00      3200
   macro avg       1.00      1.00      1.00      3200
weighted avg       1.00      1.00      1.00      3200

Classification report for test set using pipeline
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       378
           1       1.00      1.00      1.00       422

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800



In [7]:
from sklearn import set_config

set_config(display='diagram')
pipe