# Week6

### 1) Load Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
wine = load_wine()
data = pd.DataFrame(wine.data, columns=wine.feature_names)

In [3]:
cls = pd.Series(wine.target, dtype='category')
data['class'] = cls

In [4]:
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


### 2) Logistic Regression Model

In [5]:
features=list(data.drop(['class'],axis=1).columns)
X = data.drop(['class'], axis=1).values
y = data['class'].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [7]:
logistic_model = LogisticRegression(solver='sag', max_iter=100000)
logistic_model.fit(X_train, y_train)
pred = logistic_model.predict(X_test)
print('로지스틱회귀 모델의 Accuracy :', round(accuracy_score(y_test, pred), 3))

로지스틱회귀 모델의 Accuracy : 0.889


### 3) Logistic Regression Model with PCA

In [8]:
X = data.loc[:, features].values
y = data.loc[:, ['class']].values
X_scale = StandardScaler().fit_transform(X)

In [9]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scale)
data_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
data_pca = pd.concat([data[['class']], data_pca], axis=1)
data_pca.head()

Unnamed: 0,class,PC1,PC2
0,0,3.316751,-1.443463
1,0,2.209465,0.333393
2,0,2.51674,-1.031151
3,0,3.757066,-2.756372
4,0,1.008908,-0.869831


In [10]:
y_pca = data_pca['class'].values
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y_pca, test_size=0.3)

In [11]:
logistic_model_with_pca = LogisticRegression(solver='sag', max_iter=100000)
logistic_model_with_pca.fit(X_train_pca, y_train_pca)
pred_pca = logistic_model_with_pca.predict(X_test_pca)
print('PCA를 이용한 변수선택 이후 로지스틱회귀 모델의 Accuracy :', round(accuracy_score(y_test_pca, pred_pca), 3))

PCA를 이용한 변수선택 이후 로지스틱회귀 모델의 Accuracy : 0.963


### 4) Conclusion

로지스틱회귀 모델만 가지고 분류하는 것보다 PCA로 변수를 추린 뒤 로지스틱회귀 모델로 분류하는 것이 조금 더 높은 분류 정확도를 보였다.