In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:

# Build a small synthetic 3-D dataset: the first two columns are noisy
# trigonometric functions of a random angle, the third is a noisy linear
# combination of the first two.
np.random.seed(4)  # fixed seed so the draws below are reproducible

m = 60             # number of samples
w1, w2 = 0.1, 0.3  # weights mixing columns 0 and 1 into column 2
noise = 0.1        # noise scale

# The order of the random draws is significant: angles first, then one
# normal draw per column, in column order.
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
col0 = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2
col1 = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
col2 = col0 * w1 + col1 * w2 + noise * np.random.randn(m)
X = np.column_stack((col0, col1, col2))

X.shape

In [None]:
# Centre each column on its mean. This is centering only: the column
# variances are not rescaled.
X_cen = X - X.mean(axis=0)

# Unbiased sample covariance: divide by (n_samples - 1). The sample count is
# read from the data instead of being hard-coded, so the cell stays correct
# if the dataset size changes.
n_samples = X_cen.shape[0]
X_cov = np.dot(X_cen.T, X_cen) / (n_samples - 1)  # covariance matrix

print(X_cov)

In [None]:
# X_cov is symmetric, so use the symmetric eigensolver: it returns real
# eigenvalues and orthonormal eigenvectors, whereas the general-purpose
# np.linalg.eig does not guarantee any ordering of its results.
w, v = np.linalg.eigh(X_cov)

# eigh returns eigenvalues in ascending order; reorder to descending so the
# first eigenpair is the direction of largest variance. Column v[:, i] is
# the eigenvector that belongs to w[i], so both are permuted together.
order = np.argsort(w)[::-1]
w, v = w[order], v[:, order]

print('eigenvalue :', w)
print('eigenvector :', v)

In [None]:
# Share of the total variance carried by each eigenvalue.
variance_share = w / np.sum(w)
print('explained variance ratio :', variance_share)

In [None]:
# Regenerate the same synthetic 3-D dataset (same seed, same draw order) so
# that this section can be run from this cell onwards.
np.random.seed(4)

m = 60             # number of samples
w1, w2 = 0.1, 0.3  # weights mixing columns 0 and 1 into column 2
noise = 0.1        # noise scale

# Random draws must happen in this order: angles, then one normal draw per
# column.
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
first = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2
second = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
third = first * w1 + second * w2 + noise * np.random.randn(m)
X = np.stack([first, second, third], axis=1)

X.shape

In [None]:

# Centre every column on its mean (no variance rescaling is applied).
X_cen = X - np.mean(X, axis=0)

# Show the first five rows before and after centering.
print(X[:5])
print(X_cen[:5])

In [None]:
# Singular value decomposition of the centred data: X_cen = U @ diag(D) @ V_t.
svd_factors = np.linalg.svd(X_cen)
U, D, V_t = svd_factors

print('singular value :', D)
# Transpose V_t so that each column printed is one right singular vector.
print('singular vector :\n', np.transpose(V_t))

In [None]:
# Squared singular values, normalised by their sum.
squared_singular = D ** 2
print('explained variance ratio :', squared_singular / np.sum(squared_singular))

In [None]:
# Load iris into a DataFrame: the measurement columns plus a 'target' column
# in which the numeric class codes are replaced by species names.
iris = load_iris()
df = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])
species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df['target'] = pd.Series(iris['target']).map(species_by_code)

# Quick look: first rows, dimensions, summary statistics.
for summary in (df.head(), df.shape, df.describe()):
    print(summary)

In [None]:
# Columns used as model inputs.
features = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
X = df[features].to_numpy()
y = df[['target']].to_numpy()

# Standardise each feature with StandardScaler.
scaler = StandardScaler()
X = scaler.fit_transform(X)
finalDf = pd.DataFrame(X, columns=features)
finalDf.head()

In [None]:
# Covariance between features; rowvar=False tells np.cov that each column of
# X is a variable and each row an observation.
covariance_matrix = np.cov(X, rowvar=False)

print(covariance_matrix)

In [None]:
# The covariance matrix is symmetric, so use the symmetric eigensolver: it
# returns real eigenvalues and orthonormal eigenvectors, whereas the
# general-purpose np.linalg.eig does not guarantee any ordering.
eig_vals, eig_vecs = np.linalg.eigh(covariance_matrix)

# eigh returns eigenvalues in ascending order; reorder to descending so the
# leading eigenpair is the first principal direction. Column eig_vecs[:, i]
# belongs to eig_vals[i], so both are permuted together.
order = np.argsort(eig_vals)[::-1]
eig_vals, eig_vecs = eig_vals[order], eig_vecs[:, order]

print('eigenvalue :', eig_vals) # eigen values
print('eigenvector :', eig_vecs) # eigen vectors

In [None]:
# Project X onto its first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Tabulate the projection and attach the 'target' column from df (aligned on
# the row index).
pc_names = ['Principal Component 1', 'Principal Component 2']
df_pca = pd.DataFrame(X_pca, columns=pc_names).assign(target=df['target'])

In [None]:
# Scatter of the two principal components, coloured by the 'target' column.
plt.figure(figsize=(10, 5))
sns.scatterplot(data=df_pca, x='Principal Component 1', y='Principal Component 2', hue='target');

In [None]:
# Factor analysis on the feature columns of df as stored (re-read from df,
# so the StandardScaler output is not used here).
X = df.loc[:, features].values

# Tie the number of factors to the number of observed features so that the
# requested n_components and the column labels below always agree.
n_factors = len(features)
fa = FactorAnalysis(n_components=n_factors)
X_fa = fa.fit_transform(X)

# Labels Factor1..FactorN, one per extracted factor.
factor_names = [f'Factor{i}' for i in range(1, n_factors + 1)]
df_fa = pd.DataFrame(data=X_fa, columns=factor_names)
df_fa.head()

In [None]:
# Scatter of the first two factors, coloured by the 'target' column of df.
plt.figure(figsize=(10, 5))
sns.scatterplot(data=df_fa, x='Factor1', y='Factor2', hue=df['target']);

In [None]:
!pip install factor_analyzer

In [None]:
from factor_analyzer import FactorAnalyzer

In [None]:
# NOTE(review): hard-coded absolute path on one specific machine. The later
# relative reads ('./bfi.csv', 'Wine.csv') resolve against this directory, so
# adjust it (or remove it and run from the data folder) on other machines.
%cd C:\Users\Yoonk\Desktop\week6

In [None]:
# Read bfi.csv from the current working directory; the first CSV column is
# used as the row index.
fpath = './bfi.csv'
df = pd.read_csv(filepath_or_buffer=fpath, index_col=0)
df.head(n=5)

In [None]:
# Drop the gender / education / age columns. The result is reassigned
# instead of mutating in place, and missing columns are ignored, so
# re-running this cell does not raise a KeyError.
df = df.drop(columns=['gender', 'education', 'age'], errors='ignore')
df.head()

In [None]:
# Discard every row that contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

# Fit a six-factor model with varimax rotation, then fetch the two
# eigenvalue arrays returned by get_eigenvalues(); `ev` feeds the scree plot.
fa = FactorAnalyzer(rotation='varimax', n_factors=6)
fa.fit(df)
ev, v = fa.get_eigenvalues()

In [None]:
# Scree plot: eigenvalue against 1-based factor index (one per column of df).
xvals = range(1, len(df.columns) + 1)

# Markers and connecting line for the same series.
plt.scatter(xvals, ev)
plt.plot(xvals, ev)

plt.grid()
plt.xlabel('Factor')
plt.ylabel('Eigenvalue')
plt.title('Scree Plot')
plt.show()

In [None]:
# Read Wine.csv with header=None: the first line is treated as data and the
# columns receive integer labels.
wine = pd.read_csv(filepath_or_buffer='Wine.csv', header=None)
wine.head(n=5)

In [None]:
# Column 0 is taken as the label; all remaining columns are the features.
y = wine.iloc[:, 0]
X = wine.iloc[:, 1:]

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Each pipeline gets its own LogisticRegression instance. A Pipeline keeps
# the estimator objects it is given rather than copying them, so sharing one
# instance would let every fit() overwrite the classifier held by the other
# pipelines and leave them unusable for later predict() calls.
lr = LogisticRegression()

# Baseline: scaling followed by logistic regression on all features.
lr_clf = Pipeline([('scaler', StandardScaler()), ('lr', lr)])

# Scaling, reduction to two principal components, then logistic regression.
lr_clf_pca = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=2)),
                       ('lr', LogisticRegression())])

# Scaling, reduction to two factors, then logistic regression.
lr_clf_fa = Pipeline([('scaler', StandardScaler()), ('fa', FactorAnalysis(n_components=2)),
                       ('lr', LogisticRegression())])

In [None]:
from sklearn.metrics import accuracy_score

# (label, pipeline) pairs to compare on the held-out split.
clfs = [
    ("Logistic", lr_clf),
    ("PCA", lr_clf_pca),
    ("FA", lr_clf_fa),
]

# Fit each pipeline on the training split and report its test accuracy.
for clf_name, clf in clfs:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(f"{clf_name} : {accuracy_score(y_test, y_pred)}")