# Feature selection (chi-square, ANOVA) and PCA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the CSV from the current working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='insurance.csv')

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import classification_report,confusion_matrix

In [32]:
# X: every row, all columns except the last (the features).
# Y: every row, last column only (the target).
X, Y = df.iloc[:, :-1], df.iloc[:, -1]

In [33]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

In [34]:
def classify(model,xtrain=X_train,xtest=X_test,ytrain=None,ytest=None):
    """Fit ``model`` on the training data and print a classification report.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place; any previous fit on the same object is replaced.
    xtrain, xtest : feature matrices
        Default to the notebook-level ``X_train`` / ``X_test``. These defaults
        are bound when this cell is executed, so re-run the cell after
        re-splitting the data.
    ytrain, ytest : label vectors, optional
        When omitted, the notebook-level ``y_train`` / ``y_test`` are looked up
        at call time, which is what the function always used before these
        parameters existed.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    # Resolve the labels lazily so callers can evaluate on a different split
    # without touching the notebook globals.
    if ytrain is None:
        ytrain = y_train
    if ytest is None:
        ytest = y_test

    model.fit(xtrain,ytrain)
    y_pred = model.predict(xtest)
    print(classification_report(ytest,y_pred))

In [35]:
# Classifier shared by every experiment below; only the seed is overridden.
lr = LogisticRegression(
    random_state=1,
)

In [36]:
# Baseline: fit and score using classify's default train/test feature matrices.
classify(model=lr)



              precision    recall  f1-score   support

           0       0.77      0.70      0.74       157
           1       0.82      0.87      0.84       245

    accuracy                           0.80       402
   macro avg       0.80      0.79      0.79       402
weighted avg       0.80      0.80      0.80       402



In [37]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

In [38]:
# Univariate selector: score each feature with the chi-square statistic
# and keep the 5 highest-scoring ones.
test = SelectKBest(
    score_func=chi2,
    k=5,
)

In [40]:
# Learn the feature scores from the training split only ...
test.fit(X_train, y_train)
# ... then apply the same column selection to both splits.
X_train_chi = test.transform(X_train)
X_test_chi = test.transform(X_test)

In [41]:
# Refit and score the classifier on the chi-square-selected features.
classify(lr, xtrain=X_train_chi, xtest=X_test_chi)

              precision    recall  f1-score   support

           0       0.77      0.69      0.73       157
           1       0.81      0.87      0.84       245

    accuracy                           0.80       402
   macro avg       0.79      0.78      0.78       402
weighted avg       0.80      0.80      0.80       402





In [43]:
# which features are selected
# Boolean mask over the input columns: True marks a feature the selector kept.
# In the recorded output below, features 1, 3, 4, 5 and 7 (1-based) are selected.
test.get_support(indices=False)

array([ True, False,  True,  True,  True, False,  True])

# ANOVA

In [52]:
# ANOVA with a class-label target is f_classif in scikit-learn; f_regression
# (used here before) is the univariate F-test for a continuous target.
from sklearn.feature_selection import f_classif

# Keep the 5 features with the highest ANOVA F-score.
# Note: this rebinds `test`, replacing the chi-square selector defined earlier.
test = SelectKBest(score_func=f_classif, k=5)

In [53]:
# Learn the feature scores from the training split only ...
test.fit(X_train, y_train)
# ... then apply the same column selection to both splits.
X_train_anova = test.transform(X_train)
X_test_anova = test.transform(X_test)

In [54]:
# Refit and score the classifier on the ANOVA-selected features.
classify(lr, xtrain=X_train_anova, xtest=X_test_anova)

              precision    recall  f1-score   support

           0       0.77      0.69      0.73       157
           1       0.81      0.87      0.84       245

    accuracy                           0.80       402
   macro avg       0.79      0.78      0.78       402
weighted avg       0.80      0.80      0.80       402





# PCA

In [56]:
from sklearn.decomposition import PCA

In [58]:
# Project onto the first 5 principal components; the seed is fixed for reproducibility.
pc = PCA(
    n_components=5,
    random_state=1,
)

In [61]:
# PCA is unsupervised and ignores labels, so only the features are passed.
# The components are learned from the training split and reused for the test split.
# NOTE(review): no scaling step is visible before this cell; PCA is sensitive to
# feature scale, so confirm whether the features should be standardised first.
X_train_pc = pc.fit_transform(X_train)
X_test_pc = pc.transform(X_test)

In [62]:
# Refit and score the classifier on the PCA-projected features.
classify(lr, xtrain=X_train_pc, xtest=X_test_pc)

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       157
           1       0.89      0.89      0.89       245

    accuracy                           0.87       402
   macro avg       0.86      0.86      0.86       402
weighted avg       0.87      0.87      0.87       402





In [63]:
# Principal axes of the fitted PCA; in the output below each row is one
# component and each column is the weight of one input feature.
pc.components_        # principal components

array([[ 3.47149728e-04,  2.02993643e-06,  9.76283845e-05,
         3.28579817e-06,  2.64211095e-05, -3.42637708e-07,
         9.99999935e-01],
       [-9.99306837e-01,  3.96547954e-04, -3.63219499e-02,
        -6.54221995e-04,  7.98074855e-03, -1.46982768e-03,
         3.50245154e-04],
       [-3.64157326e-02,  3.20701132e-03,  9.98771715e-01,
        -2.82654855e-03, -8.63765073e-03,  3.21879633e-02,
        -8.46247403e-05],
       [-8.62074412e-04,  5.91417041e-03, -1.12058201e-03,
         9.92983639e-01, -1.02708502e-02,  1.17647890e-01,
        -2.55440201e-06],
       [ 1.34004096e-04,  1.28479567e-02,  3.21883217e-02,
         1.17432321e-01, -1.05127579e-02, -9.92420252e-01,
        -3.66323489e-06]])