
# PCA

### About Dataset


| ID |	Identification|
|:----------|:-----------|
|Group       	|Demented or Nondemented|
|Visit 	       |The visit number|
|M/F 	|          Gender|
|Hand           |Dominant Hand|
|Age| 	Age in years|
|Educ 	|Years of Education|
|SES| 	Socioeconomic Status|
|MMSE 	|Mini Mental State Examination|
|CDR| 	Clinical Dementia Rating|
|eTIV 	|Estimated Total Intracranial Volume|
|nWBV| 	Normalize Whole Brain Volume|
|ASF| 	Atlas Scaling Factor|
|Delay| 	Delay|

### Importing libraries


In [None]:
import pandas as pd
import numpy as np

### Load the data

In [2]:
df= pd.read_csv ("Alzheimer_Dataset.csv")
df.head()

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,Gender,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Demented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Demented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


### Check for any missing data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subject ID  504 non-null    object 
 1   MRI ID      504 non-null    object 
 2   Group       504 non-null    object 
 3   Visit       504 non-null    int64  
 4   MR Delay    504 non-null    int64  
 5   Gender      504 non-null    object 
 6   Hand        504 non-null    object 
 7   Age         504 non-null    int64  
 8   EDUC        504 non-null    int64  
 9   SES         480 non-null    float64
 10  MMSE        502 non-null    float64
 11  CDR         504 non-null    float64
 12  eTIV        504 non-null    int64  
 13  nWBV        504 non-null    float64
 14  ASF         504 non-null    float64
dtypes: float64(5), int64(5), object(5)
memory usage: 59.2+ KB


In [4]:
df.isna().sum()

Subject ID     0
MRI ID         0
Group          0
Visit          0
MR Delay       0
Gender         0
Hand           0
Age            0
EDUC           0
SES           24
MMSE           2
CDR            0
eTIV           0
nWBV           0
ASF            0
dtype: int64

In [5]:
df['SES'].value_counts()

2.0    137
3.0    118
1.0    113
4.0    103
5.0      9
Name: SES, dtype: int64

In [6]:
df['SES'] = df['SES'].fillna(3.0)
df['MMSE'].fillna(df.MMSE.mean(),inplace=True)
df.isna().sum().sum()
### There are no missing values in the data.

0

### columns selecting

In [7]:
df.drop(["MRI ID","Visit","Subject ID","MR Delay","Hand"],axis='columns',inplace=True)
df.head()

Unnamed: 0,Group,Gender,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,Demented,M,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,Demented,M,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,Demented,M,75,12,3.0,23.0,0.5,1678,0.736,1.046
3,Demented,M,76,12,3.0,28.0,0.5,1738,0.713,1.01
4,Demented,M,80,12,3.0,22.0,0.5,1698,0.701,1.034


### Since the Group column as a categorical data, we need to encode it to numerical values.

In [8]:
df['Group'] = df['Group'].apply(lambda x: 1 if x == 'Demented' else 0)
df['Gender'] = df['Gender'].apply(lambda x: 1 if x == 'F' else 0)
print(df.dtypes)
df.head()

Group       int64
Gender      int64
Age         int64
EDUC        int64
SES       float64
MMSE      float64
CDR       float64
eTIV        int64
nWBV      float64
ASF       float64
dtype: object


Unnamed: 0,Group,Gender,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,1,0,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,1,0,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,1,0,75,12,3.0,23.0,0.5,1678,0.736,1.046
3,1,0,76,12,3.0,28.0,0.5,1738,0.713,1.01
4,1,0,80,12,3.0,22.0,0.5,1698,0.701,1.034


### creating features and label

In [9]:
x = df[[ 'Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV','ASF',"Gender"]]
Y = df['Group']
x.head()

Unnamed: 0,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF,Gender
0,87,14,2.0,27.0,1987,0.696,0.883,0
1,88,14,2.0,30.0,2004,0.681,0.876,0
2,75,12,3.0,23.0,1678,0.736,1.046,0
3,76,12,3.0,28.0,1738,0.713,1.01,0
4,80,12,3.0,22.0,1698,0.701,1.034,0


###  Standardization

In [11]:
X=(x-np.min(x))/(np.max(x)-np.min(x))

  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


###  covariance matrix and  Selecting Principal Components

In [13]:
from sklearn.decomposition import PCA

for i in range(2,x.shape[1]+1):
    pca=PCA(n_components=i)
    pca.fit(x)
    x_pca=pca.transform(x)
    print("varyans:",i,pca.explained_variance_ratio_)
    print("toplam varyans:",i,sum(pca.explained_variance_ratio_))
    print("-------------------------------------")

varyans: 2 [0.99735355 0.00185167]
toplam varyans: 2 0.9992052255240031
-------------------------------------
varyans: 3 [9.97353552e-01 1.85167320e-03 5.14132853e-04]
toplam varyans: 3 0.9997193583773799
-------------------------------------
varyans: 4 [9.97353552e-01 1.85167320e-03 5.14132853e-04 2.57944487e-04]
toplam varyans: 4 0.9999773028647079
-------------------------------------
varyans: 5 [9.97353552e-01 1.85167320e-03 5.14132853e-04 2.57944487e-04
 1.77683504e-05]
toplam varyans: 5 0.9999950712151496
-------------------------------------
varyans: 6 [9.97353552e-01 1.85167320e-03 5.14132853e-04 2.57944487e-04
 1.77683504e-05 4.89066231e-06]
toplam varyans: 6 0.9999999618774578
-------------------------------------
varyans: 7 [9.97353552e-01 1.85167320e-03 5.14132853e-04 2.57944487e-04
 1.77683504e-05 4.89066231e-06 2.48105135e-08]
toplam varyans: 7 0.9999999866879707
-------------------------------------
varyans: 8 [9.97353552e-01 1.85167320e-03 5.14132853e-04 2.57944487e-04


In [36]:
# Perform PCA to reduce the number of features
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

### splitting data into training and test set

In [37]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split (X_pca,Y,test_size=0.2, random_state=1)

### KNN Classifier

In [40]:
# Create and fit the KNN classifier
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)


### fitting data to model

In [41]:
knn.fit(x_train, y_train)

KNeighborsClassifier()

### Model evaluation

In [39]:
# Evaluate the model on the test set
accuracy = knn.score(x_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.7623762376237624
