# PCA With Diabetes Dataset

In [None]:
# Imports packages
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [None]:
# Remote location of the diabetes CSV file
DATA_URL = "https://raw.githubusercontent.com/mpHarm88/datasets/master/diabetes.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(DATA_URL)

# Preview the first five rows
df.head()

## Variables Definitions:
- Pregnancies: Number of times pregnant
- Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
- BloodPressure: Diastolic blood pressure (mm Hg)
- SkinThickness: Triceps skin fold thickness (mm)
- Insulin: 2-Hour serum insulin (mu U/ml)
- BMI: Body mass index (weight in kg/(height in m)^2)
- DiabetesPedigreeFunction: Diabetes pedigree function
- Age: Age (years)
- Outcome: Class variable (0 = non-diabetic, 1 = diabetic)

## Split Data

In [None]:
# Columns used as model inputs
feature_cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Feature matrix and target vector
X = df.loc[:, feature_cols]
y = df['Outcome']

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

## EDA

In [None]:
# Summary statistics of the full dataset `df` (features and Outcome), not only the training split
df.describe()

In [None]:
# Column dtypes and non-null counts of the full dataset `df` (not only the training split)
df.info()
# (rows, columns) of `df`; displayed as the cell's output
df.shape

In [None]:
# Number of missing (null) values in each column of the full dataset `df`
df.isnull().sum()

In [None]:
# Class balance of the target as a pie chart.
# The slice labels are looked up from the class value of each slice instead of
# being hard-coded in count order, so a label can never end up on the wrong
# class if the majority class changes.
# Outcome encoding used by this notebook: 0 = non-diabetic, 1 = diabetic.
outcome_names = {0: 'Non-Diabetic', 1: 'Diabetic'}
outcome_counts = df['Outcome'].value_counts(sort=True)

plt.figure(figsize=(7, 7))
plt.pie(outcome_counts,
        # fall back to the raw value for any class without a display name
        labels=[outcome_names.get(value, value) for value in outcome_counts.index],
        autopct='%1.1f%%',
        textprops={'fontsize': 14})

plt.title('Non-Diabetic vs Diabetic', fontsize=14)
plt.show()

In [None]:
# KDE of Age, one curve per diabetes status.
# The hue is mapped to explicit "No"/"Yes" values and seaborn builds the legend
# itself. Relabelling the curves afterwards with plt.legend(labels=[...]) depends
# on the order in which seaborn happens to draw them and can swap the two labels.
age_plot_df = df.assign(**{"diabetic ?": df["Outcome"].map({0: "No", 1: "Yes"})})
sns.displot(age_plot_df, x="Age", hue="diabetic ?", hue_order=["No", "Yes"], kind="kde")
plt.title("Distribution of Ages, by Diabetes");

We can see from the plot that the majority of patients with diabetes are between 10 and 35 years old.

In [None]:
# KDE of the number of pregnancies, one curve per diabetes status.
# The hue is mapped to explicit "No"/"Yes" values and seaborn builds the legend
# itself. Relabelling the curves afterwards with plt.legend(labels=[...]) depends
# on the order in which seaborn happens to draw them and can swap the two labels.
pregnancies_plot_df = df.assign(**{"diabetic ?": df["Outcome"].map({0: "No", 1: "Yes"})})
sns.displot(pregnancies_plot_df, x="Pregnancies", hue="diabetic ?",
            hue_order=["No", "Yes"], kind="kde")
# The title names the variable actually plotted (it previously said "Ages")
plt.title("Distribution of Pregnancies, by Diabetes");

We can see from the graph that the majority of patients with diabetes have had between 0 and 9 pregnancies.

In [None]:
# Pairwise scatter plots of all columns in the full dataset `df`;
# corner=True draws only the lower triangle of the grid
sns.pairplot(df, corner=True);

In [None]:
# One histogram per column of the full dataset `df`
df.hist(figsize = (15, 15),alpha=0.7, rwidth=0.85);

In [None]:
# Correlation heat-map of all columns in the full dataset `df`, showing the
# lower triangle only.
corr = df.corr()  # computed once and reused for both the values and the mask

# Explicit boolean mask covering the upper triangle and the diagonal. Passing
# np.triu(corr) directly would use the correlation values themselves as the
# mask, so a coefficient of exactly 0 in the upper triangle would not be hidden.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, mask=upper_mask);

There is a moderate relationship between Age and Pregnancies, between Insulin and SkinThickness, and between Insulin and Outcome; the remaining pairs of variables have only weak relationships.

There is a relationship between age and the number of pregnancies, and this makes sense: the higher the age, the higher the number of pregnancies.

The following figure supports our conclusion here.

In [None]:
# Bar chart of Pregnancies for each Age value (seaborn's default aggregate per bar)
fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(data=df, x='Age', y='Pregnancies', ax=ax)
# Tilt the age tick labels so they do not overlap
plt.xticks(rotation=60)
ax.set_title(" How many times do you get pregnant At Every Age");

In [None]:
# Number of rows (patients) recorded at each Age value
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(data=df, x="Age", ax=ax)
ax.set_title(" The Total Number Of People At Every Age");

In this dataset, we find that the majority of patients are between 21 and 29 years old

In [None]:
# Filled KDE curve of BMI over the full dataset `df`
bmi_grid = sns.displot(data=df, x="BMI", kind="kde", fill=True)
# The grid has a single Axes because no faceting variables are used
bmi_grid.ax.set_title("Distribution of BMI");

In these data we find that the majority of patients have a BMI between 20 and 50.

In [None]:
# Reshape to long form (one row per column-name/value pair) so that every
# column of `df` gets its own box on a shared axis
df_long = df.melt()

fig, ax = plt.subplots(figsize=(12, 10))
sns.boxplot(data=df_long, x="variable", y="value", ax=ax)
ax.set_title("Boxplot For All Columns");

## Baseline Model

In [None]:
# Baseline (majority class): proportion of each class in the training target.
# The largest proportion is the accuracy of always predicting the majority class.
y_train.value_counts(normalize=True)

## Scale Data

In [None]:
# Learn the scaling parameters from the training split only
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

In [None]:
# Wrap the already-scaled training array in a DataFrame for inspection.
# Reuses X_train_sc and feature_cols rather than refitting the scaler and
# repeating the column list, so this cell cannot drift from the data and
# columns actually used for modelling.
df_Xtrain_scale = pd.DataFrame(X_train_sc, columns=feature_cols)
df_Xtrain_scale.describe()

In [None]:
# Box plot of every scaled training feature on a shared axis
scaled_long = df_Xtrain_scale.melt()

fig, ax = plt.subplots(figsize=(12, 10))
sns.boxplot(data=scaled_long, x="variable", y="value", ax=ax)
ax.set_title("Boxplot For All Columns After Applying StandardScaler");

## Logistic Regression Model

In [None]:
# Logistic regression classifier.
# The default solver stops after max_iter=100 iterations, which is often too
# few when the model is fit on raw, unscaled features as it is below. The cap
# is raised so the optimizer can converge instead of stopping early.
lr = LogisticRegression(max_iter=1000)

In [None]:
# Train on the raw (unscaled) training features.
# NOTE(review): the scaled X_train_sc is not used here — confirm this is intentional.
lr.fit(X_train, y_train)

In [None]:
# Class labels known to the fitted model
lr.classes_

In [None]:
# Intercept (bias) term of the fitted model
lr.intercept_

In [None]:
# Coefficients of the fitted model (one per input feature)
lr.coef_

In [None]:
# Confusion matrix for the training split.
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so the matrix is computed with `confusion_matrix` and drawn as an annotated
# heat-map instead (rows = true class, columns = predicted class).
train_cm = confusion_matrix(y_train, lr.predict(X_train), labels=lr.classes_)

fig, ax = plt.subplots()
sns.heatmap(train_cm, annot=True, fmt="d", cmap="viridis",
            xticklabels=lr.classes_, yticklabels=lr.classes_, ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label");

### Evaluate Model

In [None]:
# Mean accuracy of the fitted model on each split
train_accuracy = lr.score(X_train, y_train)
test_accuracy = lr.score(X_test, y_test)

print(f"Training accuracy: {train_accuracy}")
print(f"Testing accuracy: {test_accuracy}")

In [None]:
# Per-class precision / recall / F1 of the model on the training split
train_predictions = lr.predict(X_train)
print(classification_report(y_train, train_predictions))

### Compare Logistic Regression Model Results to Baseline Model
From the report above we find that the **Logistic Regression Model** performed better than the **Baseline Model**.

## Perform PCA

In [None]:
# Fit a PCA with no n_components limit on the scaled training features,
# so the explained variance of every component can be inspected below
pca = PCA()
pca.fit(X_train_sc)

In [None]:
# Scree plot: bars show the share of variance explained by each principal
# component; the line shows the running (cumulative) total.

# The "seaborn" style sheet was renamed to "seaborn-v0_8" in matplotlib 3.6 and
# the old name was removed later; use whichever name this matplotlib provides.
scree_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(scree_style)
plt.figure(figsize=(15, 8))

exp_var = pd.Series(pca.explained_variance_ratio_)
exp_var.plot(kind="bar", alpha=0.7)  # variance ratio per component

# Cumulative explained variance after each additional component
exp_var.cumsum().plot(marker="o", alpha=0.7)

plt.xlabel("Principal Components", fontsize="x-large")
plt.ylabel("Percentage Variance Explained", fontsize="x-large")
plt.title("Diabetes Scree Plot", fontsize="xx-large")
plt.show()

In [None]:
# Reduce the scaled features to 4 principal components.
# The PCA is fitted on the training split only and then applied to the test split.
pca = PCA(n_components = 4)
X_train_pca = pca.fit_transform(X_train_sc)
X_test_pca = pca.transform(X_test_sc)

# NOTE: this re-fits the same `lr` instance on the PCA components, replacing the
# model trained earlier on the raw features; `lr` now expects 4 input columns.
lr.fit(X_train_pca, y_train)

### Evaluate Model

In [None]:
# Mean accuracy of the model trained on the PCA components, on each split
train_accuracy_pca = lr.score(X_train_pca, y_train)
test_accuracy_pca = lr.score(X_test_pca, y_test)

print(f"Training accuracy: {train_accuracy_pca}")
print(f"Testing accuracy: {test_accuracy_pca}")

As we can see, the score decreased when we applied **PCA** before the **Logistic Regression Model**. We tested every value of `n_components`, not just 4, but all of the results were lower than those of the **Logistic Regression Model** without PCA.

Compared with the **Baseline**, however, the model trained on the **PCA** components performs better.

In conclusion, the best performance was for the **Logistic Regression Model**, then the **PCA**, then the **Baseline**.