In [82]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import matplotlib as plt

In [83]:
diabetes_dataset= pd.read_csv(r"D:\PythonProject\AICTE_Prediction_Practical\Datasets\diabetes.csv")

In [84]:
diabetes_dataset['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [85]:
X= diabetes_dataset.drop(columns=['Outcome'], axis=1)
Y= diabetes_dataset['Outcome']

In [86]:
X_train, X_test, Y_train, Y_test= train_test_split(X, Y, test_size= 0.2, random_state= 42) 

In [87]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [88]:
scaler.fit(X_train)

In [89]:
X_train = scaler.transform(X_train)

X_test = scaler.transform(X_test)

In [90]:
print(X_train)

[[-0.52639686 -1.15139792 -3.75268255 ... -4.13525578 -0.49073479
  -1.03594038]
 [ 1.58804586 -0.27664283  0.68034485 ... -0.48916881  2.41502991
   1.48710085]
 [-0.82846011  0.56687102 -1.2658623  ... -0.42452187  0.54916055
  -0.94893896]
 ...
 [ 1.8901091  -0.62029661  0.89659009 ...  1.76054443  1.981245
   0.44308379]
 [-1.13052335  0.62935353 -3.75268255 ...  1.34680407 -0.78487662
  -0.33992901]
 [-1.13052335  0.12949347  1.43720319 ... -1.22614383 -0.61552223
  -1.03594038]]


In [91]:
# from imblearn.under_sampling import RandomUnderSampler

# # Applying Random Over Sampling
# rus = RandomUnderSampler(random_state=42)
# X_resampled_train, y_resampled_train = rus.fit_resample(X_train, Y_train)

In [92]:
from imblearn.under_sampling import TomekLinks

tomek = TomekLinks(sampling_strategy='majority')  # Removes the majority class samples that are near the minority class
X_resampled_train, y_resampled_train = tomek.fit_resample(X_train, Y_train)

In [93]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=42)
# X_resampled_train, y_resampled_train = smote.fit_resample(X_train, Y_train)

In [94]:
X_resampled_train.shape

(579, 8)

In [95]:
pd.Series(y_resampled_train).value_counts()

Outcome
0    366
1    213
Name: count, dtype: int64

In [96]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [97]:
model1= svm.SVC(kernel= 'linear')
model2 = LogisticRegression()
model3 = DecisionTreeClassifier()
model4= RandomForestClassifier(max_features=5)

In [98]:
model1.fit(X_resampled_train, y_resampled_train)
model2.fit(X_resampled_train, y_resampled_train)
model3.fit(X_resampled_train, y_resampled_train)
model4.fit(X_resampled_train, y_resampled_train)

In [99]:
y_pred1= model1.predict(X_test)
y_pred2 = model2.predict(X_test)
y_pred3 = model3.predict(X_test)
y_pred4 = model4.predict(X_test)

In [100]:
print("Accuracy of Support Vector Machine",accuracy_score(Y_test, y_pred1))
print("Accuracy of Logistic Regression",accuracy_score(Y_test,y_pred2))
print("Accuracy of Decision Trees",accuracy_score(Y_test,y_pred3))
print("Accuracy of Random Forest",accuracy_score(Y_test,y_pred4))

Accuracy of Support Vector Machine 0.7272727272727273
Accuracy of Logistic Regression 0.7402597402597403
Accuracy of Decision Trees 0.7272727272727273
Accuracy of Random Forest 0.7532467532467533


In [101]:
from sklearn.metrics import confusion_matrix
print("Confusion Matrix of Support Vector Machine")
confusion_matrix(Y_test,y_pred1)

Confusion Matrix of Support Vector Machine


array([[75, 24],
       [18, 37]])

In [102]:
from sklearn.metrics import classification_report
print("Support Vector Machine\n",classification_report(Y_test,y_pred1))

Support Vector Machine
               precision    recall  f1-score   support

           0       0.81      0.76      0.78        99
           1       0.61      0.67      0.64        55

    accuracy                           0.73       154
   macro avg       0.71      0.72      0.71       154
weighted avg       0.74      0.73      0.73       154



In [103]:
print("Confusion Matrix of Logistic regression")
confusion_matrix(Y_test,y_pred2)

Confusion Matrix of Logistic regression


array([[76, 23],
       [17, 38]])

In [104]:
print("Logistic Recgression\n",classification_report(Y_test,y_pred2))

Logistic Recgression
               precision    recall  f1-score   support

           0       0.82      0.77      0.79        99
           1       0.62      0.69      0.66        55

    accuracy                           0.74       154
   macro avg       0.72      0.73      0.72       154
weighted avg       0.75      0.74      0.74       154



In [105]:
print("Confusion Matrix of Decesion Tree")
confusion_matrix(Y_test,y_pred3)

Confusion Matrix of Decesion Tree


array([[75, 24],
       [18, 37]])

In [106]:
print("Decesion Tree\n",classification_report(Y_test,y_pred3))

Decesion Tree
               precision    recall  f1-score   support

           0       0.81      0.76      0.78        99
           1       0.61      0.67      0.64        55

    accuracy                           0.73       154
   macro avg       0.71      0.72      0.71       154
weighted avg       0.74      0.73      0.73       154



In [107]:
print("Confusion Matrix of Random Forest")
confusion_matrix(Y_test,y_pred4)

Confusion Matrix of Random Forest


array([[76, 23],
       [15, 40]])

In [108]:
print("Random Forest\n",classification_report(Y_test,y_pred4))

Random Forest
               precision    recall  f1-score   support

           0       0.84      0.77      0.80        99
           1       0.63      0.73      0.68        55

    accuracy                           0.75       154
   macro avg       0.74      0.75      0.74       154
weighted avg       0.76      0.75      0.76       154

