# Diabetes Dataset Modeling

## setup

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

Matplotlib is building the font cache; this may take a moment.


# Loading Clean Data

In [2]:
df = pd.read_csv("outputDiab.csv")

In [3]:
df.head()

Unnamed: 0,Outcome,Pregnancies_transformed,Glucose_transformed,BloodPressure_transformed,SkinThickness_transformed,BMI_transformed,DiabetesPedigreeFunction_transformed,Age_transformed
0,1,44.444444,67.096774,49.410379,30.434783,48.049922,48.605578,63.736264
1,0,7.407407,26.451613,41.336819,23.913043,26.209048,24.169987,21.978022
2,1,59.259259,89.677419,38.645632,13.373447,15.912637,52.589641,24.175824
3,0,7.407407,29.032258,41.336819,17.391304,30.889236,7.879593,0.0
4,1,0.0,60.0,6.351391,30.434783,77.691108,17.67656,26.373626


In [4]:
df.shape

(728, 8)

In [5]:
x = df[df.columns.drop('Outcome')]
y = df['Outcome']

# Scaling the Data

In [23]:
from sklearn.preprocessing import MinMaxScaler

In [24]:
scale = MinMaxScaler()
x_scaled = scale.fit_transform(x)
x = x_scaled

# Train-Test Split

In [25]:
from sklearn.model_selection import train_test_split 

In [32]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

# Balancing Data (using oversampling)

In [33]:
from imblearn.over_sampling import RandomOverSampler

In [38]:
ros = RandomOverSampler(random_state = 0)
x_new, y_new = ros.fit_resample(x_train, y_train)
x = x_new; y = y_new

# Modeling (1. DecisionTreeClassifier)

In [41]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
tree_clf = DecisionTreeClassifier(max_depth = 2)
tree_clf.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=2)

### Cross-Val

In [48]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

In [59]:
sum(cross_val_score(tree_clf, x_train, y_train, cv = 5, scoring = "accuracy"))/5
# y_scores = cross_val_predict(tree_clf, x_train, y_train, cv = 3, method="decision_function")

0.71482463896257

# ROC Curve

In [46]:
from sklearn.metrics import roc_curve

In [47]:
def roc_curve_plot(y_train, y_score, label = None) : 
    fpr, tpr, thresholds = roc_curve(y_train, y_score)
    
    plt.plot(fpr, tpr, linewidth = 2,label = label)
    plt.plot([0, 1], [0, 1], 'k--')

# Modeling(2. Logistic Regression)

In [55]:
from sklearn.linear_model import LogisticRegression

In [56]:
log_clf = LogisticRegression()
log_clf.fit(x_train, y_train)

LogisticRegression()

In [67]:
sum(cross_val_score(log_clf, x_train, y_train, cv = 5, scoring = "accuracy"))/5

0.7577365163572061

# Modeling(3. SVM)

In [64]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [78]:
poly_svm_clf = Pipeline([
    ("poly_features", PolynomialFeatures(degree = 3)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC())
])

poly_svm_clf.fit(x_train, y_train)



Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('scaler', StandardScaler()), ('svm_clf', LinearSVC())])

In [82]:
sum(cross_val_score(poly_svm_clf, x_train, y_train, cv = 5, scoring = "accuracy"))/5



0.7268199233716475