In [1]:
import numpy as np 
import pandas as pd

In [None]:
diabetes_df = pd.read_csv("diabetes.csv")
diabetes_df.head()

In [None]:
from sklearn.linear_model import Ridge, LinearRegression, Lasso, LogisticRegression
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, root_mean_squared_error, roc_curve, roc_auc_score

X = diabetes_df.drop(["Outcome"], axis=1).values
y = diabetes_df['Outcome'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=69)

X_train.shape, y_train.shape, X_test.shape, y_test.shape


In [None]:
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
r_squard = reg.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
print(r_squard)
print("mse: " + str(mse) + "\nrmse: " + str(rmse))

In [None]:
alphas = [0.0001, 0.1, 1, 10, 100]
scores = []
for alpha in alphas:
    # Ridge is used to prevent overfitting 
    reg = Ridge(alpha = alpha) # shrinks coefficients towards zero while keeping all features
    reg.fit(X_train, y_train)
    score = reg.score(X_test, y_test)
    scores.append(score)
print(scores)

In [None]:
#Lasso is also used for overfitting
reg = Lasso(alpha=.2)  # Shrinking coefficients to 0, removing irrelavant features (diff from Ridge)
reg.fit(X_train, y_train)
socre = reg.score(X_test, y_test)
print(score)

In [7]:
reg = LogisticRegression(max_iter=200) # is used for classification (yes/no), linear is used to actually model
reg.fit(X_train,y_train)
y_pred_prob = reg.predict_proba(X_test)[:, 1]

In [13]:
kf = KFold(n_splits=6, shuffle=True, random_state=69)
cv_scores = cross_val_score(reg, X, y, cv=kf) # corss value score is when there are multiple r_squared calcualted to reduce bias on data split
print("Corss Values Scores: " + str(cv_scores))
print("Mean: " + str(np.mean(cv_scores)))
print("Std: " + str(np.std(cv_scores)))
print("CI: " + str(np.quantile(cv_scores, [.025, .975])))

Corss Values Scores: [0.8046875 0.7265625 0.7421875 0.7890625 0.828125  0.75     ]
Mean: 0.7734375
Std: 0.03636520844731495
CI: [0.72851562 0.82519531]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot([0, 1], [0, 1], "k--")
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()  


In [None]:
auc = roc_auc_score(y_test, y_pred_prob)
print(auc) #niceee, pretty high i think 