1. Import


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

2. Dataset Preparation


In [None]:
# Load the diabetes dataset from the working directory into a DataFrame.
df = pd.read_csv("diabetes.csv")

# Preview the first five rows (the default for head(), spelled out here).
print(df.head(n=5))

3. Exploratory Data Analysis

In [None]:
# Re-load the data so this cell can be run on its own.
df = pd.read_csv("diabetes.csv")

# Summary statistics as produced by DataFrame.describe().
print("Descriptive Statistics:")
print(df.describe())

# Histogram of the 'Glucose' column with a KDE overlay.
plt.figure(figsize=(12, 8))
glucose_ax = sns.histplot(
    data=df,
    x='Glucose',
    bins=20,
    kde=True,
    color='blue',
    edgecolor='black',
)
glucose_ax.set_title('Distribution of Glucose Levels')
glucose_ax.set_xlabel('Glucose Level')
glucose_ax.set_ylabel('Frequency')
plt.show()

# Scatter of 'Age' against 'BMI', coloured by the 'Outcome' column.
plt.figure(figsize=(12, 8))
age_bmi_ax = sns.scatterplot(
    data=df,
    x='Age',
    y='BMI',
    hue='Outcome',
    palette='coolwarm',
)
age_bmi_ax.set_title('Age vs BMI with Outcome')
age_bmi_ax.set_xlabel('Age')
age_bmi_ax.set_ylabel('BMI')
age_bmi_ax.legend(title='Outcome', loc='upper right')
plt.show()

4. Linear Regression Model

In [None]:
# Fit a linear regression on every column except 'Outcome' and score its
# predictions of 'Outcome' with R-squared, MSE and RMSE.
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

y_pred = linear_model.predict(X_test)
r_squared = r2_score(y_test, y_pred)

mse = mean_squared_error(y_test, y_pred)
# Derive RMSE from the MSE directly rather than passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# old call raises TypeError on current releases. The square root of the MSE
# is the same quantity on every version.
rmse = np.sqrt(mse)

print("Linear Regression Model Evaluation:")
print("R-squared:", r_squared)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)


In [None]:
# Compare the held-out targets with the model's predictions.
plt.scatter(y_test, y_pred, color='blue', label='Actual vs Predicted')

# Reference diagonal (predicted == actual) spanning the range of the actual values.
lowest_actual, highest_actual = min(y_test), max(y_test)
diagonal = [lowest_actual, highest_actual]
plt.plot(diagonal, diagonal, color='red', label='Ideal Line')

plt.title('Actual vs Predicted Values in Linear Regression Model')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()

plt.show()

5. Logistic Regression Model

In [None]:
# Fit a logistic regression on every column except 'Outcome' and evaluate it
# as a classifier: accuracy, precision, recall, F1 and ROC AUC.
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Same 80/20 split and seed as the linear-regression cell, so both models are
# scored on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Hard class labels drive the threshold-dependent metrics below.
y_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC/AUC must be computed from a continuous score, not from the hard labels:
# feeding 0/1 predictions to roc_curve leaves a single operating point and
# the resulting "AUC" is not the area under the model's ROC curve.
# Column 1 of predict_proba is the probability of the second entry of
# `classes_` (assumes 'Outcome' is coded 0/1 so that this is the positive
# class - confirm against the dataset).
y_score = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
auc_score = auc(fpr, tpr)

print("Logistic Regression Model Evaluation:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc_score)

In [None]:
# Plot the ROC curve of the fitted logistic model on the held-out rows.
# The curve is built from predicted probabilities rather than hard 0/1
# labels: with labels there is only one threshold, so the "curve" degenerates
# to two straight segments and the reported area is misleading.
# Column 1 of predict_proba is the probability of the second entry of
# `classes_` (assumes 'Outcome' is coded 0/1 - confirm against the dataset).
roc_scores = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, roc_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal: the line along which true and false positive rates are equal.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
# Small headroom above 1.0 so the top of the curve is not clipped by the frame.
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


6. Model Comparison and Selection

In [None]:
# Side-by-side summary of both models.
# The values are taken from the variables computed in the linear- and
# logistic-regression cells instead of being pasted in as literals, so the
# summary cannot drift out of sync when the data, split or library version
# changes. Those cells must have been run before this one.
Linear_R_squared = r_squared
Linear_MSE = mse
# RMSE is derived here as the square root of the MSE, so this cell only
# relies on `mse` from the linear-regression cell.
Linear_RMSE = np.sqrt(mse)

Logistic_Accuracy = accuracy
Logistic_Precision = precision
Logistic_Recall = recall
Logistic_F1_Score = f1
Logistic_AUC = auc_score

print("Model Comparison:")
print("Linear Regression:")
print(f"R-squared: {Linear_R_squared}")
print(f"Mean Squared Error (MSE): {Linear_MSE}")
print(f"Root Mean Squared Error (RMSE): {Linear_RMSE}")
print("\nLogistic Regression:")
print(f"Accuracy: {Logistic_Accuracy}")
print(f"Precision: {Logistic_Precision}")
print(f"Recall: {Logistic_Recall}")
print(f"F1 Score: {Logistic_F1_Score}")
print(f"AUC: {Logistic_AUC}")


7. Conclusion and Insights

In [10]:
# Print the written conclusions of the case study in three sections:
# key findings, potential applications, and a short reflection.
print("Key Findings:")
print("- The Linear Regression model provided limited explanation power for the variance in the data, as evidenced by the low R-squared value.")
print("- In contrast, the Logistic Regression model demonstrated relatively good performance in predicting binary outcomes, with decent accuracy, precision, recall, F1 score, and AUC metrics.")
print("- The comparison between the two models suggests that the Logistic Regression model is more suitable for this particular prediction problem.")

print("\nPotential Applications:")
print("- Logistic Regression models can be applied in various fields such as healthcare for predicting disease outcomes, finance for credit risk assessment, and marketing for customer churn prediction.")
print("- Understanding the assumptions and limitations of these models is crucial for their effective application in real-world scenarios.")
print("- These models provide valuable insights that can inform decision-making processes and aid in the development of data-driven strategies.")

print("\nReflection:")
# Wording fix: the sentence previously read "the purposed of utilizing".
print("- The case study highlights the purpose of utilizing statistical and machine learning methods to analyze and interpret data.")
print("- It emphasizes the need to critically evaluate model performance and consider the suitability of different algorithms for specific prediction tasks.")
print("- By understanding the underlying assumptions and limitations of models, practitioners can make informed decisions and derive actionable insights from data.")


Key Findings:
- The Linear Regression model provided limited explanation power for the variance in the data, as evidenced by the low R-squared value.
- In contrast, the Logistic Regression model demonstrated relatively good performance in predicting binary outcomes, with decent accuracy, precision, recall, F1 score, and AUC metrics.
- The comparison between the two models suggests that the Logistic Regression model is more suitable for this particular prediction problem.

Potential Applications:
- Logistic Regression models can be applied in various fields such as healthcare for predicting disease outcomes, finance for credit risk assessment, and marketing for customer churn prediction.
- Understanding the assumptions and limitations of these models is crucial for their effective application in real-world scenarios.
- These models provide valuable insights that can inform decision-making processes and aid in the development of data-driven strategies.

Reflection:
- The case study highl

8. References

In [11]:
# Reference list for the case study, grouped by section. Each entry pairs a
# heading (later headings carry a leading newline to leave a blank line
# between sections) with the bullet lines printed beneath it.
reference_sections = (
    (
        "Datasets:",
        (
            "- Diabetes dataset: [https://www.kaggle.com/datasets/shantanudhakadd/diabetes-dataset-for-beginners]",
        ),
    ),
    (
        "\nLibraries:",
        (
            "- pandas: https://pandas.pydata.org/",
            "- NumPy: https://numpy.org/",
            "- scikit-learn: https://scikit-learn.org/",
            "- matplotlib: https://matplotlib.org/",
        ),
    ),
    (
        "\nAdditional Resources:",
        (
            "- Lecture materials on Linear Regression and Logistic Regression",
            "- Online documentation and tutorials for implementing machine learning models",
            "- Research papers and articles related to diabetes prediction and classification",
        ),
    ),
)

for section_heading, section_entries in reference_sections:
    print(section_heading)
    for reference_line in section_entries:
        print(reference_line)


Datasets:
- Diabetes dataset: [https://www.kaggle.com/datasets/shantanudhakadd/diabetes-dataset-for-beginners]

Libraries:
- pandas: https://pandas.pydata.org/
- NumPy: https://numpy.org/
- scikit-learn: https://scikit-learn.org/
- matplotlib: https://matplotlib.org/

Additional Resources:
- Lecture materials on Linear Regression and Logistic Regression
- Online documentation and tutorials for implementing machine learning models
- Research papers and articles related to diabetes prediction and classification
