task 1: Problem definition

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the diabetes CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

# Summary statistics (count, mean, std, quartiles, ...) for every column
summary = df.describe()
print(summary)

# One histogram per listed column, 20 bins each, on a 10x8 inch figure
hist_columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]
df[hist_columns].hist(bins=20, figsize=(10, 8))
plt.show()

# Pairwise correlation between all columns (inputs and the Outcome target)
corr_matrix = df.corr()
print(corr_matrix)

# Render the correlation matrix as an annotated heatmap
sns.heatmap(data=corr_matrix, annot=True, cmap="coolwarm")
plt.show()

task 2: apply scikit-learn model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Separate the target column from the predictors, then hold out 20% for testing
y = df["Outcome"]
X = df.drop(columns="Outcome")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features; the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit logistic regression on the scaled training split
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Score the held-out split with the four classification metrics
y_pred = lr.predict(X_test)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric as a percentage with two decimals
for template, score in (
    ("Accuracy: {:.2f}%", accuracy),
    ("Precision: {:.2f}%", precision),
    ("Recall: {:.2f}%", recall),
    ("F1 Score: {:.2f}%", f1),
):
    print(template.format(score * 100))

task 3: IMPROVE YOUR MODEL’S PERFORMANCE

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Separate the predictors from the target column
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Split BEFORE resampling. Oversampling the full dataset first leaks
# information: SMOTE builds synthetic rows by interpolating between
# neighbouring minority samples, so synthetic training rows could be derived
# from rows that later end up in the test set, and the test set itself would
# contain synthetic rows. The split arguments match the Task 2 cell, so both
# models are scored on the same untouched hold-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use SMOTE to balance the classes of the training fold only.
# X_res / y_res hold the resampled training data; the test fold keeps its
# original class distribution.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)
X_train, y_train = X_res, y_res

# Preprocess the data: the scaler is fitted on the training fold and the same
# fitted transform is reused for the test fold
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Select the 5 features with the highest ANOVA F-score (f_classif),
# chosen from the training fold only
selector = SelectKBest(f_classif, k=5)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Train a voting classifier ensemble using multiple models
model1 = DecisionTreeClassifier(random_state=42)
model2 = RandomForestClassifier(random_state=42)
model3 = GradientBoostingClassifier(random_state=42)
ensemble = VotingClassifier(estimators=[('dt', model1), ('rf', model2), ('gb', model3)])

ensemble.fit(X_train_selected, y_train)

# Evaluate the performance on the (real, non-resampled) testing set
y_pred = ensemble.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: {:.2f}%".format(accuracy*100))
print("Precision: {:.2f}%".format(precision*100))
print("Recall: {:.2f}%".format(recall*100))
print("F1 Score: {:.2f}%".format(f1*100))

task 4(option 1): IMPLEMENT MODEL USING PYTHON

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the KNN class
class KNN:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    Each test sample is assigned the most frequent label among its ``k``
    closest training samples. When several labels are equally frequent the
    smallest label (in sorted order) wins.
    """

    def __init__(self, k=3):
        """Create the classifier.

        Args:
            k: Number of neighbours that vote on each prediction.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer, got {!r}".format(k))
        self.k = k

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Args:
            X_train: Array-like of shape (n_samples, n_features).
            y_train: Array-like of n_samples labels.

        Raises:
            ValueError: If the number of samples and labels differ.
        """
        # Coerce to plain NumPy arrays so that predict() always indexes by
        # position. A pandas Series would be indexed by *label* instead,
        # which raises or returns the wrong rows once the index is shuffled.
        self.X_train = np.asarray(X_train, dtype=float)
        self.y_train = np.asarray(y_train)
        if len(self.X_train) != len(self.y_train):
            raise ValueError(
                "X_train and y_train have different lengths: {} != {}".format(
                    len(self.X_train), len(self.y_train)
                )
            )

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: Array-like of shape (n_samples, n_features).

        Returns:
            A NumPy array with one predicted label per test row.
        """
        # Iterating a DataFrame yields column names, not rows, so convert
        # before looping.
        X_test = np.asarray(X_test, dtype=float)

        y_pred = []
        for x in X_test:
            # Euclidean distance between the test point and each training point
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))

            # Positions of the k closest training points
            nearest_neighbors = np.argsort(distances)[:self.k]

            # Majority vote. np.unique works for any label type (strings,
            # floats, negative ints), unlike np.bincount; its sorted output
            # plus argmax keeps the "smallest label wins" tie-break.
            labels = self.y_train[nearest_neighbors]
            values, counts = np.unique(labels, return_counts=True)
            y_pred.append(values[counts.argmax()])

        return np.array(y_pred)

# Extract the target and the predictors as NumPy arrays, then hold out 20%
y = df["Outcome"].values
X = df.drop(columns="Outcome").values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standard scaling: fit on the training split, apply to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Hand-written KNN with three neighbours: store the training data, then
# predict the held-out split
knn = KNN(k=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Same four metrics as in Task 2, computed for the KNN predictions
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for template, score in (
    ("KNN - Accuracy: {:.2f}%", accuracy),
    ("KNN - Precision: {:.2f}%", precision),
    ("KNN - Recall: {:.2f}%", recall),
    ("KNN - F1 Score: {:.2f}%", f1),
):
    print(template.format(score * 100))

# Baseline for comparison: scikit-learn logistic regression trained and
# scored on exactly the same scaled splits
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

accuracy_lr, precision_lr, recall_lr, f1_lr = (
    metric(y_test, y_pred_lr)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for template, score in (
    ("Logistic Regression (scikit-learn) - Accuracy: {:.2f}%", accuracy_lr),
    ("Logistic Regression (scikit-learn) - Precision: {:.2f}%", precision_lr),
    ("Logistic Regression (scikit-learn) - Recall: {:.2f}%", recall_lr),
    ("Logistic Regression (scikit-learn) - F1 Score: {:.2f}%", f1_lr),
):
    print(template.format(score * 100))