In [1]:

import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, fbeta_score
from sklearn.model_selection import train_test_split


In [2]:

# Read the semicolon-separated dataset and one-hot encode any categorical
# columns in a single step (numeric columns pass through unchanged).
data_path = '../data/diabetes_data.csv'
df = pd.get_dummies(pd.read_csv(data_path, sep=';'))

# Target is the 'class' column; every remaining column is a feature.
y = df['class']
X = df.drop(columns='class')


In [3]:
# Preview the first five rows of the encoded frame (head() defaults to n=5).
df.head()

Unnamed: 0,age,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class,gender_Female,gender_Male
0,40,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1,False,True
1,58,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,False,True
2,41,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1,False,True
3,45,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1,False,True
4,60,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,False,True


In [4]:

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [5]:

# Hyperparameters for the Random Forest, collected in one place so they are
# easy to inspect and tune.
params = dict(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=1,  # fixed seed for reproducible training
)

# Build the classifier from the parameter dict and fit it on the training
# split. The fit call is left as the cell's last expression so the fitted
# estimator is displayed.
clf = RandomForestClassifier(**params)
clf.fit(X_train, y_train)


In [6]:

# Score the fitted classifier on the held-out test split.
predictions = clf.predict(X_test)

# Plain accuracy plus an F-beta score; beta=0.5 weights precision more
# heavily than recall.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
fscore = fbeta_score(y_true=y_test, y_pred=predictions, beta=0.5)

(accuracy, fscore)


(0.9711538461538461, 0.9730538922155688)

In [7]:
# Persist the trained classifier to disk with joblib.
model_path = '../models/random_forest_model.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout without ../models
# fails here with FileNotFoundError. exist_ok=True keeps re-runs harmless.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Left as the last expression so the cell still displays the written path(s).
joblib.dump(clf, model_path)

['../models/random_forest_model.pkl']

: 