In [1]:
# Install any necessary libraries for mySQL connection or model training

import sys
#!{sys.executable} -m pip install mysql-connector-python

# Install sqlalchemy for pandas to sql conversion
#!{sys.executable} -m pip install sqlalchemy

# Install imbalanced learning library
#!{sys.executable} -m pip install imbalanced-learn

# Install xGBoost library
#!{sys.executable} -m pip install xgboost

In [2]:
# Checking install version

#!{sys.executable} -m pip show mysql-connector-python

In [3]:
# Importing libraries

import mysql.connector
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt
from sklearn.metrics import mean_absolute_error
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
import pickle

In [4]:
# Connect to mySQL server

# On local device
config = {
    'user': 'root',
    'password': '1qazO1r2a3c4l5e6xsw2123!',
    'host': 'localhost',
    'database': 'ra_classification'
}
# External device
#config = {
#    'user:': 'Alison',
#    'password': 'X5T?2M-Zf>7cCYtaE.fP'
#    'host': '192.168.2.35',
#    'port': 3306
#    'database': 'ra_classification'
#}

# Establish the connection
mydb = mysql.connector.connect(**config)

# Checking connection
print(mydb)

<mysql.connector.connection_cext.CMySQLConnection object at 0x000001ED771DEC70>


In [5]:
# Function to fetch database data

def getData(tableName):
    
    """
    Fetches data from database table tableName.
    
    Parameters:
    - tableName (string): name of the database table to read.
    """
    
    # Create a cursor to execute queries
    mycursor = mydb.cursor()
    
    # Select table
    query = f"SELECT * FROM {tableName}"
    mycursor.execute(query)
    
    # Get data
    result = mycursor.fetchall()
    
    # Get column names
    columnNames = [desc[0] for desc in mycursor.description]
    
    # Create pandas DataFrame for handling
    df = pd.DataFrame(result, columns=columnNames)
    
    # Close the cursor
    mycursor.close()
    
    return df

In [6]:
# Function to normalize data

from sklearn.preprocessing import MinMaxScaler

def normalizeData(pd):
    """
    Normalizes input data to ensure features have similar scales.
    
    Parameters:
    - pd (pandas DataFrame): DataFrame containing input data
    """
    
    # MinMaxScaler object
    minMaxScaler = MinMaxScaler()
    
    pd['Age'] = minMaxScaler.fit_transform(pd[['Age']])

In [7]:
# Training with Linear SVC

from sklearn.svm import SVC

def evaluateSVC(pd):
    """
    Evaluates input data with linear SVC model using oversampling
    
    Parameters:
    - pd (pandas DataFrame): DataFrame containing input data
    
    Returns:
    - mean_score (float): Mean absolute error computed from cross-validation
    - mean_F1 (float): Mean F1 scored computed from cross-validation
    - model (RandomForestClassifier): Trained Random Forest model
    """
    
    X = pd[['Age', 'Sex', 'Smoking_Status']]
    y = pd['RA_Status']
    
    # Initialize SMOTE for oversampling
    smote = SMOTE(random_state=42)
    
    # Initialize Leave-One-Out Cross-Validation
    cv = LeaveOneOut()
    
    # Initialize an empty list to store mean error scores
    scores = []
    
    # Initialize an empty list to store F1 scores
    f1_scores = []
    
    # Loop through each fold of Leave-One-Out Cross-Validation
    for train_index, test_index in cv.split(X, y):
        # Split data into training and testing sets for this fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Perform oversampling on the training data using SMOTE
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
        
        # Build linear SVC model
        model = SVC(kernel='linear', gamma=0.1, C=2)
        
        # Train the model on the oversampled training data
        model.fit(X_train_oversampled, y_train_oversampled)
        
        # Calculate F1 score for SVC model
        f1_scores.append(f1_score(y_test, model.predict(X_test)))
        
        # Evaluate the model on the test data
        y_pred = model.predict(X_test)
        
        # Calculate mean absolute error for this fold
        mae = mean_absolute_error(y_test, y_pred)
        
        # Append the score to the list of scores
        scores.append(mae)
    
    # Compute the mean of all scores
    mean_score = sum(scores) / len(scores)
    mean_F1 = np.mean(f1_scores)
    
    return mean_score, mean_F1, model

In [8]:
# Training with Random Forest

from sklearn.ensemble import RandomForestClassifier

def evaluateRandomForest(pd):
    """
    Evaluates input data with Random Forest model using oversampling
    
    Parameters:
    - pd (pandas DataFrame): DataFrame containing input data
    
    Returns:
    - mean_score (float): Mean absolute error computed from cross-validation
    - mean_F1 (float): Mean F1 scored computed from cross-validation
    - model (RandomForestClassifier): Trained Random Forest model
    """
    
    X = pd[['Age', 'Sex', 'Smoking_Status']]
    y = pd['RA_Status']
    
    # Initialize SMOTE for oversampling
    smote = SMOTE(random_state=42)
    
    # Initialize Leave-One-Out Cross-Validation
    cv = LeaveOneOut()
    
    # Initialize an empty list to store scores
    scores = []
    
    # Initialize an empty list to store F1 scores
    f1_scores = []
    
    # Loop through each fold of Leave-One-Out Cross-Validation
    for train_index, test_index in cv.split(X, y):
        # Split data into training and testing sets for this fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Perform oversampling on the training data using SMOTE
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
        
        # Build Random Forest model
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        
        # Train the model on the oversampled training data
        model.fit(X_train_oversampled, y_train_oversampled)
        
        # Calculate F1 score for Random Forest model
        f1_scores.append(f1_score(y_test, model.predict(X_test)))
        
        # Evaluate the model on the test data
        y_pred = model.predict(X_test)
        
        # Calculate mean absolute error for this fold
        mae = mean_absolute_error(y_test, y_pred)
        
        # Append the score to the list of scores
        scores.append(mae)
    
    # Compute the mean of all scores
    mean_score = sum(scores) / len(scores)
    mean_F1 = np.mean(f1_scores)
    
    return mean_score, mean_F1, model

In [9]:
# Training with Decision Tree

from sklearn.tree import DecisionTreeClassifier

def evaluateDecisionTree(pd):
    """
    Evaluates input data with Decision Tree model using oversampling
    
    Parameters:
    - pd (pandas DataFrame): DataFrame containing input data
    
    Returns:
    - mean_score (float): Mean absolute error computed from cross-validation
    - mean_F1 (float): Mean F1 scored computed from cross-validation
    - model (DecisionTreeClassifier): Trained Decision Tree model
    """
    
    X = pd[['Age', 'Sex', 'Smoking_Status']]
    y = pd['RA_Status']
    
    # Initialize SMOTE for oversampling
    smote = SMOTE(random_state=42)
    
    # Initialize Leave-One-Out Cross-Validation
    cv = LeaveOneOut()
    
    # Initialize an empty list to store scores
    scores = []
    
    # Initialize an empty list to store F1 scores
    f1_scores = []
    
    # Loop through each fold of Leave-One-Out Cross-Validation
    for train_index, test_index in cv.split(X, y):
        # Split data into training and testing sets for this fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Perform oversampling on the training data using SMOTE
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
        
        # Build Decision Tree model
        model = DecisionTreeClassifier(random_state=42)
        
        # Train the model on the oversampled training data
        model.fit(X_train_oversampled, y_train_oversampled)
        
        # Calculate F1 score for Decision Tree model
        f1_scores.append(f1_score(y_test, model.predict(X_test)))
        
        # Evaluate the model on the test data
        y_pred = model.predict(X_test)
        
        # Calculate mean absolute error for this fold
        mae = mean_absolute_error(y_test, y_pred)
        
        # Append the score to the list of scores
        scores.append(mae)
    
    # Compute the mean of all scores
    mean_score = sum(scores) / len(scores)
    mean_F1 = np.mean(f1_scores)
    
    return mean_score, mean_F1, model

In [10]:
# Training with AdaBoost

from sklearn.ensemble import AdaBoostClassifier

def evaluateAdaBoost(pd):
    """
    Evaluates input data with AdaBoost model using oversampling
    
    Parameters:
    - pd (pandas DataFrame): DataFrame containing input data
    
    Returns:
    - mean_score (float): Mean absolute error computed from cross-validation
    - mean_F1 (float): Mean F1 scored computed from cross-validation
    - model (AdaBoostClassifier): Trained AdaBoost model
    """
    
    X = pd[['Age', 'Sex', 'Smoking_Status']]
    y = pd['RA_Status']
    
    # Initialize SMOTE for oversampling
    smote = SMOTE(random_state=42)
    
    # Initialize Leave-One-Out Cross-Validation
    cv = LeaveOneOut()
    
    # Initialize an empty list to store scores
    scores = []
    
    # Initialize an empty list to store F1 scores
    f1_scores = []
    
    # Loop through each fold of Leave-One-Out Cross-Validation
    for train_index, test_index in cv.split(X, y):
        # Split data into training and testing sets for this fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Perform oversampling on the training data using SMOTE
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
        
        # Build AdaBoost model
        model = AdaBoostClassifier(n_estimators=100, random_state=42)
        
        # Train the model on the oversampled training data
        model.fit(X_train_oversampled, y_train_oversampled)
        
        # Calculate F1 score for AdaBoost model
        f1_scores.append(f1_score(y_test, model.predict(X_test)))
        
        # Evaluate the model on the test data
        y_pred = model.predict(X_test)
        
        # Calculate mean absolute error for this fold
        mae = mean_absolute_error(y_test, y_pred)
        
        # Append the score to the list of scores
        scores.append(mae)
    
    # Compute the mean of all scores
    mean_score = sum(scores) / len(scores)
    mean_F1 = np.mean(f1_scores)
    
    return mean_score, mean_F1, model

In [11]:
# Training model with Naive Bayes

from sklearn.naive_bayes import GaussianNB

def evaluateNaiveBayes(pd):
    """
    Evaluates input data with Naive Bayes model using oversampling
    
    Parameters:
    - pd (pandas DataFrame): DataFrame containing input data
    
    Returns:
    - mean_score (float): Mean absolute error computed from cross-validation
    - mean_F1 (float): Mean F1 scored computed from cross-validation
    - model (GaussianNB): Trained Naive Bayes model
    """
    
    X = pd[['Age', 'Sex', 'Smoking_Status']]
    y = pd['RA_Status']
    
    # Initialize SMOTE for oversampling
    smote = SMOTE(random_state=42)
    
    # Initialize Leave-One-Out Cross-Validation
    cv = LeaveOneOut()
    
    # Initialize an empty list to store mean error scores
    scores = []
    
    # Initialize an empty list to store F1 scores
    f1_scores = []
    
    # Loop through each fold of Leave-One-Out Cross-Validation
    for train_index, test_index in cv.split(X, y):
        # Split data into training and testing sets for this fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Perform oversampling on the training data using SMOTE
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
        
        # Build and train Naive Bayes model
        model = GaussianNB()
        model.fit(X_train_oversampled, y_train_oversampled)
        
        # Calculate F1 score for Naive Bayes model
        f1_scores.append(f1_score(y_test, model.predict(X_test)))
        
        # Evaluate the model on the test data
        y_pred = model.predict(X_test)
        
        # Calculate mean absolute error for this fold
        mae = mean_absolute_error(y_test, y_pred)
        
        # Append the score to the list of scores
        scores.append(mae)
    
    # Compute the mean of all scores
    mean_score = sum(scores) / len(scores)
    mean_F1 = np.mean(f1_scores)
    
    return mean_score, mean_F1, model

In [12]:
# Training model with xGBoost

from xgboost import XGBClassifier

def evaluateXGBoost(pd):
    """
    Evaluates input data with XGBoost model using oversampling
    
    Parameters:
    - pd (pandas DataFrame): DataFrame containing input data
    
    Returns:
    - mean_score (float): Mean absolute error computed from cross-validation
    - mean_F1 (float): Mean F1 scored computed from cross-validation
    - model (XGBClassifier): Trained XGBoost model
    """
    
    X = pd[['Age', 'Sex', 'Smoking_Status']]
    y = pd['RA_Status']
    
    # Initialize SMOTE for oversampling
    smote = SMOTE(random_state=42)
    
    # Initialize Leave-One-Out Cross-Validation
    cv = LeaveOneOut()
    
    # Initialize an empty list to store mean error scores
    scores = []
    
    # Initialize an empty list to store F1 scores
    f1_scores = []
    
    # Loop through each fold of Leave-One-Out Cross-Validation
    for train_index, test_index in cv.split(X, y):
        # Split data into training and testing sets for this fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Perform oversampling on the training data using SMOTE
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
        
        # Build and train XGBoost model
        model = XGBClassifier()
        model.fit(X_train_oversampled, y_train_oversampled)
        
        # Calculate F1 score for XGBoost model
        f1_scores.append(f1_score(y_test, model.predict(X_test)))
        
        # Evaluate the model on the test data
        y_pred = model.predict(X_test)
        
        # Calculate mean absolute error for this fold
        mae = mean_absolute_error(y_test, y_pred)
        
        # Append the score to the list of scores
        scores.append(mae)
    
    # Compute the mean of all scores
    mean_score = sum(scores) / len(scores)
    mean_F1 = np.mean(f1_scores)
    
    return mean_score, mean_F1, model

In [13]:
# Building, testing and evaluating models

# Getting data from the table 'combined_set' in the 'ra_classification' database
pd1 = getData('combined_set')

# Calling normalize function
#normalizeData(pd1)

# Display if you want to see the tabulated data
#display(pd1)

# Building and evaluating models
result_SVC, F1_SVC, model_SVC = evaluateSVC(pd1)
result_RF, F1_RF, model_RF = evaluateRandomForest(pd1)
result_DT, F1_DT, model_DT = evaluateDecisionTree(pd1)
result_AB, F1_AB, model_AB = evaluateAdaBoost(pd1)
result_NB, F1_NB, model_NB = evaluateNaiveBayes(pd1)
result_xGB, F1_xGB, model_xGB = evaluateXGBoost(pd1)

# Printing results
print("Linear SVC results: ", result_SVC)
print("Random Forest results: ", result_RF)
print("Decision Tree results: ", result_DT)
print("AdaBoost results: ", result_AB)
print("Naive Bayes results: ", result_NB)
print("xGBoost results: ", result_xGB)

print("F1 scores")
print("Linear SVC F1 score: ", F1_SVC)
print("Random Forest F1 score: ", F1_RF)
print("Decision Tree F1 score: ", F1_DT)
print("AdaBoost F1 score: ", F1_AB)
print("Naive Bayes F1 score: ", F1_AB)
print("xGBoost F1 score: ", F1_AB)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

Linear SVC results:  0.16449086161879894
Random Forest results:  0.19843342036553524
Decision Tree results:  0.206266318537859
AdaBoost results:  0.1618798955613577
Naive Bayes results:  0.45430809399477806
xGBoost results:  0.20365535248041775
F1 scores
Linear SVC F1 score:  0.7571801566579635
Random Forest F1 score:  0.7049608355091384
Decision Tree F1 score:  0.6971279373368147
AdaBoost F1 score:  0.7493472584856397
Naive Bayes F1 score:  0.7493472584856397
xGBoost F1 score:  0.7493472584856397


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [14]:
# Function to get accuracy and false negative metrics

from sklearn.metrics import confusion_matrix, classification_report

def modelMetrics(model, pd):
    """
    Takes a model and the data and outputs corresponding accuracy and false negative rates.
    
    Parameters:
    - model (machine learning model object): The machine learning model to evaluate
    - pd (pandas DataFrame): DataFrame containing input data
    
    Returns:
    - accuracy (float): Accuracy of the model
    - FNRate (float): False negative rate of the model
    """
    
    X = pd[['Age', 'Sex', 'Smoking_Status']]
    y = pd['RA_Status']
    
    preds = model.predict(X)
    conf_matrix = confusion_matrix(preds,y)
    
    TP = conf_matrix[0][0]
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]
    TN = conf_matrix[1][1]
    
    accuracy = (TP + TN) / (TP + FP + TN + FN)
    FNRate = FN / (TN + FP)
    
    return accuracy, FNRate

In [15]:
# Getting accuracy and false negative rate metrics

models = {"Linear SVC": model_SVC, "Random Forest": model_RF, "Decision Tree": model_DT, "AdaBoost": model_AB,
          "xGBoost": model_xGB, "Naive Bayes": model_NB}

for model_name, model in models.items():
    accuracy, FN = modelMetrics(model, pd1)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy}")
    print(f"False Negative Rate: {FN}")
    print("-" * 20)

Model: Linear SVC
Accuracy: 0.835509138381201
False Negative Rate: 0.06646525679758308
--------------------
Model: Random Forest
Accuracy: 0.8302872062663186
False Negative Rate: 0.021148036253776436
--------------------
Model: Decision Tree
Accuracy: 0.8302872062663186
False Negative Rate: 0.015105740181268883
--------------------
Model: AdaBoost
Accuracy: 0.7989556135770235
False Negative Rate: 0.02416918429003021
--------------------
Model: xGBoost
Accuracy: 0.8302872062663186
False Negative Rate: 0.02416918429003021
--------------------
Model: Naive Bayes
Accuracy: 0.5248041775456919
False Negative Rate: 0.015105740181268883
--------------------


In [16]:
# Predicting with model using custom input

# Assuming you have a new input data point represented as a dictionary
new_data_point = {
    'Age': 15,                  # Example age value
    'Sex': 0,                   # Example sex value (1 for female)
    'Smoking_Status': 1,        # Example smoking status (0 for non-smoker)
}

# Convert the input data point into a pandas DataFrame
new_data_df = pd.DataFrame([new_data_point])

# Use the trained model to make predictions on the new data
model = model_xGB # Select model
predicted_RA_Status = model.predict(new_data_df[['Age', 'Sex', 'Smoking_Status']])

# Print the predicted RA_Status
print("Predicted RA Status:", predicted_RA_Status)

Predicted RA Status: [0]


In [17]:
# Comparing accuracy and false negative rate to model without SMOTE

X = pd1[['Age', 'Sex', 'Smoking_Status']]
y = pd1['RA_Status']

#oldModel = SVC(kernel='linear', gamma=0.1, C=2)
#oldModel.fit(X,y)
oldModel = XGBClassifier()
oldModel.fit(X,y)

oldAccuracy, oldFNRate = modelMetrics(oldModel,pd1)

print("Accuracy:", oldAccuracy)
print("False Negative Rate:", oldFNRate)

Accuracy: 0.9007832898172323
False Negative Rate: 0.08157099697885196


In [18]:
# Getting model

filename = 'xGBoost_model.sav'
pickle.dump(model_xGB, open(filename, 'wb'))

In [None]:
# Close database when finished
mydb.close()