Comparition Between :/n Naive Bayes, Random Forest, SVM, Logistic Regression

In [1]:
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler

In [2]:
scaler = StandardScaler()

nb_model = GaussianNB()
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
svm_model = SVC(kernel='rbf', random_state=42)
lr_model = LogisticRegression(random_state=42)


In [3]:
def result(dataset_name,y_test,y_pred_nb,y_pred_rf,y_pred_svm,y_pred_lr,pos_label):
        result =   pd.DataFrame(  {
    'Model': ['Naive Bayes', 'Random Forest', 'SVM', 'Logistic Regression'],
    'Accuracy': [accuracy_score(y_test, y_pred_nb),
                 accuracy_score(y_test, y_pred_rf),
                 accuracy_score(y_test, y_pred_svm),
                 accuracy_score(y_test, y_pred_lr)],
    'Precision': [precision_score(y_test, y_pred_nb  ,pos_label=pos_label),
                  precision_score(y_test, y_pred_rf  ,pos_label=pos_label),
                  precision_score(y_test, y_pred_svm ,pos_label=pos_label),
                  precision_score(y_test, y_pred_lr  ,pos_label=pos_label)]
}
        )
        print(dataset_name +"Dataset Results:")
        print(result)

        # Determine the best model based on Accuracy
        best_accuracy_model = result.loc[result['Accuracy'].idxmax()]

        best_precision_model = result.loc[result['Precision'].idxmax()]

        # Display the best models
        print("\nBest Model based on Accuracy:")
        print(best_accuracy_model)

        print("\nBest Model based on Precision:")
        print(best_precision_model)


In [4]:

data_rice = arff.loadarff('Rice_Cammeo_Osmancik.arff')
rice_df = pd.DataFrame(data_rice[0])

# Convert byte strings to regular strings (if necessary)
rice_df['Class'] = rice_df['Class'].str.decode('utf-8')

# Define features and target
X_rice = rice_df.drop(columns=['Class'])
y_rice = rice_df['Class']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_rice, y_rice, test_size=0.3, random_state=42)

# Standardize the dataset

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Naive Bayes
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# Random Forest
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# SVM
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Logistic Regression
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)


In [5]:
result(dataset_name="rice",y_test=y_test,y_pred_nb=y_pred_nb,y_pred_rf=y_pred_rf,y_pred_svm=y_pred_svm,y_pred_lr=y_pred_lr,pos_label='Cammeo')

riceDataset Results:
                 Model  Accuracy  Precision
0          Naive Bayes  0.927384   0.932406
1        Random Forest  0.926509   0.937500
2                  SVM  0.930884   0.941650
3  Logistic Regression  0.930009   0.938000

Best Model based on Accuracy:
Model             SVM
Accuracy     0.930884
Precision     0.94165
Name: 2, dtype: object

Best Model based on Precision:
Model             SVM
Accuracy     0.930884
Precision     0.94165
Name: 2, dtype: object


In [6]:
# 2

column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

salary_df = pd.read_csv('adult.data', names=column_names, na_values="?")

salary_df = salary_df.dropna()

X_salary = salary_df.drop(columns=['income'])
y_salary = salary_df['income']

X_salary = pd.get_dummies(X_salary, drop_first=True)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_salary, y_salary, test_size=0.3, random_state=42)

# Standardize the dataset
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Naive Bayes
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# Random Forest
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# SVM
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Logistic Regression
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

In [7]:
results_adult = {
    'Model': ['Naive Bayes', 'Random Forest', 'SVM', 'Logistic Regression'],
    'Accuracy': [accuracy_score(y_test, y_pred_nb),
                 accuracy_score(y_test, y_pred_rf),
                 accuracy_score(y_test, y_pred_svm),
                 accuracy_score(y_test, y_pred_lr)],
    'Precision': [precision_score(y_test, y_pred_nb,  pos_label=' >50K'),
                  precision_score(y_test, y_pred_rf,  pos_label=' >50K'),
                  precision_score(y_test, y_pred_svm, pos_label=' >50K'),
                  precision_score(y_test, y_pred_lr,  pos_label=' >50K')]
}

df_results_adult = pd.DataFrame(results_adult)
print("\nAdult Dataset Results:")
print(df_results_adult)

# Determine the best model based on Accuracy and Precision
best_accuracy_model_adult = df_results_adult.loc[df_results_adult['Accuracy'].idxmax()]
best_precision_model_adult = df_results_adult.loc[df_results_adult['Precision'].idxmax()]

print("\nBest Model for Adult Dataset based on Accuracy:")
print(best_accuracy_model_adult)

print("\nBest Model for Adult Dataset based on Precision:")
print(best_precision_model_adult)


Adult Dataset Results:
                 Model  Accuracy  Precision
0          Naive Bayes  0.388474   0.276502
1        Random Forest  0.856587   0.727681
2                  SVM  0.853107   0.758074
3  Logistic Regression  0.855154   0.736454

Best Model for Adult Dataset based on Accuracy:
Model        Random Forest
Accuracy          0.856587
Precision         0.727681
Name: 1, dtype: object

Best Model for Adult Dataset based on Precision:
Model             SVM
Accuracy     0.853107
Precision    0.758074
Name: 2, dtype: object


In [8]:
column_names_heart="survival ,still-alive, age-at-heart-attack, pericardial-effusion,fractional-shortening,epss ,lvdd, wall-motion-score, wall-motion-index, mult,name,group,alive-at-1 ".replace(" ","").split(",")

heart_df = pd.read_csv('echocardiogram.data',names=column_names_heart,na_values='?',delim_whitespace=False,on_bad_lines="skip")
heart_df=heart_df.drop(columns=['group','mult']).dropna()

X_heart = heart_df.drop(columns=['still-alive','alive-at-1','name'])
y_heart = heart_df['alive-at-1']

X_train,X_test,y_train,y_test = train_test_split(X_heart,y_heart,test_size=0.3)

  heart_df = pd.read_csv('echocardiogram.data',names=column_names_heart,na_values='?',delim_whitespace=False,on_bad_lines="skip")


In [9]:


# Naive Bayes
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# Random Forest
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# SVM
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Logistic Regression
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)


In [10]:
results_heart = {
    'Model': ['Naive Bayes', 'Random Forest', 'SVM', 'Logistic Regression'],
    'Accuracy': [accuracy_score(y_test, y_pred_nb),
                 accuracy_score(y_test, y_pred_rf),
                 accuracy_score(y_test, y_pred_svm),
                 accuracy_score(y_test, y_pred_lr)],
    'Precision': [precision_score(y_test, y_pred_nb,  pos_label=0.0),
                  precision_score(y_test, y_pred_rf,  pos_label=0.0),
                  precision_score(y_test, y_pred_svm, pos_label=0.0),
                  precision_score(y_test, y_pred_lr,  pos_label=0.0)]
}

df_results_heart = pd.DataFrame(results_heart)
print("\nEchocardiogram Dataset Results:")
print(df_results_heart)

# Determine the best model based on Accuracy and Precision
best_accuracy_model_heart = df_results_heart.loc[df_results_heart['Accuracy'].idxmax()]
best_precision_model_heart = df_results_heart.loc[df_results_heart['Precision'].idxmax()]

print("\nBest Model for Echocardiogram Dataset based on Accuracy:")
print(best_accuracy_model_heart)

print("\nBest Model for Echocardiogram Dataset based on Precision:")
print(best_precision_model_heart)


Echocardiogram Dataset Results:
                 Model  Accuracy  Precision
0          Naive Bayes  0.947368   0.928571
1        Random Forest  1.000000   1.000000
2                  SVM  1.000000   1.000000
3  Logistic Regression  1.000000   1.000000

Best Model for Echocardiogram Dataset based on Accuracy:
Model        Random Forest
Accuracy               1.0
Precision              1.0
Name: 1, dtype: object

Best Model for Echocardiogram Dataset based on Precision:
Model        Random Forest
Accuracy               1.0
Precision              1.0
Name: 1, dtype: object
