In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Read the cleaned CSV file
df = pd.read_csv('Dataset/NoInfValues.csv')

# Assuming the last column is the target variable
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Train the classifier on the training set
gnb.fit(X_train, y_train)

# Predict the labels on the testing set
y_pred = gnb.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.27011494252873564
Classification Report:
              precision    recall  f1-score   support

           1       0.44      0.41      0.42        17
           2       0.00      0.00      0.00        55
           3       0.43      0.17      0.24        35
           4       0.24      0.85      0.37        40
           5       0.00      0.00      0.00        27

    accuracy                           0.27       174
   macro avg       0.22      0.29      0.21       174
weighted avg       0.18      0.27      0.18       174



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Read the cleaned CSV file
df = pd.read_csv('Dataset/NoInfValues.csv')

# Assuming the last column is the target variable
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

# Shuffle and split the data into training and testing sets
# The shuffle=True parameter ensures that the data is shuffled before splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)

# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Train the classifier on the training set
gnb.fit(X_train, y_train)

# Predict the labels on the testing set
y_pred = gnb.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.26436781609195403
Classification Report:
              precision    recall  f1-score   support

           1       0.25      0.32      0.28        19
           2       0.00      0.00      0.00        45
           3       0.56      0.22      0.32        41
           4       0.23      0.79      0.36        39
           5       0.00      0.00      0.00        30

    accuracy                           0.26       174
   macro avg       0.21      0.27      0.19       174
weighted avg       0.21      0.26      0.19       174



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Read the cleaned CSV file
df = pd.read_csv('Dataset/NoInfValues.csv')

# Assuming the last column is the target variable
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

# Split the data into training and testing sets with shuffling and stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Train the classifier on the SMOTE training set
gnb.fit(X_train_smote, y_train_smote)

# Predict the labels on the testing set
y_pred = gnb.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.2471264367816092
Classification Report:
              precision    recall  f1-score   support

           1       0.20      0.63      0.31        19
           2       0.00      0.00      0.00        45
           3       0.80      0.10      0.17        41
           4       0.24      0.62      0.34        39
           5       0.38      0.10      0.16        30

    accuracy                           0.25       174
   macro avg       0.32      0.29      0.20       174
weighted avg       0.33      0.25      0.18       174



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
import numpy as np

# Read the cleaned CSV file
df = pd.read_csv('Dataset/NoInfValues.csv')

# Assuming the last column is the target variable
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

# Apply SMOTE to the entire dataset
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Define the K-fold cross-validation procedure
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform K-fold cross-validation
cv_scores = cross_val_score(gnb, X_smote, y_smote, cv=kfold, scoring='accuracy')

# Print the cross-validation scores and their mean
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean accuracy: {np.mean(cv_scores)}')

# Train the classifier on the entire dataset (after SMOTE)
gnb.fit(X_smote, y_smote)

# Predict the labels on the entire dataset
y_pred = gnb.predict(X)

# Evaluate the classifier
from sklearn.metrics import classification_report
report = classification_report(y, y_pred)

print('Classification Report:')
print(report)


Cross-validation scores: [0.26666667 0.28       0.28       0.32       0.34666667 0.22666667
 0.33333333 0.24       0.26666667 0.48      ]
Mean accuracy: 0.30399999999999994
Classification Report:
              precision    recall  f1-score   support

           1       0.20      0.62      0.30        63
           2       1.00      0.01      0.01       150
           3       0.63      0.13      0.21       135
           4       0.24      0.62      0.35       130
           5       0.42      0.08      0.13       100

    accuracy                           0.25       578
   macro avg       0.50      0.29      0.20       578
weighted avg       0.56      0.25      0.19       578



In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from imblearn.over_sampling import SMOTE
import numpy as np

# Read the cleaned CSV file
df = pd.read_csv('Dataset/NoInfValues.csv')

# Assuming the last column is the target variable
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

# Apply SMOTE to the entire dataset
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Initialize different Naive Bayes classifiers
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

# Define the K-fold cross-validation procedure
kfold = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

# Perform K-fold cross-validation for Gaussian Naive Bayes
gnb_cv_scores = cross_val_score(gnb, X_smote, y_smote, cv=kfold, scoring='accuracy')
print(f'Gaussian Naive Bayes Cross-validation scores: {gnb_cv_scores}')
print(f'Gaussian Naive Bayes Mean accuracy: {np.mean(gnb_cv_scores)}')

# Perform K-fold cross-validation for Multinomial Naive Bayes
mnb_cv_scores = cross_val_score(mnb, X_smote, y_smote, cv=kfold, scoring='accuracy')
print(f'Multinomial Naive Bayes Cross-validation scores: {mnb_cv_scores}')
print(f'Multinomial Naive Bayes Mean accuracy: {np.mean(mnb_cv_scores)}')

# Perform K-fold cross-validation for Bernoulli Naive Bayes
bnb_cv_scores = cross_val_score(bnb, X_smote, y_smote, cv=kfold, scoring='accuracy')
print(f'Bernoulli Naive Bayes Cross-validation scores: {bnb_cv_scores}')
print(f'Bernoulli Naive Bayes Mean accuracy: {np.mean(bnb_cv_scores)}')


Gaussian Naive Bayes Cross-validation scores: [0.28947368 0.23684211 0.31578947 0.23684211 0.26315789 0.31578947
 0.36842105 0.23684211 0.31578947 0.28947368 0.32432432 0.24324324
 0.2972973  0.2972973  0.27027027 0.27027027 0.35135135 0.27027027
 0.2972973  0.35135135]
Gaussian Naive Bayes Mean accuracy: 0.29206970128022763
Multinomial Naive Bayes Cross-validation scores: [0.26315789 0.28947368 0.36842105 0.28947368 0.36842105 0.36842105
 0.39473684 0.31578947 0.42105263 0.34210526 0.27027027 0.32432432
 0.2972973  0.2972973  0.27027027 0.2972973  0.40540541 0.24324324
 0.35135135 0.35135135]
Multinomial Naive Bayes Mean accuracy: 0.3264580369843528
Bernoulli Naive Bayes Cross-validation scores: [0.28947368 0.23684211 0.28947368 0.31578947 0.28947368 0.34210526
 0.28947368 0.31578947 0.28947368 0.23684211 0.35135135 0.2972973
 0.32432432 0.27027027 0.32432432 0.24324324 0.32432432 0.32432432
 0.35135135 0.32432432]
Bernoulli Naive Bayes Mean accuracy: 0.30149359886202


In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Read the cleaned CSV file
df = pd.read_csv('Dataset/NoInfValues.csv')

# Assuming the last column is the target variable
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

# Apply SMOTE to balance the classes in the entire dataset
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the balanced data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.25, random_state=90)

# Initialize the individual Naive Bayes classifiers
gnb = GaussianNB()
mnb = MultinomialNB()

# Create the voting classifier
voting_clf = VotingClassifier(estimators=[('gnb', gnb), ('mnb', mnb)], voting='hard')

# Train the voting classifier on the training set
voting_clf.fit(X_train, y_train)

# Predict the labels on the testing set
y_pred = voting_clf.predict(X_test)

# Evaluate the ensemble classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)



Accuracy: 0.39361702127659576
Classification Report:
              precision    recall  f1-score   support

           1       0.37      0.60      0.46        43
           2       0.39      0.81      0.53        43
           3       0.46      0.32      0.38        34
           4       0.00      0.00      0.00        30
           5       0.40      0.05      0.09        38

    accuracy                           0.39       188
   macro avg       0.32      0.36      0.29       188
weighted avg       0.34      0.39      0.31       188



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
# Apply SMOTE to balance the classes in the entire dataset
smote = SMOTE(random_state=84)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the balanced data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.25, random_state=84)

# Initialize individual Naive Bayes classifiers
gnb = GaussianNB()
mnb = MultinomialNB()

# Train Gaussian Naive Bayes classifier
gnb.fit(X_train, y_train)

# Train Multinomial Naive Bayes classifier
mnb.fit(X_train, y_train)

# Predict the labels on the testing set for both classifiers
y_pred_gnb = gnb.predict(X_test)
y_pred_mnb = mnb.predict(X_test)

# Evaluate the classifiers
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)

report_gnb = classification_report(y_test, y_pred_gnb)
report_mnb = classification_report(y_test, y_pred_mnb)

print("Gaussian Naive Bayes Classifier:")
print(f'Accuracy: {accuracy_gnb}')
print('Classification Report:')
print(report_gnb)

print("\nMultinomial Naive Bayes Classifier:")
print(f'Accuracy: {accuracy_mnb}')
print('Classification Report:')
print(report_mnb)

Gaussian Naive Bayes Classifier:
Accuracy: 0.4521276595744681
Classification Report:
              precision    recall  f1-score   support

           1       0.32      0.50      0.39        38
           2       0.64      0.90      0.74        39
           3       0.36      0.29      0.32        31
           4       0.47      0.41      0.44        44
           5       0.36      0.11      0.17        36

    accuracy                           0.45       188
   macro avg       0.43      0.44      0.41       188
weighted avg       0.44      0.45      0.42       188


Multinomial Naive Bayes Classifier:
Accuracy: 0.3670212765957447
Classification Report:
              precision    recall  f1-score   support

           1       0.32      0.63      0.42        38
           2       0.37      0.85      0.52        39
           3       0.43      0.29      0.35        31
           4       0.00      0.00      0.00        44
           5       1.00      0.08      0.15        36

    accurac

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Read the cleaned CSV file
df = pd.read_csv('Dataset/NoInfValues.csv')

# Assuming the last column is the target variable
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

best_accuracy = 0
best_random_state = 0
best_test_size = 0

# Iterate over different values of random state and test set size
for random_state in range(42, 200):
    for test_size in [0.2, 0.25, 0.3]:
        # Apply SMOTE to balance the classes in the entire dataset
        smote = SMOTE(random_state=random_state)
        X_smote, y_smote = smote.fit_resample(X, y)

        # Split the balanced data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=test_size, random_state=random_state)

        # Initialize individual Naive Bayes classifiers
        gnb = GaussianNB()
        mnb = MultinomialNB()

        # Train Gaussian Naive Bayes classifier
        gnb.fit(X_train, y_train)

        # Train Multinomial Naive Bayes classifier
        mnb.fit(X_train, y_train)

        # Predict the labels on the testing set for both classifiers
        y_pred_gnb = gnb.predict(X_test)
        y_pred_mnb = mnb.predict(X_test)

        # Evaluate the classifiers
        accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
        accuracy_mnb = accuracy_score(y_test, y_pred_mnb)

        # Calculate the average accuracy
        avg_accuracy = accuracy_gnb

        # Check if this combination of parameters gives the best accuracy
        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_random_state = random_state
            best_test_size = test_size

# Print the best parameters and accuracy
print("Best Parameters:")
print(f"Random State: {best_random_state}")
print(f"Test Set Size: {best_test_size}")
print(f"Best Accuracy: {best_accuracy}")


Best Parameters:
Random State: 138
Test Set Size: 0.3
Best Accuracy: 0.5288888888888889


In [68]:
random_state = 138
test_size = 0.3

# Apply SMOTE to balance the classes in the entire dataset
smote = SMOTE(random_state=random_state)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the balanced data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=test_size, random_state=random_state)

# Initialize individual Naive Bayes classifiers
gnb = GaussianNB()
mnb = MultinomialNB()

# Train Gaussian Naive Bayes classifier
gnb.fit(X_train, y_train)

# Train Multinomial Naive Bayes classifier
mnb.fit(X_train, y_train)

# Predict the labels on the testing set for both classifiers
y_pred_gnb = gnb.predict(X_test)
y_pred_mnb = mnb.predict(X_test)

# Evaluate the classifiers
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)

report_gnb = classification_report(y_test, y_pred_gnb)
report_mnb = classification_report(y_test, y_pred_mnb)

print("Gaussian Naive Bayes Classifier:")
print(f'Accuracy: {accuracy_gnb}')
print('Classification Report:')
print(report_gnb)

print("\nMultinomial Naive Bayes Classifier:")
print(f'Accuracy: {accuracy_mnb}')
print('Classification Report:')
print(report_mnb)

Gaussian Naive Bayes Classifier:
Accuracy: 0.5288888888888889
Classification Report:
              precision    recall  f1-score   support

           1       0.48      0.64      0.55        45
           2       0.69      0.92      0.79        52
           3       0.50      0.34      0.41        44
           4       0.41      0.48      0.44        48
           5       0.44      0.11      0.18        36

    accuracy                           0.53       225
   macro avg       0.50      0.50      0.47       225
weighted avg       0.51      0.53      0.49       225


Multinomial Naive Bayes Classifier:
Accuracy: 0.4088888888888889
Classification Report:
              precision    recall  f1-score   support

           1       0.40      0.78      0.53        45
           2       0.38      0.83      0.52        52
           3       0.56      0.32      0.41        44
           4       0.00      0.00      0.00        48
           5       0.00      0.00      0.00        36

    accurac

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
