In [3]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

### Reading and scalling the data

In [2]:
data_c = pd.read_parquet('./Local/2017_Clean/Combined.parquet')
data_f1 = pd.read_parquet('./Local/2017_Final/Combined_1.parquet')
data_f2 = pd.read_parquet('./Local/2017_Final/Combined_2.parquet')

datasets = {'Clean data': data_c, 'Final data 1': data_f1, 'Final data 2': data_f2}

In [4]:
def preprocess_data(df):
    label_encoder = LabelEncoder()
    
    if 'Label' in df.columns:
        label_col = 'Label'
    elif ' Label' in df.columns:
        label_col = ' Label'
    else:
        raise ValueError("DataFrame does not contain a label column")
    
    df[label_col] = label_encoder.fit_transform(df[label_col])
    
    X = df.drop(label_col, axis=1)
    y = df[label_col]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test

processed_datasets = {name: preprocess_data(df) for name, df in datasets.items()}

### Running Naive Bayes for each dataset

In [34]:
for name, data in datasets.items():
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)

    print(f'--- {name} ---')
    print('Training Accuracy : ', metrics.accuracy_score(y_train, gnb.predict(X_train))*100)
    print('Validation Accuracy : ', metrics.accuracy_score(y_test, gnb.predict(X_test))*100)

    class_labels = y.unique()
    cm = metrics.confusion_matrix(y_test, gnb.predict(X_test))
    confusion_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

    print("Confusion Matrix:")
    print(confusion_df)

    class_labels_str = [str(label) for label in class_labels]
    cr = metrics.classification_report(y_test, gnb.predict(X_test), target_names=class_labels_str)
    print("Classification Report:")
    print(cr)
    print('\n')

--- Clean data ---
Training Accuracy :  17.77191911810436
Validation Accuracy :  17.7897024068661
Confusion Matrix:
       0      2      10     1      9      12     14      13  7    11      6   \
0   66825  16537    204  28490   4271  29245  46752  168128  50  510  444590   
2       0     27      0      0      0      0      0       0   0    0     761   
10      0      0  16290     10  16532      0      0   18749   0    0       1   
1      10      0      0   1018   1657      9    126    1022   0    0       0   
9       0      0     83   1311  60786      0      0    7123   0    0       0   
12      1      0    102     12      7    102    578    1301   0    0       0   
14      0      0      0      1      0     52    832    1183   0    0       0   
13      0      0      0      0      0      0      0    2371   0    0       0   
7       0      0      0      0      0      0      0       0   5    0       0   
11      0      5      0      0      0      0      0       0   0    5       3   
6   