In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

### Reading and scalling the data

In [10]:
data_c = pd.read_parquet('./Local/2017_Clean/Combined.parquet')
data_f1 = pd.read_parquet('./Local/2017_Final/Combined_1.parquet')
data_f2 = pd.read_parquet('./Local/2017_Final/Combined_2.parquet')

datasets = {'Clean data': data_c, 'Final data 1': data_f1, 'Final data 2': data_f2}
matrices = {}

In [20]:
def preprocess_data(df):
    label_encoder = LabelEncoder()
    
    if 'Label' in df.columns:
        label_col = 'Label'
    elif ' Label' in df.columns:
        label_col = ' Label'
    else:
        raise ValueError("DataFrame does not contain a label column")
    
    df[label_col] = label_encoder.fit_transform(df[label_col])
    
    X = df.drop(label_col, axis=1)
    y = df[label_col]

    return X, y

processed_datasets = {name: preprocess_data(df) for name, df in datasets.items()}

### Running Naive Bayes for each dataset

In [22]:
for name, data in datasets.items():
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    gnb = GaussianNB()
    gnb.fit(X_train_scaled, y_train)
    
    print(f'--- {name} ---')
    print('Training Accuracy : ', metrics.accuracy_score(y_train, gnb.predict(X_train_scaled))*100)
    print('Validation Accuracy : ', metrics.accuracy_score(y_test, gnb.predict(X_test_scaled))*100)
    
    class_labels = y.unique()
    cm = metrics.confusion_matrix(y_test, gnb.predict(X_test_scaled))
    matrices[name] = cm
    
    class_labels_str = [str(label) for label in class_labels]
    cr = metrics.classification_report(y_test, gnb.predict(X_test_scaled), target_names=class_labels_str)
    print("Classification Report:")
    print(cr)
    print('\n')

--- Clean data ---
Training Accuracy :  71.08670470800423
Validation Accuracy :  71.14792586603645
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.67      0.80    429322
           2       0.00      1.00      0.01       408
          10       0.88      0.95      0.91     25511
           1       0.16      0.92      0.27      2025
           9       0.87      0.94      0.90     34698
          12       0.14      0.67      0.24      1017
          14       0.05      0.62      0.10      1113
          13       0.69      1.00      0.82      1182
           7       1.00      1.00      1.00         4
          11       0.00      0.86      0.00         7
           6       0.96      0.98      0.97     18185
           5       0.37      0.92      0.53       638
           4       0.00      0.07      0.01       289
           3       0.01      1.00      0.01         3
           8       0.24      0.91      0.38       126

    accuracy

ValueError: Unknown label type: (array([0.0000000e+00, 4.5112781e-02, 4.5801528e-02, ..., 3.9272622e+03,
       4.3706865e+03, 5.8005000e+03], dtype=float32),)

### Visualization