In [59]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [60]:
# Import Data into dataframe
df = pd.read_csv("~/Xtern2023/XTern 2024 Artificial Intelegence Data Set - Xtern_TrainData.csv")

In [61]:
# Calculate P(xi | yj) for each xi in X and yj in y
conditional_probabilities = {}

# Loop through each column (feature) except the 'Play Golf' column
for column in df.columns[:-1]:
    # Calculate conditional probabilities P(xi | yj) for each xi and yj
    for xi in df[column].unique():
        for yj in df['Order'].unique():
            p_xi_given_yj = len(df[(df[column] == xi) & (df['Order'] == yj)]) / len(df[df['Order'] == yj])
            conditional_probabilities[(xi, yj, column)] = p_xi_given_yj

# Display the conditional probabilities table
conditional_probabilities_df = pd.DataFrame(list(conditional_probabilities.items()), columns=['(xi, yj, column)', 'P(xi | yj)'])

print(conditional_probabilities_df)

                                      (xi, yj, column)  P(xi | yj)
0                 (Year 2, Fried Catfish Basket, Year)    0.808163
1                      (Year 2, Sugar Cream Pie, Year)    0.107422
2                   (Year 2, Indiana Pork Chili, Year)    0.839216
3    (Year 2, Indiana Corn on the Cob (brushed with...    0.076768
4    (Year 2, Indiana Buffalo Chicken Tacos (3 taco...    0.211694
..                                                 ...         ...
435                      (8, Sweet Potato Fries, Time)    0.000000
436  (8, Ultimate Grilled Cheese Sandwich (with bac...    0.000000
437        (8, Breaded Pork Tenderloin Sandwich, Time)    0.012146
438                  (8, Cornbread Hush Puppies, Time)    0.001961
439        (8, Hoosier BBQ Pulled Pork Sandwich, Time)    0.000000

[440 rows x 2 columns]


In [62]:
# Sorting the Conditional Probabilities Table by P(xi | yj)
conditional_probabilities_df = conditional_probabilities_df.sort_values('P(xi | yj)', ascending=False)

print(conditional_probabilities_df)

                                      (xi, yj, column)  P(xi | yj)
13   (Year 3, Indiana Corn on the Cob (brushed with...    0.923232
18              (Year 3, Cornbread Hush Puppies, Year)    0.894118
11                     (Year 3, Sugar Cream Pie, Year)    0.892578
19    (Year 3, Hoosier BBQ Pulled Pork Sandwich, Year)    0.852697
2                   (Year 2, Indiana Pork Chili, Year)    0.839216
..                                                 ...         ...
322  (Indiana University Bloomington, Indiana Pork ...    0.000000
323  (Indiana University Bloomington, Indiana Corn ...    0.000000
324  (Indiana University Bloomington, Indiana Buffa...    0.000000
325  (Indiana University Bloomington, Sweet Potato ...    0.000000
439        (8, Hoosier BBQ Pulled Pork Sandwich, Time)    0.000000

[440 rows x 2 columns]


In [103]:
# Converting Columns Into Integers

# Creating instance of label converter
lab = LabelEncoder()

# Encoding labels for year
df_encoded = df
df_encoded['Year'] = lab.fit_transform(df_encoded['Year'])
df_encoded['Major'] = lab.fit_transform(df_encoded['Major'])
df_encoded['University'] = lab.fit_transform(df_encoded['University'])
df_encoded['Order'] = lab.fit_transform(df_encoded['Order'])

In [104]:
# Breaking up our dataframe into x and y
x = df_encoded.drop(['Order'], axis = 1)
y = df_encoded.Order.values

In [105]:
# Splitting the Dataset into Training and Testing Sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 2)

In [106]:
# Performing Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [107]:
# Result of Gaussian Naive Bayes Model
print("Gaussian Naive Bayes score: ", gnb.score(x_test, y_test))

Gaussian Naive Bayes score:  0.4686666666666667


In [108]:
# Improvements
# GaussianNB is more for continuous numbers, but we are working with categories. Let's try CategoricalNB
cnb = CategoricalNB()
cnb.fit(x_train, y_train)

In [109]:
# Result of Categorical Naive Bayes Model
print("Categorical Naive Bayes score: ", cnb.score(x_test, y_test))

maxScore = 0
maxIndex = 0
for i in range(4, 1000):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = i)
    
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    
    cnb = CategoricalNB()
    cnb.fit(x_train, y_train)
    
    print(gnb.score(x_test, y_test))
    print(cnb.score(x_test, y_test))
    print(i)

Categorical Naive Bayes score:  0.604
0.486
0.612
4
0.4646666666666667
0.5973333333333334
5
0.482
0.598
6
0.4653333333333333
0.5953333333333334
7
0.4786666666666667
0.586
8
0.48533333333333334
0.5986666666666667
9
0.47933333333333333
0.6126666666666667
10
0.5006666666666667
0.6153333333333333
11
0.486
0.606
12
0.4646666666666667
0.5866666666666667
13
0.48933333333333334
0.616
14
0.49066666666666664
0.614
15
0.496
0.6126666666666667
16
0.488
0.5973333333333334
17
0.47533333333333333
0.602
18
0.45466666666666666
0.5653333333333334
19
0.49733333333333335
0.5933333333333334
20
0.47533333333333333
0.5966666666666667
21
0.4693333333333333
0.598
22
0.4706666666666667
0.6026666666666667
23
0.4706666666666667
0.5946666666666667
24
0.4866666666666667
0.6
25
0.4686666666666667
0.604
26
0.464
0.6073333333333333
27
0.4886666666666667
0.6106666666666667
28
0.4686666666666667
0.5853333333333334
29
0.4846666666666667
0.6013333333333334
30
0.47
0.606
31
0.48533333333333334
0.6193333333333333
32
0.484
0

IndexError: index 3 is out of bounds for axis 1 with size 3

In [12]:
# This is pretty terrible, so let's try this using one hot encoding!
# Import Data into dataframe
df = pd.read_csv("~/Xtern2023/XTern 2024 Artificial Intelegence Data Set - Xtern_TrainData.csv")

In [13]:
# Changing labels from objects to floats using One Hot Encoding
encoder = OneHotEncoder(handle_unknown='ignore')

# Encoding University, Year and Major
encoded_uni_df = pd.DataFrame(encoder.fit_transform(df[['University']]).toarray())
encoded_year_df = pd.DataFrame(encoder.fit_transform(df[['Year']]).toarray())
encoded_major_df = pd.DataFrame(encoder.fit_transform(df[['Major']]).toarray())

# Drop original Year and Major columns
one_hot_df = df.drop(['Year', 'Major', 'University'], axis=1)

# Adding new columns back into df
one_hot_df = pd.concat([one_hot_df, encoded_uni_df, encoded_year_df, encoded_major_df], axis=1)

# Creating instance of label converter
lab = LabelEncoder()
one_hot_df['Order'] = lab.fit_transform(one_hot_df['Order'])

# Making Sure All Column Names are Strings
one_hot_df.columns = one_hot_df.columns.astype(str)

one_hot_df.head()


Unnamed: 0,Time,Order,0,1,2,3,4,5,6,7,...,10,11,12,13,14,15,16,17,18,19
0,12,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,14,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Performing Naive Bayes Again, but with our new One Hot Encoded DataFrame!

# Breaking up our dataframe into x and y
x = one_hot_df.drop(['Order'], axis = 1)
y = one_hot_df['Order']

print(x)
print(y)

      Time    0    1    2    3    4    5    6    7    8  ...   10   11   12  \
0       12  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1       14  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2       12  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3       11  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4       12  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...    ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
4995    11  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  1.0   
4996    12  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4997    13  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4998    15  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4999    15  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0   

       13   14   15   16   17   18   19  
0     0.0

In [15]:
# Splitting the Dataset into Training and Testing Sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 50)

In [16]:
# Performing Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [17]:
# Result of Naive Bayes Model
print("Naive Bayes score using One-Hot Encoding: ", gnb.score(x_test, y_test))

Naive Bayes score using One-Hot Encoding:  0.24333333333333335
