# Naive Bayes Classifier based on Term Frequencies

In [1]:
import pandas as pd

## 1. Load the Given data into a CSV File

To be done only once, later the data can be loaded on from the CSV file

In [2]:
# Create a Pandas dataframe and store the data

df = pd.DataFrame(columns=['TDP', 'Nifty', 'Sidhu', 'BJP', 'Sensex', 'Sixer', 'Congress', 'Century', 'Category'])
num_entries = 0

In [3]:
# The Given Data

data = [
    [4,0,3,5,1,0,6,0,'Politics'],
    [0,5,0,2,6,0,1,0,'Business'],
    [0,0,6,1,0,4,1,2,'Sports'],
    [4,1,0,1,1,0,6,0,'Politics'],
    [0,0,0,0,0,5,0,6,'Sports'],
    [0,4,0,2,6,0,0,1,'Business'],
    [5,0,0,3,0,0,5,0,'Politics']
]
query_data = [0,3,0,2,6,0,2,1]

In [4]:
# Load the data into a Pandas dataframe and store it into a CSV File

for i in range(len(data)) :
    df.loc[num_entries] = data[i]
    num_entries += 1

df.to_csv('Data.csv', index=None)

## 2. Loading the dataset and getting the Queryset

In [5]:
df = pd.read_csv('Data.csv')
query_data = [0,3,0,2,6,0,2,1]

In [6]:
# Get the output labels
output_labels = df['Category'].unique()

# Get the list of all words taken into consideration from the documents
words = list(df.columns)[:-1]

# Get the number of documents in the whole dataset
num_train_documents = df.shape[0]

## 3. Calculating the Required Probabilities

In [7]:
# A dictionary to store the conditional probabilities
# Format : conditional_probability[(a, b)] = P(a/b) => Probability of occurance of event `a` given that the event `b` has occured
conditional_probability = {}

# A dictionary to store the probabilities
# Format : probablity[a] = P(a) => Probablity of occurance of event `a`
probability = {}

### 3.1 Calculate the Probability of occurance of the output labels (classes)

In [8]:
for output_class in output_labels :
    temp_df = df.loc[df['Category'] == output_class]
    probability[output_class] = (temp_df.shape[0] / num_train_documents)

In [9]:
# Display the Probability of each output class

probability

{'Politics': 0.42857142857142855,
 'Business': 0.2857142857142857,
 'Sports': 0.2857142857142857}

### 3.2 Calculate the Conditional Probabilities

In [19]:
# Set parameter for smoothing

# ALPHA = 0 for no smoothing
ALPHA = 0

# ALPHA = 1 for Laplace Smoothing
#ALPHA = 1

In [20]:
for output_class in output_labels :
    temp_df = df.loc[df['Category'] == output_class]
    
    # Find the total number of words in that category
    total_word_count_in_category = 0
    for i in range(temp_df.shape[0]) :
        for word in words :
            total_word_count_in_category += temp_df.iloc[i][word]
    
    # For each word find the number of times it occurs in the current category output
    for word in words :
        current_word_count_in_category = 0
        for i in range(temp_df.shape[0]) :
            current_word_count_in_category += temp_df.iloc[i][word]
            
        # Store the conditional probability
        cur_prob = (current_word_count_in_category + ALPHA) / (total_word_count_in_category + (ALPHA * len(words)))
        conditional_probability[(word, output_class)] = cur_prob

In [12]:
# Without Smoothing, Some values are zero
print("Conditional Probabilities without applying any smoothing : \n")
conditional_probability

Conditional Probabilities without applying any smoothing : 



{('TDP', 'Politics'): 0.28888888888888886,
 ('Nifty', 'Politics'): 0.022222222222222223,
 ('Sidhu', 'Politics'): 0.06666666666666667,
 ('BJP', 'Politics'): 0.2,
 ('Sensex', 'Politics'): 0.044444444444444446,
 ('Sixer', 'Politics'): 0.0,
 ('Congress', 'Politics'): 0.37777777777777777,
 ('Century', 'Politics'): 0.0,
 ('TDP', 'Business'): 0.0,
 ('Nifty', 'Business'): 0.3333333333333333,
 ('Sidhu', 'Business'): 0.0,
 ('BJP', 'Business'): 0.14814814814814814,
 ('Sensex', 'Business'): 0.4444444444444444,
 ('Sixer', 'Business'): 0.0,
 ('Congress', 'Business'): 0.037037037037037035,
 ('Century', 'Business'): 0.037037037037037035,
 ('TDP', 'Sports'): 0.0,
 ('Nifty', 'Sports'): 0.0,
 ('Sidhu', 'Sports'): 0.24,
 ('BJP', 'Sports'): 0.04,
 ('Sensex', 'Sports'): 0.0,
 ('Sixer', 'Sports'): 0.36,
 ('Congress', 'Sports'): 0.04,
 ('Century', 'Sports'): 0.32}

In [21]:
# With Smoothing
print("Conditional Probabilities after applying smoothing : \n")
conditional_probability

Conditional Probabilities after applying smoothing : 



{('TDP', 'Politics'): 0.2641509433962264,
 ('Nifty', 'Politics'): 0.03773584905660377,
 ('Sidhu', 'Politics'): 0.07547169811320754,
 ('BJP', 'Politics'): 0.18867924528301888,
 ('Sensex', 'Politics'): 0.05660377358490566,
 ('Sixer', 'Politics'): 0.018867924528301886,
 ('Congress', 'Politics'): 0.33962264150943394,
 ('Century', 'Politics'): 0.018867924528301886,
 ('TDP', 'Business'): 0.02857142857142857,
 ('Nifty', 'Business'): 0.2857142857142857,
 ('Sidhu', 'Business'): 0.02857142857142857,
 ('BJP', 'Business'): 0.14285714285714285,
 ('Sensex', 'Business'): 0.37142857142857144,
 ('Sixer', 'Business'): 0.02857142857142857,
 ('Congress', 'Business'): 0.05714285714285714,
 ('Century', 'Business'): 0.05714285714285714,
 ('TDP', 'Sports'): 0.030303030303030304,
 ('Nifty', 'Sports'): 0.030303030303030304,
 ('Sidhu', 'Sports'): 0.21212121212121213,
 ('BJP', 'Sports'): 0.06060606060606061,
 ('Sensex', 'Sports'): 0.030303030303030304,
 ('Sixer', 'Sports'): 0.30303030303030304,
 ('Congress', 'Spo

## 4. Process the Query

In [13]:
# Convert the query array into a dictionary to index with the name of the word

query_dict = {}

for i, word in enumerate(words) :
    query_dict[word] = query_data[i]

In [14]:
query_dict

{'TDP': 0,
 'Nifty': 3,
 'Sidhu': 0,
 'BJP': 2,
 'Sensex': 6,
 'Sixer': 0,
 'Congress': 2,
 'Century': 1}

## 5. Find the Probability of the result

In [22]:
categorical_result_probability = {}

for output_class in output_labels :
    cur_prob = 1
    
    for word in words :
        cur_prob *= (conditional_probability[(word, output_class)] ** query_dict[word])
    
    categorical_result_probability[output_class] = cur_prob

In [16]:
print("Categorical scores without applying any smoothing : \n", categorical_result_probability)

Categorical scores without applying any smoothing : 
 {'Politics': 0.0, 'Business': 3.183041412870215e-10, 'Sports': 0.0}


In [23]:
print("Categorical scores after applying Laplace smoothing : \n", categorical_result_probability)

Categorical scores after applying Laplace smoothing : 
 {'Politics': 1.369305495406134e-16, 'Business': 2.3320480184727346e-10, 'Sports': 7.928067403337872e-20}


In [17]:
# Find the maximum probability

result_category = max(categorical_result_probability, key=categorical_result_probability.get)
result_score = categorical_result_probability[result_category]

In [18]:
print(f"The query entered belongs to the category : {result_category}")

The query entered belongs to the category : Business
