In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Question 1

# Implement in python program of the following problems using Bayes Theorem.
# a) Of the students in the college, 60% of the students reside in the hostel and 40% of the students are day
# scholars. Previous year results report that 30% of all students who stay in the hostel scored A Grade and 20%
# of day scholars scored A grade. At the end of the year, one student is chosen at random and found that he/she
# has an A grade. What is the probability that the student is a hosteler?
# b) Suppose you're testing for a rare disease, and you have the following information:
#  The disease has a prevalence of 0.01 (1% of the population has the disease).
#  The test is not perfect:
#  The test correctly identifies the disease (true positive) 99% of the time (sensitivity).
#  The test incorrectly indicates the disease (false positive) 2% of the time (1 - specificity).
# Calculate the probability of having the disease given a positive test result using Bayes' theorem.

In [5]:
prob_hosteler = 0.60
prob_day_scholar = 0.40
prob_A_given_hosteler = 0.30
prob_A_given_day_scholar = 0.20

prob_A = prob_A_given_hosteler * prob_hosteler + prob_A_given_day_scholar * prob_day_scholar

prob_hosteler_given_A = (prob_A_given_hosteler * prob_hosteler) / prob_A

print(f"Probability: {prob_hosteler_given_A:.4f}")

Probability: 0.6923


In [6]:
prob_disease = 0.01
prob_no_disease = 0.99
prob_positive_given_disease = 0.99
prob_positive_given_no_disease = 0.02

prob_positive = prob_positive_given_disease * prob_disease + prob_positive_given_no_disease * prob_no_disease

prob_disease_given_positive = (prob_positive_given_disease * prob_disease) / prob_positive

print(f"Probability: {prob_disease_given_positive:.4f}")


Probability: 0.3333


In [7]:
# Question 2

# Develop a function python code for Naïve Bayes classifier from scratch without using scikit-learn library,
# to predict whether the buyer should buy computer or not. Consider a following sample training dataset stored
# in a CSV file containing information about following buyer conditions (such as “<=30,” “medium,” “Yes,”
# and “fair”) and whether the player played golf (“Yes” or “No”).

In [25]:
def fit(X, y):
    classes = y.unique()
    class_probs = y.value_counts(normalize=True).to_dict()

    feature_probs = {}
    for feature in X.columns:
        feature_probs[feature] = {}
        for cls in classes:
            subset = X[y == cls]
            feature_prob = subset.groupby(feature).size() / subset.shape[0]
            feature_probs[feature][cls] = feature_prob.to_dict()
    
    return class_probs, feature_probs, classes

def predict(X, class_probs, feature_probs, classes):
    predictions = []
    probabilities = []
    for _, row in X.iterrows():
        class_probs_temp = {}
        for cls in classes:
            prob = class_probs[cls]
            for feature in X.columns:
                value = row[feature]
                feature_prob = feature_probs.get(feature, {}).get(cls, {}).get(value, 0)
                prob *= feature_prob
            class_probs_temp[cls] = prob
        
        total_prob = sum(class_probs_temp.values())
        if total_prob == 0:
            total_prob = 1

        class_probs_normalized = {cls: (prob / total_prob) for cls, prob in class_probs_temp.items()}
        
        predicted_class = max(class_probs_normalized, key=class_probs_normalized.get)
        predictions.append(predicted_class)
        probabilities.append(class_probs_normalized[predicted_class])
        
    return predictions, probabilities

data = pd.read_csv('q2.csv')

X = data.drop('buys_computer', axis=1)
y = data['buys_computer']

class_probs, feature_probs, classes = fit(X, y)

test_data = pd.DataFrame([{
    'age': '<=30',
    'income': 'medium',
    'student': 'yes',
    'credit_rating': 'fair'
}])

predictions, probabilities = predict(test_data, class_probs, feature_probs, classes)

print(f'Prediction: {predictions[0]}')
print(f'Probability: {probabilities[0]}')

Prediction: yes
Probability: 0.8350515463917526


In [22]:
# Question 3

# Write a Python function to implement the Naive Bayes classifier without using the scikit-learn library for the
# following sample training dataset stored as a .CSV file. Calculate the accuracy, precision, and recall for your train/test
# dataset.
# a. Build a classifier that determines whether a text is about sports or not.
# b. Determine which tag the sentence "A very close game" belongs to.

In [29]:
def fit(X, y):
    classes = y.unique()
    class_counts = y.value_counts()
    total_count = len(y)
    class_probs = {cls: count / total_count for cls, count in class_counts.items()}

    word_probs = {cls: {} for cls in classes}
    vocabulary = set()

    for cls in classes:
        subset = X[y == cls]
        words = ' '.join(subset).split()
        vocabulary.update(words)
        word_counts = pd.Series(words).value_counts()
        total_words = len(words)
        for word in vocabulary:
            word_probs[cls][word] = (word_counts.get(word, 0) + 1) / (total_words + len(vocabulary))

    return class_probs, word_probs, vocabulary, classes

def predict(X, class_probs, word_probs, vocabulary, classes):
    predictions = []
    confidences = []
    
    for text in X:
        words = text.split()
        class_probs_score = {}
        for cls in classes:
            prob = class_probs[cls]
            for word in words:
                prob *= word_probs[cls].get(word, 1 / (sum(word_probs[cls].values()) + len(vocabulary)))
            class_probs_score[cls] = prob

        total_score = sum(class_probs_score.values())
        class_probs_normalized = {cls: score / total_score for cls, score in class_probs_score.items()}

        predicted_class = max(class_probs_normalized, key=class_probs_normalized.get)
        predictions.append(predicted_class)
        confidences.append(class_probs_normalized[predicted_class])
    
    return predictions, confidences

df = pd.read_csv("q3.csv")

class_probs, word_probs, vocabulary, classes = fit(df['text'], df['tag'])

y_true = df['tag']
y_pred, confidences = predict(df['text'], class_probs, word_probs, vocabulary, classes)

test_sentence = ["A very close game"]
predicted_tag, confidence = predict(test_sentence, class_probs, word_probs, vocabulary, classes)
print(f'Prediction: {predicted_tag[0]}')
print(f'Probability: {confidence[0]}')

Prediction: sports
Probability: 0.9417083804295171
