In [16]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

Question 1

In [5]:
def calculate_prior_probabilities(df, class_column):
    """Return the prior probability of every class label.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset that contains the class label column.
    class_column : str
        Name of the column holding the class labels.

    Returns
    -------
    pandas.Series
        Indexed by class label; each value is that label's occurrence
        count divided by the total number of rows in ``df``.
    """
    # Tally each label, then scale the tallies by the overall row count
    # to turn them into relative frequencies.
    label_tallies = df[class_column].value_counts()
    return label_tallies.div(len(df))

# Read the lab dataset from the Excel workbook into a DataFrame.
# NOTE(review): this is an absolute path on a local D: drive, so the cell
# only runs on a machine that has the file at this location.
df = pd.read_excel("D:\\ThirdYear\\19cse305\\lab8\\lab8_data.xlsx")

# Column that holds the class label used throughout the notebook
class_column = 'buys_computer'

# Relative frequency of each class label (count / number of rows)
prior_probabilities = calculate_prior_probabilities(df, class_column)

# Show the resulting Series of priors
print("Prior Probabilities for each class in 'buys_computer':")
print(prior_probabilities)


Prior Probabilities for each class in 'buys_computer':
buys_computer
yes    0.642857
no     0.357143
Name: count, dtype: float64


Question 2

In [7]:
def calculate_class_conditional_densities(df, class_column):
    """Estimate P(feature value | class) for every non-class column.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing the feature columns and the class column.
    class_column : str
        Name of the class label column; it is excluded from the features.

    Returns
    -------
    dict
        ``{feature: {class_value: pandas.Series}}``. Each Series maps every
        value the feature takes anywhere in ``df`` to its relative frequency
        within that class, sorted by feature value. Values that never occur
        in a class are present with probability ``0.0``.
    """
    rows_by_class = df.groupby(class_column)
    feature_names = [name for name in df.columns if name != class_column]

    densities_by_feature = {}
    for feature_name in feature_names:
        # Every value the feature takes in the whole dataset
        observed_values = df[feature_name].unique()

        per_class = {}
        for label, rows in rows_by_class:
            # Relative frequency of each feature value inside this class
            proportions = rows[feature_name].value_counts(normalize=True)

            # Values unseen in this class get an explicit zero probability
            unseen = [val for val in observed_values if val not in proportions]
            for val in unseen:
                proportions[val] = 0.0

            per_class[label] = proportions.sort_index()

        densities_by_feature[feature_name] = per_class

    return densities_by_feature


# Class label column (same value as assigned in the Question 1 cell;
# repeated here so this cell states its own input)
class_column = 'buys_computer'

# Nested dict: feature -> class value -> Series of P(feature value | class)
class_conditional_densities = calculate_class_conditional_densities(df, class_column)

# Print one Series per (feature, class) combination
print("Class Conditional Densities:")
for feature, densities in class_conditional_densities.items():
    print(f"\nFeature: {feature}")
    for class_value, density in densities.items():
        print(f"  Class: {class_value}")
        print(density)


Class Conditional Densities:

Feature: age
  Class: no
age
31...40    0.0
<=30       0.6
>40        0.4
Name: proportion, dtype: float64
  Class: yes
age
31...40    0.444444
<=30       0.222222
>40        0.333333
Name: proportion, dtype: float64

Feature: income
  Class: no
income
high      0.4
low       0.2
medium    0.4
Name: proportion, dtype: float64
  Class: yes
income
high      0.222222
low       0.333333
medium    0.444444
Name: proportion, dtype: float64

Feature: student
  Class: no
student
no     0.8
yes    0.2
Name: proportion, dtype: float64
  Class: yes
student
no     0.333333
yes    0.666667
Name: proportion, dtype: float64

Feature: credit_rating
  Class: no
credit_rating
excellent    0.6
fair         0.4
Name: proportion, dtype: float64
  Class: yes
credit_rating
excellent    0.333333
fair         0.666667
Name: proportion, dtype: float64


Question 3

In [9]:
def chi_square_test(df, feature1, feature2):
    """Run a chi-square test of independence between two columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing both columns.
    feature1, feature2 : str
        Names of the two columns to cross-tabulate.

    Returns
    -------
    tuple
        ``(chi2, p_value)`` as reported by ``chi2_contingency``.
    """
    # Observed co-occurrence counts of the two columns' values
    observed = pd.crosstab(df[feature1], df[feature2])

    # NOTE(review): chi2_contingency is called with its default options --
    # confirm those defaults are what the analysis intends.
    outcome = chi2_contingency(observed)

    # Only the statistic and the p-value are handed back to the caller;
    # the remaining entries of the result are not used.
    statistic, p_value = outcome[0], outcome[1]
    return statistic, p_value


# Features whose pairwise independence is examined
features = ['age', 'income', 'student', 'credit_rating']

# Every unordered pair exactly once, in the order (0,1), (0,2), ..., (2,3)
feature_pairs = [
    (features[first], features[second])
    for first in range(len(features))
    for second in range(first + 1, len(features))
]

for feature1, feature2 in feature_pairs:
    chi2, p_value = chi_square_test(df, feature1, feature2)

    print(f"Chi-Square test between {feature1} and {feature2}:")
    print(f"  Chi2 Statistic = {chi2}")
    print(f"  p-value = {p_value}")
    # The pair is reported as dependent when the p-value falls below 0.05
    print(
        f"  Result: {feature1} and {feature2} are dependent."
        if p_value < 0.05
        else f"  Result: {feature1} and {feature2} are independent."
    )
    print("\n")


Chi-Square test between age and income:
  Chi2 Statistic = 3.3249999999999997
  p-value = 0.5049810026322079
  Result: age and income are independent.


Chi-Square test between age and student:
  Chi2 Statistic = 0.4
  p-value = 0.8187307530779818
  Result: age and student are independent.


Chi-Square test between age and credit_rating:
  Chi2 Statistic = 0.11666666666666664
  p-value = 0.9433354498734922
  Result: age and credit_rating are independent.


Chi-Square test between income and student:
  Chi2 Statistic = 5.666666666666666
  p-value = 0.05881647164242991
  Result: income and student are independent.


Chi-Square test between income and credit_rating:
  Chi2 Statistic = 0.7291666666666666
  p-value = 0.6944859597510076
  Result: income and credit_rating are independent.


Chi-Square test between student and credit_rating:
  Chi2 Statistic = 0.0
  p-value = 1.0
  Result: student and credit_rating are independent.




Question 4

In [12]:
def load_data(file_path, class_column):
    """Read an Excel dataset and split it into encoded features and target.

    Parameters
    ----------
    file_path : str
        Location of the Excel file, passed straight to ``pd.read_excel``.
    class_column : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is every column except ``class_column`` after
        ``pd.get_dummies(..., drop_first=True)`` and ``y`` is the untouched
        ``class_column`` Series.
    """
    raw = pd.read_excel(file_path)

    # Target is taken as-is; only the predictors are encoded
    target = raw[class_column]
    predictors = raw.drop(columns=[class_column])

    # One indicator column per category, minus the first level of each feature
    encoded_predictors = pd.get_dummies(predictors, drop_first=True)

    return encoded_predictors, target

def train_naive_bayes(X, y, test_size=0.3, random_state=42):
    """Fit a Gaussian Naive Bayes model on a train/test split of the data.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels aligned with ``X``.
    test_size : float, optional
        Forwarded to ``train_test_split``; defaults to ``0.3``, the value
        that was previously hard-coded.
    random_state : int, optional
        Forwarded to ``train_test_split``; defaults to ``42``, the value
        that was previously hard-coded.

    Returns
    -------
    tuple
        ``(model, train_accuracy, test_accuracy)``: the fitted ``GaussianNB``
        and its accuracy on the training and the held-out portion.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = GaussianNB()
    model.fit(X_train, y_train)

    # Score the same fitted model on both portions of the split
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))

    return model, train_accuracy, test_accuracy

# Location of the Excel dataset.
# NOTE(review): absolute path on a local D: drive -- adjust before running elsewhere.
file_path = "D:\\ThirdYear\\19cse305\\lab8\\lab8_data.xlsx"  # same workbook the Question 1 cell reads
class_column = 'buys_computer'  # target column to predict

# X: one-hot encoded features, y: raw class labels
X, y = load_data(file_path, class_column)

# Fit GaussianNB on a train/test split and collect both accuracies
model, train_accuracy, test_accuracy = train_naive_bayes(X, y)

# Report the accuracies as percentages with two decimals
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Training Accuracy: 77.78%
Test Accuracy: 60.00%


Question 5

In [17]:
def load_and_prepare_data(file_path='D:\\ThirdYear\\19cse305\\lab8\\D2-Processed-features.csv'):
    """Load a CSV dataset and label-encode its object-dtype columns.

    Parameters
    ----------
    file_path : str, optional
        CSV file to read. Defaults to the path that was previously
        hard-coded in the body, so calls without arguments behave as before.

    Returns
    -------
    tuple
        ``(df, label_encoders)``: the DataFrame with every object-dtype
        column replaced by the output of ``LabelEncoder.fit_transform``, and
        a dict mapping each encoded column name to its fitted encoder.
    """
    df = pd.read_csv(file_path)

    # Keep each fitted encoder so the caller can look it up by column name
    label_encoders = {}
    for column in df.select_dtypes(include=['object']).columns:
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
        label_encoders[column] = encoder

    return df, label_encoders

In [18]:
# NOTE: this rebinds `df`, replacing the frame loaded in the Question 1 cell
df, label_encoders = load_and_prepare_data()

# Target is the 'believability' column; every other column is a feature
y = df['believability']
X = df.drop(columns=['believability'])

# Hold out 30% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build and fit the Gaussian Naive Bayes model in one step
model = GaussianNB().fit(X_train, y_train)

# Predictions on both portions of the split
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Fraction of correctly predicted labels on each portion
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Last expression of the cell: displayed as a (train, test) tuple
(train_accuracy, test_accuracy)


(0.28303747534516766, 0.26436781609195403)