In [2]:
import pandas as pd 
import numpy as np

In [4]:
train = pd.read_csv(r'/Users/lfarias/Downloads/Kaggle-IFT3395/data/train.csv')
test = pd.read_csv(r'/Users/lfarias/Downloads/Kaggle-IFT3395/data/test.csv')

# Extract features for the training data
X_train = train.iloc[:, :-1].values  # Features in the training data
y_train = train.iloc[:, -1].values   # Labels in the training data

# Extract features for the testing data (no labels to extract)
X_test = test.values    # Features in the testing data


In [5]:
means = train.groupby(["Label"]).mean()  # Estimate mean of each class, feature
var = train.groupby(["Label"]).var()     # Estimate variance of each class, feature
prior = (train.groupby("Label").count() / len(train)).iloc[:, 1]  # Estimate prior probabilities
classes = np.unique(train["Label"].tolist())  # Storing all possible classes

In [6]:
def Normal(n, mu, var):
    # Function to return pdf of Normal(mu, var) evaluated at x
    sd = np.sqrt(var)
    pdf = (np.e ** (-0.5 * ((n - mu) / sd) ** 2)) / (sd * np.sqrt(2 * np.pi)
    
    return pdf

In [10]:
# Calculate class probabilities and class conditional probabilities for the test data
class_probs = {}
class_conditional_probs = []

In [11]:
for i in range(len(X_test)):
    instance = X_test.iloc[i]
    class_likelihood = []
    
    for cls in classes:
        feature_likelihoods = [np.log(prior[cls])]  # Append log prior of class 'cls'
        
        for col in X_train.columns:
            data = instance[col]
            
            mean = means[col].loc[cls]
            variance = var[col].loc[cls]
            
            likelihood = Normal(data, mean, variance)
            
            if likelihood != 0:
                likelihood = np.log(likelihood)
            else:
                likelihood = 1 / len(train)
            
            feature_likelihoods.append(likelihood)
        
        total_likelihood = sum(feature_likelihoods)
        class_likelihood.append(total_likelihood)
    
    max_index = class_likelihood.index(max(class_likelihood))
    predicted_label = classes[max_index]
    
    class_probs[i + 1] = predicted_label  # Store results in a dictionary

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [None]:
# Create a DataFrame with 'SNo' and 'Label' columns
results = pd.DataFrame(class_probs.items(), columns=['SNo', 'Label'])

# Save the results to a CSV file
results.to_csv('results.csv', index=False)

In [7]:
def Predict(X):
    Predictions = []
    
    for i in X.index: # Loop through each instances
        
        ClassLikelihood = []
        instance = X.loc[i]
        
        for cls in classes: # Loop through each class
            
            FeatureLikelihoods = []
            FeatureLikelihoods.append(np.log(prior[cls])) # Append log prior of class 'cls'
            
            for col in X_train.columns: # Loop through each feature
                
                data = instance[col]
                
                mean = means[col].loc[cls] # Find the mean of column 'col' that are in class 'cls'
                variance = var[col].loc[cls] # Find the variance of column 'col' that are in class 'cls'
                
                Likelihood = Normal(data, mean, variance)
                
                if Likelihood != 0:
                    Likelihood = np.log(Likelihood) # Find the log-likelihood evaluated at x
                else:
                    Likelihood = 1/len(train) 
                
                FeatureLikelihoods.append(Likelihood)
                
            TotalLikelihood = sum(FeatureLikelihoods) # Calculate posterior
            ClassLikelihood.append(TotalLikelihood)
            
        MaxIndex = ClassLikelihood.index(max(ClassLikelihood)) # Find largest posterior position
        Prediction = classes[MaxIndex]
        Predictions.append(Prediction)
        
    return Predictions

In [8]:
PredictTrain = Predict(train)
# PredictTest = Predict(X_test)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [9]:
PredictTest = Predict(test)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [12]:
import pandas as pd
import numpy as np

# Define the GaussianNaiveBayes class
class GaussianNaiveBayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for i, c in enumerate(self._classes):
            X_for_class_c = X[y == c]
            self._mean[i, :] = X_for_class_c.mean(axis=0)
            self._var[i, :] = X_for_class_c.var(axis=0)
            self._priors[i] = X_for_class_c.shape[0] / float(n_samples)

    def _calculate_likelihood(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        num = np.exp(- (x - mean) ** 2 / (2 * var))
        denom = np.sqrt(2 * np.pi * var)
        return num / denom 

    def predict(self, X):
        y_pred = [self._classify_sample(x) for x in X]
        return np.array(y_pred)

    def _classify_sample(self, x):
        posteriors = []
        
        for i, c in enumerate(self._classes):
            prior = np.log(self._priors[i])
            posterior = np.sum(np.log(self._calculate_likelihood(i, x)))
            posterior = prior + posterior
            posteriors.append(posterior)
        
        return self._classes[np.argmax(posteriors)]

# Load the train and test data
train = pd.read_csv('/Users/lfarias/Downloads/Kaggle-IFT3395/data/train.csv')
test = pd.read_csv('/Users/lfarias/Downloads/Kaggle-IFT3395/data/test.csv')

# Extract features for the training data
X_train = train.iloc[:, :-1]  # Features in the training data
y_train = train.iloc[:, -1]    # Labels in the training data

# Extract features for the testing data (no labels to extract)
X_test = test.values    # Features in the testing data

# Fit the Gaussian Naive Bayes model
gnb = GaussianNaiveBayes()
gnb.fit(X_train.values, y_train.values)

# Make predictions on the test data
predicted_labels = gnb.predict(X_test)

# Create a DataFrame with 'SNo' and 'Label' columns
results = pd.DataFrame({'SNo': test['SNo'], 'Label': predicted_labels})

# Save the results to a CSV file
results.to_csv('results.csv', index=False)
