In [157]:
import sys                       # for testing use only
import os                        # for testing use only
from datetime import datetime    # for testing use only
import random                    # for testing use only
import hashlib                   # for testing use only
import pandas as pd
import numpy as np
import math 
import statistics

In [158]:
def load_dataset(file_name, category_col_name):#takes dataset's file name and the label name. returns vectors df, label series and label mapping.
    df = pd.read_csv(file_name)
    df[category_col_name], category_map = pd.factorize(df[category_col_name])
    
    return df.drop(columns=category_col_name), df[category_col_name], pd.Series(category_map)

In [159]:
def calc_class_priors(y_train, lst_class_vals):
    class_priors = []
    for cls in lst_class_vals:
        class_priors.append(y_train.value_counts().iloc[cls]/len(y_train))
    return class_priors

In [160]:
def calc_mean_for_likelihood(training_set, category_col_name):
    class_lst=training_set[category_col_name].unique()
    class_lst=np.sort(class_lst)
    mean_dict={};

    for feat in training_set.drop(columns=category_col_name): 
        mean_dict[feat]=[]
        for cls in class_lst:
            mean_dict[feat].append(training_set[(training_set[category_col_name]==cls)][feat].mean())
    df=pd.DataFrame(mean_dict,index=class_lst)
    return df

In [162]:
def calc_std_for_likelihood(training_set, category_col_name):
    std_dict={}
    class_lst=training_set[category_col_name].unique()
    class_lst=np.sort(class_lst)
    for feat in training_set.drop(columns=category_col_name):
        std_dict[feat]=[]
        for cls in class_lst:
            std_dict[feat].append(training_set[(training_set[category_col_name])==cls][feat].std())
    df=pd.DataFrame(std_dict, index=class_lst)
    return df

In [163]:
def fit(training_set,category_col_name):
    class_list=np.sort(training_set[category_col_name].unique()) #sorting class label to use as an indices
    
    lst_class_priors = calc_class_priors(training_set[category_col_name],class_list)#claculation priors
    df_mean_likelihood=calc_mean_for_likelihood(training_set, category_col_name)#claculation mean
    df_std_likelihood=calc_std_for_likelihood(training_set, category_col_name)#claculation std
    return lst_class_priors, df_mean_likelihood, df_std_likelihood

In [164]:
def calc_gaussian_pdf_prob(x_feature_val, feature_mean, feature_std):
    exponent = np.exp(-((x_feature_val-feature_mean)**2 / (2 * feature_std**2 )))
    return (1 / ((2 * np.pi)**(1/2) * feature_std)) * exponent

In [106]:
def calc_aposterior_probs(X_test, lst_class_priors, df_trained_mean, df_trained_std, lst_class_vals):
    num_classes = len(lst_class_vals)
    df_prob_per_test_inst_per_class = pd.DataFrame(np.zeros((X_test.shape[0], num_classes)), columns=lst_class_vals, index=X_test.index)
    # YOUR CODE HERE
    for cls, prior in enumerate(lst_class_priors): #for each class #cls represents the lst_class_priors index and the actual class number
        for row_index, vector in X_test.iterrows():#extacting feature vector from the test set
            df_prob_per_test_inst_per_class.iloc[row_index,cls]=prior
            for feat, x_val in vector.items(): #for each feature vector, x value
                mean = df_trained_mean.loc[cls,feat]
                std = df_trained_std.loc[cls,feat]
                
                feat_prob = calc_gaussian_pdf_prob(x_val, mean, std) #calculate the conditional prob of x value
                
                df_prob_per_test_inst_per_class.iloc[row_index, cls]*=feat_prob #prior and conditional probs product
            
           
    return df_prob_per_test_inst_per_class

In [107]:
def predict(X_test, lst_class_priors, df_trained_mean, df_trained_std, lst_class_vals):
    # YOUR CODE HERE
    y=pd.Series(0, index=range(X_test.shape[0])) #creating empty prediction series
    df=calc_aposterior_probs(X_test, lst_class_priors, df_trained_mean, df_trained_std, lst_class_vals)
    
    for vector_index, probs_per_cls in df.iterrows():
        y.iloc[vector_index]=np.argmax(probs_per_cls)
    return y

In [169]:
file_name = str(os.getcwd())+str(os.sep)+'Iris.csv'
label_col='species'
X, y, label_mapping=load_dataset(file_name, label_col)
train_set=X.copy(deep=True)
train_set[label_col]=y

In [171]:
lst_priors, df_mean, df_std=fit(train_set,label_col)

In [156]:
class_priors=calc_class_priors(y,label_mapping.index.tolist())
class_priors

[0.5, 0.35, 0.15]

In [111]:
ed=pd.read_csv(file_name)
ed

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
18,5.7,3.8,1.7,0.3,Iris-setosa
21,5.1,3.7,1.5,0.4,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
41,4.5,2.3,1.3,0.3,Iris-setosa


In [143]:
#def drop_rand_rows(label_ratio_dict, df, label_col_name):
 #   new_df_list=[]
  #  for lbl in label_ratio_dict:
   #     new_df_list.append(df[(df[label_col_name]==lbl)].sample(n=label_ratio_dict[lbl]))
    #return pd.concat(new_df_list, axis=0)

In [145]:
label_ratio={
    'Iris-setosa': 15,
    'Iris-versicolor': 35,
    'Iris-virginica': 50
}

    
clear_df=drop_rand_rows(label_ratio, ed, label_col)

In [153]:
clear_df[label_col].value_counts()

Iris-virginica     50
Iris-versicolor    35
Iris-setosa        15
Name: species, dtype: int64

In [154]:
clear_df.to_csv('Iris.csv')