In [25]:
#importing necessary libraries
import math
import pandas as pd
import numpy as np

In [26]:
#getting the training data from the .txt files
#and creating the corresponding dataframes
def get_train_data():
  global train_df, train_df_0, train_df_1, train_features
  train_features = pd.read_csv('train-features.txt', delimiter=' ', header = None)
  train_labels = pd.read_csv('train-labels.txt', delimiter=' ', header = None)
  train_df = pd.merge(train_features, train_labels, left_index=True, right_index=True)
  train_df_0=train_df.loc[train_df['0_y'] == 0] # train data with label=0
  train_df_1=train_df.loc[train_df['0_y'] == 1] # train data with label=1

In [27]:
get_train_data()

In [28]:
def skew():
  print("for y=0, number of examples:", len(train_df_0))
  print("for y=1, number of examples:", len(train_df_1))
  ratio=(len(train_df_0)/(len(train_df_0)+len(train_df_1)))
  if ratio==0.5:
    print("There are equal amounts of examples per class. Thus, the data is not skewed in any direction.")
  elif 0.35<ratio<0.65:
    print("The data is slightly skewed.")
  else:
    print("The data is skewed and possibly will impact the classifier accuraracy.")

In [29]:
skew()

for y=0, number of examples: 350
for y=1, number of examples: 350
There are equal amounts of examples per class. Thus, the data is not skewed in any direction.


In [30]:
def params_to_estimate():
  n=train_features.shape[1] #n many Xi's, in this case words
  no_params=(2*n)+1 #number of parameters to be estimated
  print("There are", n, "Xi's and 2 classes.")
  print("With Naive Bayes, there are", no_params, "parameters to estimate.")

In [31]:
params_to_estimate()

There are 2500 Xi's and 2 classes.
With Naive Bayes, there are 5001 parameters to estimate.


In [62]:
#calculating the priors
def priors():

  global P_y0, P_y1
  P_y0=len(train_df_0)/(len(train_df_0)+len(train_df_1)) # P(Y=0)
  P_y1=len(train_df_1)/(len(train_df_0)+len(train_df_1)) # P(Y=1)

  global P_x_given_y0, P_x_given_y1, train0, train1, count_X_given_y0, count_X_given_y1
  train0=train_df_0.drop(columns=["0_y"]) 
  train1=train_df_1.drop(columns=["0_y"]) 
  count_X_given_y0=train0.sum(axis=0)
  count_X_given_y1=train1.sum(axis=0)
  P_x_given_y0=np.array(count_X_given_y0)/(train0.values.sum()) # P(Xj given Y=0)
  P_x_given_y1=count_X_given_y1/(train1.values.sum()) # P(Xj given Y=1)

  global log_X_y0, log_X_y1
  log_X_y0=[] # log of P(Xj given Y=0)
  for item in P_x_given_y0:
    if item==0:
      log_X_y0.append(0) #otherwise log(0) goes to -inf
    else:
      log_X_y0.append(np.log(item)) #numpy.log is the natural logarithm
    
  log_X_y1=[] # log of P(Xj given Y=1)
  for item in P_x_given_y1:
    if item==0:
      log_X_y1.append(0) #otherwise log(0) goes to -inf
    else:
      log_X_y1.append(np.log(item))
  
  #This part will display a warning if any probability is not in range [0,1]
  error=False
  if not 1>P_y0>0:
    error=True
  elif not 1>P_y1>0:
    error=True
  if max(P_x_given_y0)>1: #min-max functions becuse P_x_given_y0 is an array
    error=True
  elif min(P_x_given_y0)<0:
    error=True
  if max(P_x_given_y1)>1: #min-max functions becuse P_x_given_y1 is an array
    error=True
  elif min(P_x_given_y1)<0:
    error=True
  if error==True:
    print("Problem with the prior probabilities!")

In [33]:
priors()

In [34]:
#getting the test data from the .txt files
#and creating the corresponding dataframes
def get_test_data():
  global test_features, test_labels, test_df
  test_features = pd.read_csv('test-features.txt', delimiter=' ', header = None)
  test_labels = pd.read_csv('test-labels.txt', delimiter=' ', header = None)
  test_df = pd.merge(test_features, test_labels, left_index=True, right_index=True)

In [35]:
get_test_data()

In [36]:
#calculations for the predictions
def predictions():

  sum_0_list=[] 
  for i in range(1,len(test_features)+1):
    sum0=sum(test_features[i-1:i]*log_X_y0)+np.log(P_y0) #summation of argmax(y) for Y=0
    sum_0_list.append(sum0)

  #Y=1
  sum_1_list=[]
  for i in range(1,len(test_features)+1):
    sum1=sum(test_features[i-1:i]*log_X_y1)+np.log(P_y1) #summation of argmax(y) for Y=1
    sum_1_list.append(sum1)
    
  global pred_label
  pred_label=[] #stores the predicted labels 
  for i in range(len(sum_1_list)):
    if sum_0_list[i]>=sum_1_list[i]: #not spam in case of ties
      pred_label.append(0) 
    else:
      pred_label.append(1)

  if len(pred_label)!=len(test_features)!=len(sum_0_list)!=len(sum_1_list):
    print("No of predictions doesn't match no of test instances!")
  
  global pred_df
  pred_df=test_labels
  pred_df['Predicted Label'] = np.array(pred_label)
  pred_df.rename(columns = {0:'Actual Label'}, inplace = True)
  return pred_df

In [37]:
predictions()

Unnamed: 0,Actual Label,Predicted Label
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
255,1,0
256,1,0
257,1,0
258,1,0


In [39]:
def performance_and_comments():

  #Accuracy
  pred_df['Residual Error'] = abs(pred_df["Actual Label"]-pred_df["Predicted Label"]) 
    #0 for accurate, 1 for inaccurate classification

  class_error=(pred_df['Residual Error'].values.sum())/len(pred_df)
  print("The classification error is: ", class_error*100, "%",sep="")
  print("The classification accuracy is: ", (1-class_error)*100, "%", sep="")
  
  #Predictions for Spam (Y=1)
  count_spam=pred_df.loc[pred_df["Predicted Label"] == 1].count()[1]
  print("Number of Spam Predictions:", count_spam)

  #Predictions for Spam (Y=0)
  count_ham=pred_df.loc[pred_df["Predicted Label"] == 0].count()[1]
  print("Number of not Spam Predictions:", count_ham)

  return pred_df
  
  if count_spam+count_ham!=len(test_features):
    print("Error in classification!")

In [40]:
performance_and_comments()

The classification error is: 50.0%
The classification accuracy is: 50.0%
Number of Spam Predictions: 0
Number of not Spam Predictions: 260


Unnamed: 0,Actual Label,Predicted Label,Residual Error
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
255,1,0,1
256,1,0,1
257,1,0,1
258,1,0,1


In [42]:
def smoothed_predictions():
  
  alpha=1 
  V = test_features.shape[1] #Vocabulary size, ie no of features, ie no of columns in test_features

  s_P_x_given_y0=(count_X_given_y0+alpha)/(train0.values.sum()+(alpha*V))
  s_P_x_given_y1=(count_X_given_y1+alpha)/(train1.values.sum()+(alpha*V))
  
  s_log_X_y0=[] # log of P(Xj given Y=0)
  for item in s_P_x_given_y0:
    s_log_X_y0.append(np.log(item)) #numpy.log is the natural logarithm
    
  s_log_X_y1=[] # log of P(Xj given Y=1)
  for item in s_P_x_given_y1:
    s_log_X_y1.append(np.log(item))
  
  #This part will display a warning if any probability is not in range [0,1]
  error=False
  if max(s_P_x_given_y0)>1: #min-max functions becuse P_x_given_y0 is an array
    error=True
  elif min(s_P_x_given_y0)<0:
    error=True
  if max(s_P_x_given_y1)>1: #min-max functions becuse P_x_given_y1 is an array
    error=True
  elif min(s_P_x_given_y1)<0:
    error=True
  if error==True:
    print("Problem with the prior probabilities!")

  #Y=0
  s_sum_0_list=[] 
  for i in range(1,len(test_features)+1):
    item=(test_features[i-1:i]*s_log_X_y0).sum(axis=1)
    a=list(item)[0]
    s_sum0=a+np.log(P_y0)
    s_sum_0_list.append(s_sum0)

  #Y=1
  s_sum_1_list=[]
  for i in range(1,len(test_features)+1):
    item=(test_features[i-1:i]*s_log_X_y1).sum(axis=1) 
    item=list(item)[0]
    s_sum1=item+np.log(P_y1)
    s_sum_1_list.append(s_sum1)


  global smoothed_pred_label
  smoothed_pred_label=[] #stores the predicted labels 
  for i in range(len(s_sum_1_list)):
    if s_sum_0_list[i]>=s_sum_1_list[i]: #not spam in case of ties
      smoothed_pred_label.append(0) 
    else:
      smoothed_pred_label.append(1)

  if len(smoothed_pred_label)!=len(test_features)!=len(s_sum_0_list)!=len(s_sum_1_list):
    print("No of predictions doesn't match no of test instances!")
  
  global smoothed_pred_df
  smoothed_pred_df=test_labels
  smoothed_pred_df['Predicted Label'] = np.array(smoothed_pred_label)
  smoothed_pred_df.rename(columns = {0:'Actual Label'}, inplace = True)
  return smoothed_pred_df

In [43]:
smoothed_predictions()

Unnamed: 0,Actual Label,Predicted Label,Residual Error
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
255,1,1,1
256,1,1,1
257,1,1,1
258,1,1,1


In [60]:
def sm_performance_and_comments():
    #Accuracy
  smoothed_pred_df['Residual Error'] = abs(smoothed_pred_df["Actual Label"]-smoothed_pred_df["Predicted Label"]) 
    #0 for accurate, 1 for inaccurate classification

  global class_error, class_acc
  class_error=(smoothed_pred_df['Residual Error'].values.sum())/len(smoothed_pred_df)
  print("The classification error is: ", format(class_error*100,".2f"),"%", sep="")
  class_acc=1-class_error
  print("The classification accuracy is: ", format(class_acc*100,".2f"), "%", sep="")

  #Predictions for Spam (Y=1)
  count_spam=smoothed_pred_df.loc[smoothed_pred_df["Predicted Label"] == 1].count()[1]
  print("Number of Spam Predictions:", count_spam)

  #Predictions for Spam (Y=0)
  count_ham=smoothed_pred_df.loc[smoothed_pred_df["Predicted Label"] == 0].count()[1]
  print("Number of not Spam Predictions:", count_ham)

  return smoothed_pred_df
  
  if count_spam+count_ham!=len(test_features):
    print("Error in classification!")

In [61]:
sm_performance_and_comments()

The classification error is: 2.69%
The classification accuracy is: 97.31%
Number of Spam Predictions: 129
Number of not Spam Predictions: 131


Unnamed: 0,Actual Label,Predicted Label,Residual Error
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
255,1,1,0
256,1,1,0
257,1,1,0
258,1,1,0
