# Import Training Data, Add Labels, Check

In [1]:
import numpy as np
import pandas as pd
import matplotlib
#add labels, print dataframes to check
import numpy as np
train_map = pd.read_csv('train.map', header = None, delimiter=' ', names= ['labels', 'label_id'])
train_data = pd.read_csv('train.data', header = None, delimiter=' ', names=['doc_id', 'word_id', 'count'] )
train_label = pd.read_csv('train.label', header = None, delimiter=' ', names= ['label_id'])
vocab_list = pd.read_csv('vocabulary.txt', header = None, delimiter=' ', names= ['word_list']).reset_index()
train_label=train_label.reset_index()
train_label.columns = ['doc_id', 'label_id']
vocab_list.columns=['word_id', 'words']
#add one to the doc ID so that there are no zeroes
train_label['doc_id']=train_label['doc_id']+1
vocab_list['word_id']=vocab_list['word_id']+1
#print train_map
#print train_data[:10]
#train_label
vocab_list.head(5)

Unnamed: 0,word_id,words
0,1,archive
1,2,name
2,3,atheism
3,4,resources
4,5,alt


# Merge Training Data Tables

In [2]:
train_data_joined = pd.merge( train_label, train_data, on='doc_id', how="inner")
train_data_joined.head(5)

Unnamed: 0,doc_id,label_id,word_id,count
0,1,1,1,4
1,1,1,2,2
2,1,1,3,10
3,1,1,4,4
4,1,1,5,2


# Get Total Doc Count & Docs Per Class

In [3]:
#Get total document count by searching the unique values, this could also be done by taking max of the ID 
total_doc_count= train_data_joined['doc_id'].unique().shape
# Count number of docs for each class
docs_per_class=train_data_joined.groupby('label_id').doc_id.nunique()

# Calculate Pi for Classes

In [4]:
# Calculate Pi for Each Class, this is the number of documents in a class/total number of docs
df_docs_frac_per_class=np.log(train_data_joined.groupby('label_id').doc_id.nunique()/total_doc_count)
pi_values=df_docs_frac_per_class.reset_index()
pi_values.columns=['label_id','log_pi']
pi_values['log_pi']

0    -3.156025
1    -2.965060
2    -2.980672
3    -2.954786
4    -2.975441
5    -2.946304
6    -2.963340
7    -2.946304
8    -2.939570
9    -2.942932
10   -2.936220
11   -2.942932
12   -2.947995
13   -2.942932
14   -2.944616
15   -2.934549
16   -3.029025
17   -2.994757
18   -3.189926
19   -3.400222
Name: log_pi, dtype: float64

# Calculate Total Words Per Class

In [5]:
words_per_class=train_data_joined[['label_id','count']].groupby('label_id').sum().reset_index()
words_per_class.columns=['label_id','words_per_label']
total_words_per_class=words_per_class
total_words_per_class.head(5)

Unnamed: 0,label_id,words_per_label
0,1,148812
1,2,110358
2,3,90767
3,4,99146
4,5,86190


# Calculate Total Words Per Doc

In [6]:
sum_words_per_doc=train_data_joined.groupby(['label_id','word_id'])['count'].sum().reset_index()
sum_words_per_doc.head(5)

Unnamed: 0,label_id,word_id,count
0,1,1,13
1,1,2,63
2,1,3,275
3,1,4,9
4,1,5,82


# Set Up Training Data Probability Distribution (Pj)

In [7]:
#count of a specific word in the same class/total number of words in each class
#Alpha value for smoothing
a = 0.001

#Calculate probability of each word based on the class

pb_j=train_data_joined.groupby('label_id')
pb_ij=train_data_joined[['label_id','word_id','count']].groupby(['label_id','word_id'])
Prob =  (pb_ij['count'].sum() + 1) / (pb_j['count'].sum() + 61188) 
Prob.unstack()

#add one for all of the null values as (1/count+V+1)
for c in range(1,21):
    Prob.loc[c,:] = Prob.loc[c,:].fillna(a/(pb_j['count'].sum()[c] + 16689))
    
#Convert to dictionary for greater speed
Prob_dict = Prob.to_dict()

Prob_dict

  return np.sum(name == np.asarray(self.names)) > 1


{(9, 12673): 1.2208596072494644e-05,
 (14, 16286): 1.3855149035219789e-05,
 (11, 4904): 6.9151169395668176e-05,
 (8, 2486): 0.00010839180786125848,
 (15, 20554): 9.3065676447869257e-06,
 (7, 8217): 3.2711273940563612e-05,
 (20, 53255): 1.664041179472388e-05,
 (17, 26755): 1.6870376462450761e-05,
 (19, 14921): 1.2115631587874677e-05,
 (14, 8693): 9.2367660234798597e-06,
 (11, 2261): 7.4090538638215902e-05,
 (8, 4049): 3.4228991956186887e-05,
 (16, 47256): 1.9050884913604238e-05,
 (13, 28518): 6.0841683854442355e-05,
 (12, 18895): 7.6439742550947096e-06,
 (14, 10832): 2.7710298070439578e-05,
 (11, 3710): 2.4696846212738633e-05,
 (14, 41316): 1.8473532046959719e-05,
 (5, 5004): 1.3570546485906987e-05,
 (18, 49858): 6.3292541290471624e-06,
 (17, 5205): 2.9523158809288828e-05,
 (9, 7536): 1.8312894108741967e-05,
 (6, 23800): 1.4016464673836866e-05,
 (14, 43971): 9.2367660234798597e-06,
 (11, 36559): 9.8787384850954535e-06,
 (20, 8025): 1.1093607863149254e-05,
 (4, 19466): 3.7421881821697205

# Import Test Data

In [8]:
test_map = pd.read_csv('test.map', header = None, delimiter=' ', names= ['test_labels', 'test_label_id'])
test_data = pd.read_csv('test.data', header = None, delimiter=' ', names=['test_doc_id', 'test_word_id', 'test_count'] )
test_label = pd.read_csv('test.label', header = None, delimiter=' ', names= ['test_label_id'])
test_label=test_label.reset_index()
test_label.columns = ['test_doc_id', 'label_id']
print test_map.head(5)
print test_data.head(5)
test_label.head(5)

                test_labels  test_label_id
0               alt.atheism              1
1             comp.graphics              2
2   comp.os.ms-windows.misc              3
3  comp.sys.ibm.pc.hardware              4
4     comp.sys.mac.hardware              5
   test_doc_id  test_word_id  test_count
0            1             3           1
1            1            10           1
2            1            12           8
3            1            17           1
4            1            23           8


Unnamed: 0,test_doc_id,label_id
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


# Merge Test Data

In [9]:
#test_label['test_doc_id']=test_label['test_doc_id']
merged_test_data = pd.merge( test_label, test_data, on='test_doc_id', how="inner")
merged_test_data.head(5)

Unnamed: 0,test_doc_id,label_id,test_word_id,test_count
0,1,1,3,1
1,1,1,10,1
2,1,1,12,8
3,1,1,17,1
4,1,1,23,8


# Get Single Test Document for Testing

In [10]:
df=merged_test_data.loc[merged_test_data['test_doc_id']==1];
df.columns = df.columns.str.replace('test_word_id','word_id')
#sum_words_per_doc=df.groupby(['word_id','doc_id'])['count'].sum().reset_index()
sum_words_per_doc.head(5)
df.head(5)

Unnamed: 0,test_doc_id,label_id,word_id,test_count
0,1,1,3,1
1,1,1,10,1
2,1,1,12,8
3,1,1,17,1
4,1,1,23,8


# Map Probability for a Doc that Belongs to Each Class
#1) Match on Doc ID, #2 Pull Out Label ID, Probability Word Belongs to that class, and count of word occurence in document; sort by word ID so you can see the map of each word mapping to each class #3 multiply words to get prior probability #4 sum prior probability over each class (it can be added due to the log). #5 raname so that ID's line up and pie values for each class can be multiplied #6 add up the pi with the prior prob, can add because of the log values #7 Answer gives the probability that the document belongs to each class
#Log(Probability(word/class)^N,words)summed

I ended up not using this, see other document

In [11]:
# single_doc_dist= pd.merge(prob_distr,df, on='word_id')
# single_doc_dist= single_doc_dist[['label_id_x','word_id','log(prob)','test_count']].sort_values(['word_id','label_id_x'])
# single_doc_dist ['prior_prob']= single_doc_dist['test_count']*single_doc_dist['log(prob)']
# single_doc_dist=single_doc_dist.groupby(['label_id_x'])['prior_prob'].sum().round(3).reset_index()
# single_doc_dist.columns=['label_id','prior_prob']
# single_doc_dist = pd.merge (single_doc_dist,pi_values, on='label_id' )
# single_doc_dist['probability']=single_doc_dist['prior_prob']+single_doc_dist['log_pi']
# single_doc_dist

# Setup New Classifier Function for All Words

In [12]:
#Calculate IDF
pb_ij = train_data_joined.groupby(['word_id'])
tot = len(train_data_joined['doc_id'].unique())
idf = np.log(tot/pb_ij['doc_id'].count())
idf_dict = idf.to_dict()

In [13]:
def Naive_Bayes_Classifer (train_data_joined, smooth = False, IDF = True):

    tr_data_dict = train_data_joined.to_dict()
    new_dict = {}
    prediction = []

    #new_dict = {docIdx : {wordIdx: count},....}
    for idx in range(len(tr_data_dict['doc_id'])):
        docIdx = tr_data_dict['doc_id'][idx]
        wordIdx = tr_data_dict['word_id'][idx]
        count = tr_data_dict['count'][idx]
        try: 
            new_dict[doc_id][word_id] = count 
        except:
            new_dict[tr_data_dict['doc_id'][idx]] = {}
            #new_dict[doc_id][word_id] = count

    for doc_id in range(1, len(new_dict)+1):
        score_dict = {}
        for class_id in range(1,21):
            score_dict[class_id] = 1
            for word_id in new_dict[doc_id]:
                try:
                    probability = Pr_dict[word_id][class_id]
                    power = new_dict[doc_id][word_id]   
                    score_dict[class_id] += power*np.log(probability) 
                    if IDF:
                        score_dict[class_id] += power*np.log(probability*IDF_dict[word_id]) 
                except:
                    score_dict[class_id] += 0          
            score_dict[class_id] +=  pi_values['log_pi'][class_id-1]       
        max_score = max(score_dict)
        prediction.append(max_score)
    return prediction

# Test Accuracy of Classifier

In [14]:
test_data = pd.read_csv('test.data', header = None, delimiter=' ', names=['test_doc_id', 'test_word_id', 'test_count'] )
test_label = pd.read_csv('test.label', header = None, delimiter=' ', names= ['test_label_id'])
test_label=test_label['test_label_id'].tolist()

predict = Naive_Bayes_Classifer(train_data_joined, smooth=False, IDF=True)

total = len(test_label)
val = 0
for i,j in zip(predict, test_label):
    if i == j:
        val +=1
    else:
        pass
print("Error:\t",(1-(val/total)) * 100, "%")

('Error:\t', 100, '%')
