In [1]:
# We are going to use Kaggle's playground data as an example. 
# The data files can be downloaded from https://www.kaggle.com/c/ghouls-goblins-and-ghosts-boo/data

In [2]:
import pandas as pd
import scipy.stats as stats

In [3]:
# Load the training data. To simplify this example we will be ignoring the color feature from the data.

In [4]:
# Columns kept from the training file: the four numeric features plus the label.
train_columns=['bone_length','rotting_flesh','hair_length','has_soul','type']
# Read the CSV and keep only the selected columns, in the order listed above.
training_data=pd.read_csv("../Data/GGG/train.csv")[train_columns]
# Preview the first five rows.
training_data.head(5)

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,type
0,0.354512,0.350839,0.465761,0.781142,Ghoul
1,0.57556,0.425868,0.531401,0.439899,Goblin
2,0.467875,0.35433,0.811616,0.791225,Ghoul
3,0.776652,0.508723,0.636766,0.884464,Ghoul
4,0.566117,0.875862,0.418594,0.636438,Ghost


In [5]:
# Load the test data and ignore the color feature

In [6]:
# Columns kept from the test file: the row id plus the four numeric features.
test_columns=['id','bone_length','rotting_flesh','hair_length','has_soul']
# Read the CSV and keep only the selected columns, in the order listed above.
test_data=pd.read_csv("../Data/GGG/test.csv")[test_columns]
# Preview the first five rows.
test_data.head(5)

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul
0,3,0.471774,0.387937,0.706087,0.698537
1,6,0.427332,0.645024,0.565558,0.451462
2,9,0.549602,0.491931,0.660387,0.449809
3,10,0.638095,0.682867,0.471409,0.356924
4,13,0.361762,0.583997,0.377256,0.276364


In [7]:
# Now we write a function that computes the likelihood p(X|Y) for our Naive Bayes model.
# Because the features are continuous, each feature is modelled with a Normal distribution
# whose parameters (mean and standard deviation) are estimated from the training data.
# Under the naive independence assumption, p(X|Y) is the product of the per-feature densities.

In [8]:
def p_x_y(test_x,train_series_given_y,features):
    """Return the Gaussian Naive Bayes likelihood p(X|Y) of one observation.

    Each feature is modelled as an independent Normal distribution whose mean
    and standard deviation are estimated from ``train_series_given_y``; the
    likelihood is the product of the per-feature densities.

    Parameters
    ----------
    test_x : mapping or pandas.Series
        The observation; must be indexable by every name in ``features``.
    train_series_given_y : pandas.DataFrame
        Training rows belonging to a single class. It may contain extra
        columns (e.g. a string label column); only ``features`` are used.
    features : iterable of str
        Names of the feature columns to include in the product.

    Returns
    -------
    float
        Product of the per-feature Normal densities (``1.0`` when
        ``features`` is empty).
    """
    features=list(features)
    # Estimate the parameters from the feature columns only: calling mean()/std()
    # on a frame that still carries a non-numeric column raises TypeError on
    # pandas >= 2.0, where numeric_only defaults to False.
    class_features=train_series_given_y[features]
    mean=class_features.mean()
    std=class_features.std()
    densities=[stats.norm.pdf(test_x[f],mean[f],std[f]) for f in features]
    p=1.0
    for density in densities:
        p=p*density
    return p

In [9]:
# Calculate p_x_y for every (test row, label) pair.
features=['bone_length','rotting_flesh','hair_length','has_soul']
labels=['Ghoul','Goblin','Ghost']
# The per-label training subsets do not depend on the test row, so build them once.
# Keeping only the feature columns also keeps the string 'type' column out of the
# mean/std estimates computed inside p_x_y.
train_by_label={label:training_data.loc[training_data['type']==label,features] for label in labels}
# Collect plain records and build the frame once instead of growing it row by row.
records=[]
for _,row in test_data.iterrows():
    for label in labels:
        p=p_x_y(row[features],train_by_label[label],features)
        records.append([row['id'],label,p])
# One row per (id, label): the test id, the candidate label and its likelihood.
index_probs=pd.DataFrame(records,columns=['index','label','num_prob'])

In [10]:
# For each id, choose the label with the maximum p_x_y (no class prior is applied, so the classes are treated as equally likely)

In [11]:
# Pick, for every test id, the single row with the highest likelihood.
# idxmax yields exactly one row per id (the first one on ties), whereas merging
# back on the maximum value can return several rows for an id when labels tie.
num_prob=index_probs['num_prob'].astype(float)
best_rows=num_prob.groupby(index_probs['index'],sort=False).idxmax()
# Build the result in one chain so no column is assigned into a sliced frame.
final=(
    index_probs.loc[best_rows,['index','label']]
    .rename(columns={'index':'id','label':'type'})
    .astype({'id':int})
    .reset_index(drop=True)
)

In [12]:
# Preview the first ten predictions.
final.head(10)

Unnamed: 0,id,type
0,3,Ghoul
1,6,Goblin
2,9,Ghoul
3,10,Ghost
4,13,Ghost
5,14,Ghost
6,15,Ghoul
7,16,Ghoul
8,17,Goblin
9,18,Ghoul
