# Naive Bay's Classifier

In [2]:
train_direct = '../training_data/train.pkl'
test_direct = '../testing_data/test.pkl'

In [3]:
import pandas as pd
from collections import Counter as ctr
from bs4 import BeautifulSoup
import collections as c

In [4]:
data = pd.read_pickle(train_direct)
train = data.dropna(subset=['label'])
train['text'] = train.msg.map(lambda x: x.get_text())  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Training

In [5]:
#
# Cleaning the text
#
train['clean'] = train.text.map(lambda x: x.lower().split())
train[:3]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,msgID,msg,time,authorID,label,fine_grained,rank,affiliation,text,clean
1703,136322,<p>Me too Birdeye (it won't let me tag you <im...,2015-05-05T07:13:48+00:00,292,green,allClear,Uber contributor,0,Me too Birdeye (it won't let me tag you ) exce...,"[me, too, birdeye, (it, won't, let, me, tag, y..."
1704,136323,<p>This thread is making me giddy with happine...,2015-05-05T07:30:00+00:00,292,green,allClear,Uber contributor,0,This thread is making me giddy with happiness ...,"[this, thread, is, making, me, giddy, with, ha..."
1705,136327,<p>I found out my sister made herself an appoi...,2015-05-05T07:40:56+00:00,292,green,allClear,Uber contributor,0,I found out my sister made herself an appointm...,"[i, found, out, my, sister, made, herself, an,..."


### Probablity functions

In [6]:
from collections import Counter as ctr

t_counts = ctr(train.label)
w_counts = ctr(w for row in train.clean for w in row)
smooth=1e-10
def Pt(T=''):
    return t_counts[T]/len(data)

def Pw(W=''):
    if W not in w_counts:return smooth
    return w_counts[W]/sum(w_counts.values())    

In [7]:
#conditional propabilities
w_t_counts={}

for t in t_counts:
    T_subframe = train[train.label==t]
    w_t_counts[t] = ctr(w for row in T_subframe.clean for w in row)

def Pwt(W='',T=''):
    if W not in w_t_counts[T]:return smooth
    return w_t_counts[T][W]/sum(w_t_counts[T].values())

def Ptw(T='',W=''):
    return Pwt(W,T)*Pt(T)/Pw(W)


### Probability for entire sequence

In [8]:
import numpy as np
def Pts(T,S):
    ''' Returns Probability of a T, type, given a S, sequence of words'''
    return np.prod( [Ptw(T,w) for w in S] )

In [9]:
from operator import itemgetter
def P_types_s(S):
    ''' Returns probability of types for a given S, sequence of words.'''
    return [(t,Pts(t,S)) for t in t_counts]

def best_t_s(S):
    ''' Returns best predicted type for a given S, sequence of words.'''
    return max(P_types_s(S), key=itemgetter(1))[0]

## Testing

In [10]:
test = pd.read_pickle(test_direct)
test = test.dropna(subset=['label'])
test['text'] = test.msg.map(lambda x: x.get_text())
test['clean'] = test.text.map(lambda x: x.lower().split())

In [11]:
test['predict'] = test.clean.map(best_t_s)
test[:3]

Unnamed: 0,msgID,msg,authorID,label,fine_grained,rank,affiliation,text,clean,predict
7782,194795,Hey I've done that!! They have people who come...,5111,green,allClear,Builder,,Hey I've done that!! They have people who come...,"[hey, i've, done, that!!, they, have, people, ...",green
7806,194911,It is a separate issue yes but they just hate ...,5111,amber,followupOk,Builder,,It is a separate issue yes but they just hate ...,"[it, is, a, separate, issue, yes, but, they, j...",green
7815,194938,I'm really sorry but I don't want to call them...,5111,red,currentAcuteDistress,Builder,,I'm really sorry but I don't want to call them...,"[i'm, really, sorry, but, i, don't, want, to, ...",green


### Accuracy

In [12]:
#
# Most common baseline
#
print("Most common baseline: ",max(c.Counter(test['label']).values())/(len(test['label'])))

Most common baseline:  0.54


In [13]:
results = test.predict==test.label
print("Naive Bay's results: ",sum(results)/len(test))

Naive Bay's results:  0.5325


# Classifying based on ranks

### Probablity functions

In [14]:
r_counts = ctr(train['rank'])
#conditional propabilities
r_t_counts={}

def Pr(R=''):
    if R not in r_counts:return smooth
    return r_counts[R]/sum(r_counts.values())    

r_t_counts={}
for t in t_counts:
    T_subframe = train[train.label==t]
    r_t_counts[t] = ctr(T_subframe['rank'])

def Prt(R='',T=''):
    if R not in r_t_counts[T]:return smooth
    return r_t_counts[T][R]/sum(r_t_counts[T].values())

def Ptr(T='',R=''):
    return Prt(R,T)*Pt(T)/Pr(R)

### Probability for entire sequence

In [15]:
def P_types_r(r):
    ''' Returns probability of types for a given rank.'''
    return [(t,Ptr(t,r)) for t in t_counts]
def best_t_r(r):
    ''' Returns best predicted type for a given rank.'''
    return max(P_types_r(r), key=itemgetter(1))[0]
P_types_r('Builder')

[('green', 0.010872871046228711),
 ('amber', 0.004501216545012165),
 ('red', 0.0020833333333333333),
 ('crisis', 0.0006082725060827251)]

### Accuracy

In [16]:
test['predict_rank'] = test['rank'].map(best_t_r)
results = test.predict_rank==test.label
print("Classifying based on ranks: ",sum(results)/len(test))

Classifying based on ranks:  0.545


# Combining rank and NB

In [17]:
list_1 = set(train['rank'])
list_1 

{'Casual scribe',
 'Frequent Visitor',
 'Frequent scribe',
 'Mod',
 'Mod Squad',
 'Post Mod',
 'Rookie',
 'Rookie scribe',
 'Special Guest Contributor',
 'Star contributor',
 'Super frequent scribe',
 'Uber contributor',
 'Visitor',
 'Youth Ambassador'}

In [18]:
# Issue: The ranks in the data sets are not the same!
list_2 = set(test['rank'])
list_2

{'Builder',
 'Casual scribe',
 'Frequent scribe',
 'Mod',
 'Rookie',
 'Rookie scribe',
 'Star contributor',
 'Super frequent scribe',
 'Super star contributor',
 'Uber contributor',
 'Visitor'}

### Combining the two methods

In [19]:
combined_predict =[]
for i in range(len(test)):
    x1 = P_types_r(test['rank'].iloc[i])
    x2 = P_types_s(test['clean'].iloc[i])
    keys,v1=zip(*x1)
    keys,v2=zip(*x2)
    combined_predict.append(max(list(zip(keys,np.multiply(v1,v2))), key=itemgetter(1))[0])

### Accuracy

In [20]:
results = combined_predict==test.label
sum(results)/len(test)

0.53