In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nltk

In [66]:
#Read the csv file
df = pd.read_csv("yelp_data_official_training.csv", low_memory=False, delimiter='|')
df.head()

Unnamed: 0,ID,Category,Review Text
0,0,2,Don't waste your time. We had two different p...
1,1,2,I will start by saying we have a nice new deck...
2,2,2,When I wanted a deck for the back of my home I...
3,3,5,Our cat went out the other night and must have...
4,4,5,Greentree Animal clinic is the absolute best! ...


In [67]:
#Take out NaN
filtered_data = df["Review Text"].notnull()
df_filtered = df[filtered_data]

#Shuffle the data
random_index = np.random.permutation(df_filtered.index)
df_shuffled = df.ix[random_index, ['Category', 'Review Text']]
df_shuffled.reset_index(drop=True, inplace=True)
df_shuffled[:5]

Unnamed: 0,Category,Review Text
0,2,Worst service ever!!! Do not go to this compan...
1,4,Bryant Pest Control is great.....the treatment...
2,1,"Found this place via Yelp, decided to go get m..."
3,2,"JSM is timely, polite, friendly, and knowledge..."
4,1,My friend wanted to get her first tattoo so I ...


In [68]:
#Split the data into train and dev
rows, columns = df_shuffled.shape
train_size = round(rows*.7)
dev_size   = round(rows*.3)


df_train = df_shuffled.loc[:train_size]
df_dev = df_shuffled.loc[train_size:dev_size+train_size].reset_index(drop=True)

## Train the data

In [69]:
#features
def unigram_feature(x, unigram):
    if(type(x) == float):
        print(x)
    word_list = x.lower().split(" ")
    return word_list.count(unigram)

def bigram_feature(x, bigram):
    bigram_tuple = tuple(bigram.split())
    word_list = x.lower().split(" ")
    bi = nltk.FreqDist(nltk.bigrams(word_list))
    return bi[bigram_tuple]

In [70]:
#Running features on train set
train_doctor_feature = df_train['Review Text'].apply(lambda x: unigram_feature(x, ('Dr')))

In [71]:
df_train_features = pd.DataFrame({'doctor': train_doctor_feature})

In [72]:
dev_doctor_feature = df_dev['Review Text'].apply(lambda x: unigram_feature(x, ('Dr')))

In [73]:
df_dev_features = pd.DataFrame({'doctor': dev_doctor_feature})

##Build the model

In [74]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb = MultinomialNB()
nb_model = nb.fit(df_train_features, df_train.Category)
nb_predictions = nb_model.predict(df_dev_features)

accuracy_score(df_dev.Category, nb_predictions)

0.31611111111111112

In [75]:
df_test = pd.read_csv("yelp_data_official_test_nocategories.csv", low_memory=False, delimiter='|')
df_test.head()

Unnamed: 0,ID,Review Text
0,0,We had the same doctor in Houston for over 12 ...
1,1,My mother passed away recently at the age of 9...
2,2,"Updating! So, the dispute was resolved at this..."
3,3,We just moved here from WA and are looking for...
4,4,They have lost the best of the best. The last ...


In [82]:
test_doctor_feature = df_test['Review Text'].apply(lambda x: unigram_feature(x, ('Dr')))
df_test_features = pd.DataFrame({'doctor': test_doctor_feature})
nb_test_predictions = nb_model.predict(df_test_features)
final_test_predictions = pd.DataFrame({'Category': nb_test_predictions})

In [None]:
DataFrame.to_csv('yelp_data_official_test_submission.csv', header=True, index=True,)