In [7]:
import pandas as pd
import sklearn as sk
from sklearn.tree import DecisionTreeClassifier
import requests
import sys
import utils
from nltk.stem.porter import PorterStemmer

# Data Preparation

The first step is to convert my data into a form that will make it easier to train classifiers on. I start with the _hygiene.dat.additional_ file since its already in CSV format and the easiest to read. I have converted the categories into an n-hot encoded vector that becomes a part of my features.

In [2]:
features = pd.read_csv("./Hygiene/hygiene.dat.additional", header=None, names=["categories", "pincode", "review_count", "rating"])
features["categories"] = features["categories"].map(eval)

In [3]:
#Reading the labels provided
given_labels = []
with open("./Hygiene/hygiene.dat.labels", "r") as f:
    for i in range(546):
        given_labels.append(int(f.readline()))

print(f"Length: {len(given_labels)}. Sample top 10: {given_labels[:10]}")

Length: 546. Sample top 10: [1, 1, 1, 0, 0, 1, 1, 0, 0, 0]


In [4]:
#Getting the list of unique categories
categories = set()
for c in features["categories"]:
    t = set(c)
    categories = categories.union(t)
categories.remove("Restaurants")

In [5]:
# n-hot encoding for categories
for cat in categories:
    features[cat] = cat in features["categories"]

#dropping the categories column since its not needed anymore
features = features.drop("categories", axis=1)
features

Unnamed: 0,pincode,review_count,rating,Hawaiian,Breakfast & Brunch,Chicken Wings,Barbeque,Senegalese,Italian,Asian Fusion,...,Cheesesteaks,Hot Pot,Shanghainese,American (New),Latin American,Gastropubs,Salvadoran,Fish & Chips,Szechuan,Lebanese
0,98118,4,4.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,98109,21,4.047619,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,98103,14,3.111111,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,98112,42,4.088889,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,98102,12,3.071429,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13294,98104,1,3.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13295,98116,29,4.258065,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13296,98104,1,4.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13297,98109,2,4.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Review Data

The reviews could also potentially provide additional insights into the hygiene of a restaurant. Since we are only interested about hygiene we need to extract the parts of the reviews that talk about hygiene. It is also not necessary that all reviews talk about hygiene. Initially I plan to use a simple method of counting hygiene related words in the review and rely on the rating of the restaurant to indicate whether the hygiene was good or bad. I understand that the rating is not just for the hygiene but I am assuming that if the restaurant was not clean that would be the overiding factor for the reviewer and that will help me classify the negative ones. The positive ones are a little bit trickier. In order to get the words related to hygiene I used the thesaurus.com APIs to get the synonyms and antonyms of the word and treat them as seed words. _The thing to notice about this approach is that it would work as well for any other topic. If we substitute hygiene with service the code would not change at all._

In [6]:
hygiene_rel_words = utils.get_topic_words(["hygiene"])
hygiene_rel_words

{'benefit',
 'cleanliness',
 'contamination',
 'defilement',
 'dirt',
 'dirtiness',
 'filth',
 'filthiness',
 'foulness',
 'generosity',
 'good will',
 'griminess',
 'grubbiness',
 'honesty',
 'hygiene',
 'hygienics',
 'impureness',
 'impurity',
 'morality',
 'pollution',
 'probity',
 'purification',
 'purity',
 'regimen',
 'sanitation',
 'squalor',
 'sterility',
 'uncleanliness',
 'uncleanness',
 'unwholesomeness',
 'wholesomeness'}

Now that er have got our list of seed words we need to stem them and for that we'll use the PorterStemmer implementation in nltk

In [8]:
stemmer = PorterStemmer()
hygiene_rel_words_stem = {stemmer.stem(s) for s in hygiene_rel_words}
hygiene_rel_words_stem

{'benefit',
 'cleanli',
 'contamin',
 'defil',
 'dirt',
 'dirti',
 'filth',
 'filthi',
 'foul',
 'generos',
 'good wil',
 'grimi',
 'grubbi',
 'honesti',
 'hygien',
 'impur',
 'moral',
 'pollut',
 'probiti',
 'purif',
 'puriti',
 'regimen',
 'sanit',
 'squalor',
 'steril',
 'unclean',
 'uncleanli',
 'unwholesom',
 'wholesom'}

Now I'm going to simply count total occurrances of each of the stemmed words in the reviews

In [9]:
with open("./Hygiene/hygiene.dat", "r") as rv_file:
    counts = []
    for line in rv_file:
        c = 0
        for w in hygiene_rel_words_stem:
            p = line.find(w)
            while p != -1:
                c = c + 1
                p = line.find(w, p+1)
        
        counts.append(c)

features["hygiene_word_count"] = counts

In [19]:
(features[features["hygiene_word_count"] != 0]).shape

(2187, 102)

Looks like only 2187/12299 of the reviews seem to discuss hygiene related topics. Lets try training classifiers with this information and see what happens

# Basic Decision Tree trained with given labels

Now I will attempt to train a basic decision tree on the given labels using just these features as input. Not considering the review data yet

In [20]:
training_data = features[:546]
labels = given_labels

classifier = DecisionTreeClassifier()
classifier.fit(training_data, labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [21]:
predictions = classifier.predict(features[546:])

In [22]:
utils.pred_save_submit_to_leaderboard(predictions, "dt.txt")

Submission completed successfully!


This method yielded a Precision of 0.5256 and a Recall of 0.5272. So it was barely above random. Clearly not a good result.

# Boosted Decision tree

This time we train a boosted tree on the same data and see if that does better. AdaBoost is the algorithm used.

In [23]:
from sklearn.ensemble import AdaBoostClassifier

# by default this implementation uses a DecisionTreeClassifier with a max depth of 1. Since we have very few features I didn't want to increase the depth
boost_clf = AdaBoostClassifier()
boost_clf.fit(training_data, labels)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [27]:
utils.pred_save_submit_to_leaderboard(boost_clf.predict(features[546:]), "adaboost.txt")

Submission completed successfully!


This method yielded a Precision of 0.5544 and a Recall of 0.5573. So a small improvement is observed. Next lets also try a random forest. I don't expect the results to be better than AdaBoost since boosting actively tries to fix its mistakes for the subsequent classifiers but I do expect it to do better that a single decision tree.

# Random forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()
rf_clf.fit(training_data, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
utils.pred_save_submit_to_leaderboard(rf_clf.predict(features[546:]), "random_forest.txt")

Submission completed successfully!


_Precision: 0.5377, Recall: 0.5418. Exactly as expected, the values are better than a single decision tree but not as good as the boosted tree method_

# Training using unlabelled data

Next lets use some of the methods that we learnt for training models when the amount of unlabelled data is much more than that of the labelled data. 