In [88]:
import pandas as pd
import sklearn.ensemble as ske
from sklearn.tree import DecisionTreeClassifier
import sklearn
import utils
import numpy as np
import nltk
import boto3
from scipy.sparse import csr_matrix

In [2]:
def calculate_f1_score(predicted_labels, actual_labels):
    pos_mask = predicted_labels == 1
    rec_mask = actual_labels == 1
    precision = np.sum(actual_labels[pos_mask]) / len(predicted_labels[pos_mask])
    recall = np.sum(predicted_labels[rec_mask]) / len(actual_labels[rec_mask])
    print(str(precision) + " "  + str(recall))
    return 2.0 * precision * recall / (precision + recall)

In [3]:
comprehend = boto3.client(service_name='comprehend', region_name='ap-southeast-1')

def get_sentiment_score_aws(sentence_list):
    score = 0
    for sentence in sentence_list:
        result = comprehend.detect_sentiment(Text=sentence, LanguageCode='en')
        sentiment_info = result["SentimentScore"]
        if result["Sentiment"] == "POSITIVE":
            score = score + sentiment_info["Positive"]
        elif result["Sentiment"] == "NEGATIVE":
            score = score + sentiment_info["Negative"] * -1
        elif result["Sentiment"] == "NEUTRAL":
            score = score + sentiment_info["Neutral"] * 0.3
        else:
            score = score + 0
            #ignore the cases where the sentiment is mixed
        
    return score

# Data Preparation

The first step is to convert my data into a form that will make it easier to train classifiers on. I start with the _hygiene.dat.additional_ file since its already in CSV format and the easiest to read. I have converted the categories into an n-hot encoded vector that becomes a part of my features.

In [4]:
features = pd.read_csv("./Hygiene/hygiene.dat.additional", header=None, names=["categories", "pincode", "review_count", "rating"])
features["categories"] = features["categories"].map(eval)

In [5]:
#Reading the labels provided
given_labels = []
with open("./Hygiene/hygiene.dat.labels", "r") as f:
    for i in range(546):
        given_labels.append(int(f.readline()))

print(f"Length: {len(given_labels)}. Sample top 10: {given_labels[:10]}")

Length: 546. Sample top 10: [1, 1, 1, 0, 0, 1, 1, 0, 0, 0]


In [6]:
#Getting the list of unique categories
categories = set()
for c in features["categories"]:
    t = set(c)
    categories = categories.union(t)
categories.remove("Restaurants")

In [7]:
# n-hot encoding for categories
for cat in categories:
    features[cat] = cat in features["categories"]

#dropping the categories column since its not needed anymore
features = features.drop("categories", axis=1)
features

Unnamed: 0,pincode,review_count,rating,Modern European,Gluten-Free,Belgian,Irish,Indonesian,Australian,Afghan,...,African,Chinese,Mongolian,Soup,Szechuan,Southern,Tapas Bars,Cafes,Cajun/Creole,Asian Fusion
0,98118,4,4.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,98109,21,4.047619,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,98103,14,3.111111,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,98112,42,4.088889,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,98102,12,3.071429,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13294,98104,1,3.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13295,98116,29,4.258065,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13296,98104,1,4.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13297,98109,2,4.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Review Data

The reviews could also potentially provide additional insights into the hygiene of a restaurant. Since we are only interested about hygiene we need to extract the parts of the reviews that talk about hygiene. It is also not necessary that all reviews talk about hygiene. Initially I plan to use a simple method of counting hygiene related words in the review and rely on the rating of the restaurant to indicate whether the hygiene was good or bad. I understand that the rating is not just for the hygiene but I am assuming that if the restaurant was not clean that would be the overiding factor for the reviewer and that will help me classify the negative ones. The positive ones are a little bit trickier. In order to get the words related to hygiene I used the powerthesaurus.org to get the synonyms and antonyms of the word and treat them as seed words.

In [8]:
hygiene_rel_words = {"sanitation", "salubrity", "sanitary", "hygienic", "tidiness", "sterility", "disinfection", "filth", "uncleanliness", "dirt", "garbage", "muck", "clean", "sterile", "dirty", "hygiene"}

Getting the sentiment scores from the AWS Comprehend API

In [9]:
def sent_tokenise_reviews(review_txt):
    regex_tk = nltk.tokenize.RegexpTokenizer(pattern=r"[.?!;,\n]+" ,gaps=True, discard_empty=True)
    sentences = [s.strip() for s in regex_tk.tokenize(review_txt)]
    
    return sentences

scores = []
with open("./Hygiene/hygiene.dat", "r") as rv_file:
    for line in rv_file:
        c = 0
        sentences = sent_tokenise_reviews(line)
        hygiene_rel_sent = []
        for w in hygiene_rel_words:
            for s in sentences:
                if w in s:
                    hygiene_rel_sent.append(s)
        
        scores.append(get_sentiment_score_aws(hygiene_rel_sent))

features["hygiene_sentiment"] = scores

In [112]:
type(False) == type(False)

True

In [151]:
features = pd.read_csv("./output/data.csv", index_col=0)
features

Unnamed: 0,pincode,review_count,rating,Szechuan,Indonesian,Pakistani,Brazilian,Fondue,Latin American,Halal,...,Chinese,Spanish,Cambodian,Ethiopian,Vegan,Cajun/Creole,Sandwiches,Kosher,Salvadoran,hygiene_sentiment
0,98118,4,4.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.000000
1,98109,21,4.047619,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.000000
2,98103,14,3.111111,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.000000
3,98112,42,4.088889,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.879902
4,98102,12,3.071429,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13294,98104,1,3.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.000000
13295,98116,29,4.258065,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.000000
13296,98104,1,4.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.000000
13297,98109,2,4.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.000000


In [11]:
with open("./Hygiene/hygiene.dat", "r", encoding="utf-8") as rv_file:
    reviews = pd.Series([line.strip() for line in rv_file])

with open("stopwords.txt", "r", encoding="utf-8") as st:
    stopwords = {line.strip() for line in st}

In [152]:
vec = sklearn.feature_extraction.text.CountVectorizer(stop_words=stopwords, max_features=1000)
word_feat = vec.fit_transform(reviews)

In [153]:
features = np.hstack((word_feat.toarray(), features.to_numpy()))
features

array([[0, 0, 1, ..., False, False, 0.0],
       [1, 0, 1, ..., False, False, 0.0],
       [2, 0, 1, ..., False, False, 0.0],
       ...,
       [0, 0, 0, ..., False, False, 0.0],
       [0, 0, 0, ..., False, False, 0.0],
       [0, 0, 0, ..., False, False, 0.0]], dtype=object)

for i, col in enumerate(vec.get_feature_names()):
    features[col] = pd.Series(word_feat[:, i].toarray().ravel())

# Basic Decision Tree trained with given labels

Now I will attempt to train a basic decision tree on the given labels using just these features as input. Not considering the review data yet

In [90]:
training_data = features[:546]
labels = pd.Series(given_labels)

classifier = DecisionTreeClassifier()
classifier.fit(training_data, labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [91]:
calculate_f1_score(classifier.predict(training_data), labels)

1.0 1.0


1.0

In [26]:
predictions = classifier.predict(features[546:])

In [17]:
utils.pred_save_submit_to_leaderboard(predictions, "dt.txt")

Submission completed successfully!


# Boosted Decision tree

This time we train a boosted tree on the same data and see if that does better. AdaBoost is the algorithm used.

In [102]:
# by default this implementation uses a DecisionTreeClassifier with a max depth of 1. Since we have very few features I didn't want to increase the depth
boost_clf = ske.AdaBoostClassifier(n_estimators=350)
boost_clf.fit(features[:546], labels)

KeyboardInterrupt: 

In [93]:
utils.pred_save_submit_to_leaderboard(boost_clf.predict(features[546:]), "adaboost.txt")

Submission completed successfully!


This method yielded a Precision of 0.5544 and a Recall of 0.5573. So a small improvement is observed. Next lets also try a random forest. I don't expect the results to be better than AdaBoost since boosting actively tries to fix its mistakes for the subsequent classifiers but I do expect it to do better that a single decision tree.

# Random forest

In [144]:
rf_clf = ske.RandomForestClassifier(n_estimators=700)
rf_clf.fit(features[:546], labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=700,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [145]:
utils.pred_save_submit_to_leaderboard(rf_clf.predict(features[546:]), "random_forest.txt")

Submission completed successfully!


# Bagging classifier

In [154]:
bagg_clf = ske.BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=700)
bagg_clf.fit(features[:546], labels)
#calculate_f1_score(bagg_clf.predict(features[:546]), labels)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [155]:
utils.pred_save_submit_to_leaderboard(bagg_clf.predict(features[546:]), "bagg_dt.txt")

Submission completed successfully!


# Naive Bayes

In [172]:
from sklearn.naive_bayes import GaussianNB
naive_b = GaussianNB()
naive_b.fit(training_data, labels)

utils.pred_save_submit_to_leaderboard(naive_b.predict(features[546:]), "nb.txt")

Submission completed successfully!


# Logistic Regression

In [142]:
lr_clf = sklearn.linear_model.LogisticRegression()
lr_clf.fit(features[:546], labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [143]:
utils.pred_save_submit_to_leaderboard(lr_clf.predict(features[546:]), "lr.txt")

Submission completed successfully!


# Training using unlabelled data

Next lets use some of the methods that we learnt for training models when the amount of unlabelled data is much more than that of the labelled data. Here I'll use the method of running a trained classfier on the unlabelled data and then adding the ones with the highest confidence back to the training set. Also, I'll use the records which have _hygiene_word_count_ > 0 but I'll make sure not add more than 50% of the total records to the training set to prevent overfitting.

In [65]:
unlabelled_data = features[546:]
unlabelled_data_for_training = unlabelled_data[unlabelled_data["hygiene_sentiment"] > 0]
unlabelled_data_for_training.shape

(4150, 102)

Since the total number of unlabelled examples I want to use for training is 13300/2 = 6650, I need to add 6650-4150=2500 samples to this set from the unlabelled data

In [46]:
training_data = features[:546]
unlabelled_data = features[546:]
unlabelled_data_for_training = unlabelled_data[unlabelled_data["hygiene_sentiment"] < 0]
unlabelled_data_for_training = unlabelled_data_for_training.append(unlabelled_data[unlabelled_data["hygiene_sentiment"] >= 0].sample(2500))
curr_training_set = training_data
curr_labels = pd.Series(given_labels)

print(f"Initial training data: {training_data.shape}, Type: {type(training_data)}")

iterations = 0

self_train_clf = ske.RandomForestClassifier(n_estimators=700)
#self_train_clf = ske.BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=150)

while unlabelled_data_for_training.shape[0] > 0:
    self_train_clf.fit(curr_training_set, curr_labels)
    probs = self_train_clf.predict_proba(unlabelled_data_for_training)
    results = self_train_clf.predict(unlabelled_data_for_training)
    high_conf = (probs[:,0] >= 0.8) | (probs[:,1] >= 0.8)

    if len(results[high_conf]) < 10:
        print("Terminating")
        break
    curr_training_set = curr_training_set.append(unlabelled_data_for_training[high_conf])
    curr_labels = curr_labels.append(pd.Series(results[high_conf]))

    unlabelled_data_for_training = unlabelled_data_for_training[np.invert(high_conf)]
    iterations = iterations + 1
    print(f"iter: {iterations} Remaining unlabelled data: {unlabelled_data_for_training.shape[0]}")

Initial training data: (546, 1101), Type: <class 'pandas.core.frame.DataFrame'>
iter: 1 Remaining unlabelled data: 3804
iter: 2 Remaining unlabelled data: 3619
iter: 3 Remaining unlabelled data: 3377
iter: 4 Remaining unlabelled data: 3137
iter: 5 Remaining unlabelled data: 2934
iter: 6 Remaining unlabelled data: 2733
iter: 7 Remaining unlabelled data: 2562
iter: 8 Remaining unlabelled data: 2379
iter: 9 Remaining unlabelled data: 2263
iter: 10 Remaining unlabelled data: 2154
iter: 11 Remaining unlabelled data: 2059
iter: 12 Remaining unlabelled data: 1982
iter: 13 Remaining unlabelled data: 1899
iter: 14 Remaining unlabelled data: 1845
iter: 15 Remaining unlabelled data: 1803
iter: 16 Remaining unlabelled data: 1759
iter: 17 Remaining unlabelled data: 1710
iter: 18 Remaining unlabelled data: 1668
iter: 19 Remaining unlabelled data: 1606
iter: 20 Remaining unlabelled data: 1563
iter: 21 Remaining unlabelled data: 1530
iter: 22 Remaining unlabelled data: 1479
iter: 23 Remaining unlabell

Now lets use this classifier to make predictions... It should be significantly better than the previous ones

In [47]:
predictions = self_train_clf.predict(features[546:])

In [48]:
utils.pred_save_submit_to_leaderboard(predictions, "self_train.txt")

Submission completed successfully!
