In [212]:
 # -*- coding: utf-8 -*-

In [213]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
import numpy as np
import datetime
import json
import re
import csv
import math
from split_words import split_words, EMOJIS, BULLETS, APPOSTOPHES

In [214]:
def loadGloveModel(gloveFile):
    f = open(gloveFile, 'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print "Trained Model", len(model), " words loaded!"
    return model

In [215]:
model = loadGloveModel('dataset/glove/glove.6B.300d.txt')

Trained Model 400000  words loaded!


In [216]:
# Defining the clean function
def clean_review(original_review):

	review=original_review.lower()


	for k in APPOSTOPHES:
		review = review.replace(k, APPOSTOPHES[k])

	#to remove multiple dots in string
	review = re.sub('\.\.+', '. ', review)

	#split a.a, a.A,a.1 not 1.1
	review = re.sub(r"([a-zA-Z])\.([a-zA-Z0-9])", "\\1. \\2", review)
	review = re.sub(r"([0-9])\.([a-zA-Z])", "\\1 . \\2", review)
	review = re.sub(r"([a-zA-Z])\\([a-zA-Z0-9])", "\\1 or \\2", review)

	review = re.sub('(?!^)([A-Z][a-z]+)', r' \1', review)
	
	#remove multiple occurance of words eg. woooooo=>woo
	review = re.sub(r"([a-zA-Z])\1+", r"\1\1", review)

	review = review.replace("!",'. ')
	review = review.replace(":",' : ')
	review = re.sub(r'\?+', "?", review)

	review=review.replace("\n",". ")
	review=re.sub(' +',' ',review)

	return str(review)

In [217]:
with open('dataset/stopwords.txt') as f:
    content = f.readlines()
stop_words = [x.strip() for x in content]
print len(stop_words)

253


In [218]:
dset = pd.read_csv('dataset/review-dump.csv')
df = pd.read_csv('dataset/video_details.csv')

In [219]:
# Creating data by removing stop_words
li_stop = []
dset['review'] = dset['review'].astype('str')
for x in range(0, len(dset['review'])):
    words = dset['review'][x].split()
    tmp = [word for word in words if word.lower() not in stop_words]
    res = ' '.join(tmp)
    li_stop.append(res)

print len(li_stop)
dset_new = pd.DataFrame(li_stop)
dset_new.columns = ['review']

1969


In [220]:
def train_tfidf(dset):
    porter = nltk.PorterStemmer()
    doc = []
    dset['review'] = dset['review'].astype('str')
    for x in dset['review']:
        x = x.decode('utf-8')
        x.encode('ascii', 'ignore')
        tokens = word_tokenize(x.lower())
        stem = [porter.stem(t) for t in tokens]
        doc.append(' '.join(stem))
    # print doc
    vectorizer = TfidfVectorizer(ngram_range=(1,3))
    vectorizer.fit(doc)
    print len(vectorizer.vocabulary_)
    return vectorizer

vectorizer = train_tfidf(dset)

38325


In [221]:
vectorizer = train_tfidf(dset_new)

26019


In [222]:
def avg_feature_vector(sentence, num_features=1):
    sentence = sentence.lower()
    words = sentence.split()
    feature_vec = np.zeros((1, num_features), dtype='float32')
    n_words = 0
    for word in words:
        if word in model:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if(n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
#     print feature_vec.shape
    return feature_vec

In [223]:
dump = pd.read_csv("dump.csv")
dump.head()

Unnamed: 0,L1,L2,Key,Value,Department,Domain,Subject,Attribute,Tf-idf,Video-GloVe
0,AC,AC didn't cool fast enough,AC,not cool fast,Housekeeping,Maintenance,Appliance,Integrity/Maintenance,,
1,AC,AC remote wasn't available,AC-remote,not available,Housekeeping,Offering,Appliance,Availability,,
2,AC,AC wasn't working at all,AC,not working,Housekeeping,Maintenance,Appliance,Integrity/Maintenance,,
3,AC,Issue not mentioned here,AC,issues,Housekeeping,Offering,Appliance,Readiness,,
4,Breakfast,Breakfast options were limited,Breakfast-options,limited,Food & Beverage,Food Production,Breakfast,Variety,,


In [224]:
clean_txt = []
for x in range(0, len(dump['L2'])):
    tmp = clean_review(dump['L2'][x]).split()
    res = [word.lower() for word in tmp if word.lower() not in stop_words]
    clean_txt.append(' '.join(res))

In [225]:
for x in range(2):
    s = dump['Domain'][x] + " " + dump['Subject'][x] + " " + clean_txt[x]
    print s

Maintenance Appliance  ac not cool fast enough
Offering Appliance  ac remote not available


In [226]:
print len(clean_txt), len(dump['L2'])

46 46


In [260]:
def check_cases(k,v,l1,s):
    if k.lower()=="staff" and v.lower() in ['rude', 'unhelpful']:
        return "MOT - Handling Guest Request At The Desk"
    elif k.lower()=="staff" and v.lower()=="not present":
        return "NOT FOUND"
    elif k.lower()=="room-service" and v.lower()=="delay":
        return "Servicing guest room (Occupied room)"
    elif k.lower() in ["walls", "mentioned room"] and v.lower() in ["seepage", "not allotted"]:
        return "Packing OOS Room Steps"
    elif v.lower() in ["insects", "rodent", "cockroach", "ant", "mosquito"]:
        return "Pest Control (Rodent management, cockroach management, ant management, fly management & mosquito man"
    elif s.lower()=="security":
        return "NOT FOUND"
    elif l1.lower()=="directions":
        return "NOT FOUND"
    elif k.lower()=="hot-water" or k.lower()=="running water" or k.lower()=="ac-remote":
        return "NOT FOUND"
    else:
        return None

In [261]:
def preprocess(inp):
    inp = inp.replace('Toiletry-kit', 'Replenishing supplies')
    inp = inp.replace('Washroom', 'Bathroom')
    inp = inp.replace('water-leakage', 'fittings fixtures check')
    inp = inp.replace('fixtures', 'fixtures check')
    return inp

In [262]:
def match_key(input, video):
    li = []
#     input = input.replace('Washroom', 'bathroom')
    words = set(input.lower().split())
    for x in range(0, len(video)):
        tmp = video[x].replace('Check In', 'check-in')
        token = tmp.lower().split()
        for word in words:
            if word in token:
                li.append(video[x])
                continue
    return li

In [263]:
print match_key("fittings fixtures", df['video_name'])

['Bathroom Cleaning-Washbasin Cleaning(1)(Cleaning SS Fittings and Mirror)', 'Bathtub cleaning-Rinsing (fittings & fixtures & bathtub) & scrubbing (fittings & fixtures)', 'Fittings & fixtures cleaning process', 'Fittings & fixtures cleaning process', 'Fittings & Fixtures Checking process', 'Fittings & Fixtures Checking process']


In [264]:
def get_video_from_similarity2(inp, vectorizer, sent, num):
    cont = []
    porter = nltk.PorterStemmer()
    tk = word_tokenize(sent.lower())
    stem = [porter.stem(t) for t in tk]
    vec2 = vectorizer.transform([' '.join(stem)])
    s1 = "NOT FOUND"
    for x in range(0, len(inp)):
        token = word_tokenize(inp[x])
        words = [w.lower() for w in token if w.lower() not in stop_words]
        stem_token = [porter.stem(t) for t in words]
        vec = vectorizer.transform([' '.join(stem_token)])
        tmp = cosine_similarity(vec, vec2)
#         print tmp[0][0]
        s1 = inp[x]
        cont.append((tmp[0][0], s1))
    
    cont = sorted(cont, reverse=True)
    return cont[:num]

In [265]:
tfidf = []
for x in range(0, len(dump['Key'])):
    k,v,l1,s = dump['Key'][x], dump['Value'][x], dump['L1'][x], dump['Subject'][x]
    flag = check_cases(k,v,l1,s)
    if flag!=None:
        tfidf.append(flag)
    else:
        k = preprocess(k)
        v = preprocess(v)
        l1 = preprocess(l1)
        s = preprocess(s)
        if k != "Check-in":
            k = k.replace("-", " ")
        k = k.replace("/", " ")
        v = v.replace("-", " ")
        v = v.replace("/", " ")
        if s != "Check-in":
            s = s.replace('-', ' ')
        s = s.replace('/', ' ')
        l1 = l1.replace('/', ' ')
    #     print k,s,l1,'\n'
        li = match_key(k+" "+s+" "+l1, df['video_name'])
    #     print li
        res = get_video_from_glovesimilarity2(li, k+" "+v, 1)
        if(len(res)>0):
            tfidf.append(res[0][1])
        else:
            tfidf.append("NOT FOUND")

In [266]:
print len(tfidf)

46


In [267]:
dump['Tf-idf'] = tfidf

In [268]:
def get_video_from_glovesimilarity2(inp, sent, num):
    vec2 = avg_feature_vector(sent)
    cont = []
    for x in range(0, len(inp)):
        token = word_tokenize(inp[x])
        words = [w.lower() for w in token if w.lower() not in stop_words]
        vec = avg_feature_vector(' '.join(words))
        tmp = cosine_similarity(vec, vec2)
        s1 = inp[x]
        cont.append((tmp[0][0], s1))
    cont = sorted(cont, reverse=True)
    return cont[:num]

In [276]:
glove = []
for x in range(0, len(dump['Key'])):
    k,v,l1,s = dump['Key'][x], dump['Value'][x], dump['L1'][x], dump['Subject'][x]
    flag = check_cases(k,v,l1,s)
    if flag != None:
        glove.append(flag)
    else:
        k = preprocess(k)
        v = preprocess(v)
        l1 = preprocess(l1)
        s = preprocess(s)
        if k != "Check-in":
            k = k.replace("-", " ")
        k = k.replace("/", " ")
        v = v.replace("-", " ")
        v = v.replace("/", " ")
        if s != "Check-in":
            s = s.replace('-', ' ')
        s = s.replace('/', ' ')
        l1 = l1.replace('/', ' ')
    #     print k,s,l1,'\n'
        li = match_key(k+" "+s+" "+l1, df['video_name'])
    #     print li
        res = get_video_from_glovesimilarity2(li, k+" "+v, 2)
        if(len(res)>1):
            glove.append(res[1][1])
        elif len(res)==1:
            glove.append(res[0][1])
        else:
            glove.append("NOT FOUND")

In [277]:
for x in range(len(glove)):
    print tfidf[x], "\t||\t", glove[x]

AC filter cleaning- Process & frequency 	||	AC filter cleaning- Process & frequency
NOT FOUND 	||	NOT FOUND
AC filter cleaning- Process & frequency 	||	AC filter cleaning- Process & frequency
AC filter cleaning- Process & frequency 	||	AC filter cleaning- Process & frequency
Breakfast Buffet Replenishment Clean Storage And Live Counter Setup 	||	Breakfast Buffet
MOT - Breakfast Order Guest Says Yes 	||	Breakfast Buffet Replenishment Clean Storage And Live Counter Setup
SERVICE OF PRE PLATED FOOD 	||	SERVICE OF STRAIGHT DRINK
SERVICE OF PRE PLATED FOOD 	||	MOT - Breakfast Order Guest Says Yes
SERVICE OF PRE PLATED FOOD 	||	Breakfast Buffet Replenishment Clean Storage And Live Counter Setup
Breakfast Buffet Sections 	||	Breakfast Buffet
Tab Check In - Allocating A Room & GRC Process 	||	Tab Check In - Upgrade Occupancy
Tab Check In - Allocating A Room & GRC Process 	||	MOT - Guest Does Not Have A Valid ID At Check In
Packing OOS Room Steps 	||	Packing OOS Room Steps
MOT - Handling Multip

In [270]:
print len(glove)

46


In [278]:
dump['Video-GloVe2'] = glove

In [279]:
dump

Unnamed: 0,L1,L2,Key,Value,Department,Domain,Subject,Attribute,Tf-idf,Video-GloVe,Video-GloVe2
0,AC,AC didn't cool fast enough,AC,not cool fast,Housekeeping,Maintenance,Appliance,Integrity/Maintenance,AC filter cleaning- Process & frequency,AC filter cleaning- Process & frequency,AC filter cleaning- Process & frequency
1,AC,AC remote wasn't available,AC-remote,not available,Housekeeping,Offering,Appliance,Availability,NOT FOUND,NOT FOUND,NOT FOUND
2,AC,AC wasn't working at all,AC,not working,Housekeeping,Maintenance,Appliance,Integrity/Maintenance,AC filter cleaning- Process & frequency,AC filter cleaning- Process & frequency,AC filter cleaning- Process & frequency
3,AC,Issue not mentioned here,AC,issues,Housekeeping,Offering,Appliance,Readiness,AC filter cleaning- Process & frequency,AC filter cleaning- Process & frequency,AC filter cleaning- Process & frequency
4,Breakfast,Breakfast options were limited,Breakfast-options,limited,Food & Beverage,Food Production,Breakfast,Variety,Breakfast Buffet Replenishment Clean Storage A...,Breakfast Buffet Replenishment Clean Storage A...,Breakfast Buffet
5,Breakfast,Breakfast wasn't provided,Breakfast,not provided,Food & Beverage,Service,Breakfast,Accuracy,MOT - Breakfast Order Guest Says Yes,MOT - Breakfast Order Guest Says Yes,Breakfast Buffet Replenishment Clean Storage A...
6,Breakfast,Delay in breakfast service,Breakfast-service,delay,Food & Beverage,Service,Staff,Efficiency,SERVICE OF PRE PLATED FOOD,SERVICE OF PRE PLATED FOOD,SERVICE OF STRAIGHT DRINK
7,Breakfast,Food quality wasn't good,Food-quality,not good,Food & Beverage,Food Production,Breakfast,Quality,SERVICE OF PRE PLATED FOOD,SERVICE OF PRE PLATED FOOD,MOT - Breakfast Order Guest Says Yes
8,Breakfast,Food quantity wasn't satisfactory,Food-quantity,not enough,Food & Beverage,Food Production,Breakfast,Size,SERVICE OF PRE PLATED FOOD,SERVICE OF PRE PLATED FOOD,Breakfast Buffet Replenishment Clean Storage A...
9,Breakfast,Issue not mentioned here,Breakfast,issues,Food & Beverage,Offering,Breakfast,Food & Beverage Quality,Breakfast Buffet Sections,Breakfast Buffet Sections,Breakfast Buffet


In [257]:
# def get_comb(inp, vectorizer, sent, num, flag):
#     cont = []
#     ret = []
#     porter = nltk.PorterStemmer()
#     tk = word_tokenize(sent.lower())
#     stem = [porter.stem(t) for t in tk]
#     v = vectorizer.transform([' '.join(stem)])
#     v2 = avg_feature_vector(sent)
#     for x in range(0, len(inp)):
#         token = word_tokenize(inp[x])
#         words = [w.lower() for w in token if w.lower() not in stop_words]
#         stem_token = [porter.stem(t) for t in words]
#         vec = vectorizer.transform([' '.join(stem_token)])
#         vec2 = avg_feature_vector(inp[x])
#         tmp = cosine_similarity(vec, v)
#         tmp2 = cosine_similarity(vec2, v2)
# #         print tmp[0][0]
#         s1 = inp[x]
#         if(flag==1):
#             cont.append((tmp[0][0], s1))
# #         else:
#             cont.append((tmp2[0][0], s1))
#     cont = sorted(cont, reverse=True)
#     return cont[:num]

In [258]:
# comb = []
# for x in range(0, len(dump['Key'])):
#     k,v,s = dump['Key'][x], dump['Value'][x], dump['L1'][x]
#     k = k.replace("-", " ")
#     k = k.replace("/", " ")
#     v = v.replace("-", " ")
#     v = v.replace("/", " ")
#     s = s.replace('-', ' ')
#     s = s.replace('/', ' ')
    
#     li = get_comb(df['video_name'], vectorizer, s+" "+k, 10, 1)
   
#     new_li = []
#     for fi, se in li:
#         new_li.append(se)
# #     print new_li
#     res = get_comb(new_li, vectorizer, k+" "+v, 1, 0)

#     comb.append(res[0][1])

In [280]:
from pandas import ExcelWriter

writer = ExcelWriter('PythonExport.xlsx')
dump.to_excel(writer,'Sheet5')
writer.save()