In [1]:
 # -*- coding: utf-8 -*-

In [9]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
import numpy as np
import datetime

##### Loading the dataset having 1969 reviews

In [3]:
dset = pd.read_csv('dataset/review-dump.csv')
dset.head()

Unnamed: 0,review,cat
0,MUCH better than the West Hollywood property w...,Hotel
1,Stay away from room service in my opinion....,Hotel
2,Room service was superb.,Hotel
3,Stayed in a king suite for 11 nights and yes i...,Hotel
4,The location close to the 72nd Street subway s...,Hotel


### Training the vectorizer on Reviews Dataset (1969 reviews)

In [15]:
def train_tfidf(dset):
    porter = nltk.PorterStemmer()
    doc = []
    dset['review'] = dset['review'].astype('str')
    for x in dset['review']:
        x = x.decode('utf-8')
        x.encode('ascii', 'ignore')
        tokens = word_tokenize(x)
        stem = [porter.stem(t) for t in tokens]
        doc.append(' '.join(stem))
    # print doc
    vectorizer = TfidfVectorizer(ngram_range=(1,3))
    vectorizer.fit(doc)
    print len(vectorizer.vocabulary_)
    return vectorizer

#### This returns the most similar video with the sentence

In [8]:
def get_video_from_similarity(df, vectorizer, sent, vdo_dict, watchlist):
    porter = nltk.PorterStemmer()
    tk = word_tokenize(sent)
    stem = [porter.stem(t) for t in tk]
    vec2 = vectorizer.transform([' '.join(stem)])
    cosim, euclid, i1, i2 = 0.0, float('inf'), -1, -1
    s1, s2 = "", ""
    for x in range(0, len(df['video_name'])):
        token = word_tokenize(df['video_name'][x])
        words = [w.lower() for w in token]
        stem_token = [porter.stem(t) for t in words]
        vec = vectorizer.transform([' '.join(stem_token)])
        tmp = cosine_similarity(vec, vec2)
        if tmp > cosim:
            cosim = tmp
            s1 = df['video_name'][x]
            i1 = df['id'][x]
        tmp2 = euclidean_distances(vec, vec2)
        if tmp2 < euclid:
            euclid = tmp2
            s2 = df['video_name'][x]
            i2 = df['id'][x]

    vdo_dict[i1] = [sent, s1, df['video_url'][df['id']==i1].values]
    vdo_dict[i2] = [sent, s2, df['video_url'][df['id']==i2].values]


In [16]:
df = pd.read_csv('dataset/video_details.csv')
vdo_df = pd.read_csv('dataset/video.csv')
review = pd.read_csv('dataset/review.csv')
watchlist = pd.read_csv('dataset/watchlist.csv')

In [12]:
video_watch_limit = 45
till_date = (datetime.date.today() - datetime.timedelta(video_watch_limit)).isoformat()
watchlist = watchlist[((watchlist['is_watched']==1) & (watchlist['updated_at']<=till_date) | (watchlist['is_watched']==0))]

In [13]:
df = df[df['id'].isin(watchlist['video_details_id_id'].values)]
df.reset_index(drop=True, inplace=True)

In [18]:
vectorizer = train_tfidf(dset)

38325


In [19]:
vdo_dict = {}
sentiment = -0.92

In [22]:
for x in range(0, 7):
    if sentiment <= 0.0:
        get_video_from_similarity(df, vectorizer, review['review'][x], vdo_dict, watchlist)

print "Length of dic is ", len(vdo_dict), '\n'
for k,v in vdo_dict.items():
    print k, " -> ", v, '\n'

Length of dic is  8 

128  ->  ['The quality of hotel is bad', 'CLEARANCE OF PLATES', array(['http://d255tx56tiemkm.cloudfront.net/Cafe+Videos/Cafe+Service/CLEARANCE+OF+PLATES.mp4'],
      dtype=object)] 

138  ->  ['The room was not according to expectation', 'Preparing to clean the room process', array(['http://d255tx56tiemkm.cloudfront.net/GRA+Videos/Room+Cleaning/Preparing+To+Clean+The+Room/Hk-preparing+To+Clean+The+Room+5march.m4v'],
      dtype=object)] 

87  ->  ['Extra bedsheets were not there', 'Extra Bed Placement Process', array(['http://d255tx56tiemkm.cloudfront.net/GRA+Articulate+Made+Videos/Extra+Bed+Placement/Extra+Bed+Placement+09.03.18.mp4'],
      dtype=object)] 

12  ->  ['there was smell coming from bathroom', 'Bathroom Cleaning-Replenishing Supplies & Mopping', array(['http://d255tx56tiemkm.cloudfront.net/GRA+Videos/Bathroom+Cleaning/Replenishing+Supplies+%26+Mopping/Hk-replenishing+Supplies+%26+Mopping%5B1%5D+5march-1.m4v'],
      dtype=object)] 

45  ->  ['AC was

### Working on Cause Text

In [129]:
tag = pd.read_csv('dataset/tags.csv')
cause = pd.read_csv('dataset/cause.csv')
solver = pd.read_csv('dataset/solver.csv')

tag_id = tag['id'][tag.subject.isin(['Appliances']) & tag.attribute.isin(['Availability']) & tag.domain.isin(['Room Making']) & tag.department.isin(['Housekeeping'])].values
print tag_id

cause_id = set(solver['cause_id_id'][solver['tag_id_id'].isin(tag_id)].values)
print (cause_id)

[7]
set([35, 36, 37, 38])


In [132]:
cause_doc = []
# cause_doc.append(cause['causetext'][cause.id.isin(cause_id)].values)
for x in cause_id:
    cause_doc.append(cause['causetext'].values[cause.id==(x)])
    
print cause_doc

[array(['This item is not a part of a offering'], dtype=object), array(['This item was Out Of Order'], dtype=object), array(['This item was being used by another guest'], dtype=object), array(['Staff not aware that this item is a part of the offering'],
      dtype=object)]


In [35]:
# df.head()

In [39]:
porter = nltk.PorterStemmer()
for cs in cause_doc:
    token = word_tokenize(cs[0])
    stem = [porter.stem(t) for t in token]
    vec2 = vectorizer.transform([' '.join(stem)])
    cosim, i1 = 0.0, -1.0
    s1 = ""
    for x in range(0, len(df['video_name'])):
        tok = word_tokenize(df['video_name'][x])
        stem_token = [porter.stem(t) for t in tok]
        vec = vectorizer.transform([' '.join(stem_token)])
        tmp = cosine_similarity(vec, vec2)
        if tmp > cosim:
            cosim = tmp
            s1 = df['video_name'][x]
            i1 = df['id'][x]
    
    print cs[0], "->", s1, i1

Staff was untrained ->  -1.0
Did not receive linen on time from the vendor -> Linen Exchange & Discard Process (Handing over soiled linen & Receiving fresh linen) 8
Fresh linen not available -> Linen Exchange & Discard Process (Handing over soiled linen & Receiving fresh linen) 8
Did not receive the right quantity of linen back from the vendor -> Linen Exchange & Discard Process (Handing over soiled linen & Receiving fresh linen) 8
Room not checked by GRE/HM -> MOT - Check Out 70


### Now training on GloVe embedding

In [40]:
def loadGloveModel(gloveFile):
    f = open(gloveFile, 'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print "Trained Model", len(model), " words loaded!"
    return model

In [41]:
model = loadGloveModel('dataset/glove/glove.6B.50d.txt')

Trained Model 400000  words loaded!


In [43]:
print model['room']

[ 0.51518   0.80125  -0.13731  -0.472     1.0321   -0.75538  -0.58585
 -0.10406  -0.22021  -0.38029  -0.82568  -0.1288   -0.059862  0.8529
  0.54697   0.43243  -0.54769   0.35936  -0.14251  -1.2086    0.72885
  1.0991   -0.34049   0.014483 -0.20405  -0.98005  -0.07667   1.0827
  0.34461  -0.37714   2.8916    0.23911  -0.091089 -0.45495   0.24013
  0.92777   0.77564   0.37424   0.84257  -0.34445   0.049718  0.27486
 -0.35371   1.0032    0.081324  0.25981   0.17708  -1.1572   -0.080012
  0.08214 ]


In [113]:
with open('dataset/stopwords.txt') as f:
    content = f.readlines()
stop_words = [x.strip() for x in content]
print len(stop_words)

253


In [48]:
def avg_feature_vector(sentence, num_features=50):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in model:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if(n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec
            

In [54]:
s1 = avg_feature_vector('this is a sentence')
s2 = avg_feature_vector('this is also a sentence')

from scipy import spatial
sim = 1 - spatial.distance.cosine(s1, s2)
print sim

0.9922950354472008


In [141]:
def get_video_from_glovesimilarity(df, sent, vdo):
    vec2 = avg_feature_vector(sent)
    cosim, i1 = 0.0, -1
    s1, s2 = "", ""
    for x in range(0, len(df['video_name'])):
        vec = avg_feature_vector(df['video_name'][x])
        tmp = 1 - spatial.distance.cosine(vec, vec2)
        if tmp > cosim:
            cosim = tmp
            s1 = df['video_name'][x]
            i1 = df['id'][x]
    print i1, sent, tmp
    vdo[i1] = [sent, s1, df['video_url'][df['id']==i1].values]

In [142]:
vdo = {}
for x in range(0, 7):
    if sentiment <= 0.0:
        get_video_from_glovesimilarity(df, review['review'][x], vdo)

print "Length of dic is ", len(vdo), '\n'
for k,v in vdo.items():
    print k, " -> ", v, '\n'

86 The room service is bad 0.6477380624640072
86 The quality of hotel is bad 0.6605280783037595
91 The room was not according to expectation 0.6077790295183275
86 minibar was not in the room 0.6489408303757652
91 AC was not working 0.5981857433961429
91 Extra bedsheets were not there 0.5157426543209019
86 there was smell coming from bathroom 0.583887136575734
Length of dic is  2 

91  ->  ['Extra bedsheets were not there', 'Pest Control (Records to be maintained)', array(['http://d255tx56tiemkm.cloudfront.net/GRA+Articulate+Made+Videos/Pest+Control/Pest+Control+2+09.03.18.mp4'],
      dtype=object)] 

86  ->  ['there was smell coming from bathroom', 'Occupied room set up (Things to do in an occupied room & bathroom)', array(['http://d255tx56tiemkm.cloudfront.net/GRA+Articulate+Made+Videos/Difference+In+Services/Difference+in+services-occupied+room+setup+09.03.18.mp4'],
      dtype=object)] 



In [None]:
vdo2 = {}
for x in range(0, 7):
    if sentiment <= 0.0:
        sent = review['review'][x].split()
        resultwords  = [word for word in sent if word.lower() not in stop_words]
        result = ' '.join(resultwords)
        get_video_from_glovesimilarity(df, result, vdo2, watchlist)

print "Length of dic is ", len(vdo2), '\n'
for k,v in vdo2.items():
    print k, " -> ", v, '\n'

In [111]:
for cs in cause_doc:
    li = []
    vec2 = avg_feature_vector(cs[0])
    for x in range(0, len(df['video_name'])):
        vec = avg_feature_vector(df['video_name'][x])
        tmp = 1 - spatial.distance.cosine(vec, vec2)
        if(math.isnan(tmp)):
            continue
        i1 = df['id'][x]
        li.append((tmp,i1))
    li = sorted(li, reverse=True)
    li = li[:3]
    print cs[0]
    for f,s in li:
        print df['video_name'][df['id']==s].values
    print '\n'
        

Staff was untrained
['Pest Control (Records to be maintained)']
['Occupied room set up (Things to do in an occupied room & bathroom)']
['Check out call process (When it should be performed & objective of this process)']


Did not receive linen on time from the vendor
['Occupied room set up (Things to do in an occupied room & bathroom)']
['Vacant room set up (Things to do in a vacant room & bathroom)']
['Starting the day (starting the day, check grooming, points to remember, GRA uniform)']


Fresh linen not available
['Vacant room set up (Things to do in a vacant room & bathroom)']
['Check out call process (When it should be performed & objective of this process)']
['Pest Control (Records to be maintained)']


Did not receive the right quantity of linen back from the vendor
['Starting the day (starting the day, check grooming, points to remember, GRA uniform)']
['Occupied room set up (Things to do in an occupied room & bathroom)']
['Pest Control (Records to be maintained)']


Room not c

### Using Glove embedding on sentence after removing stopwords

In [124]:
for cs in cause_doc:
    li = []
    words = cs[0].split()
    resultwords  = [word for word in words if word.lower() not in stop_words]
    result = ' '.join(resultwords)
#     print cs[0]
#     print result
    vec2 = avg_feature_vector(result)
    for x in range(0, len(df['video_name'])):
        reswords = [word for word in df['video_name'][x].split() if word.lower() not in stop_words]
        res = ' '.join(reswords)
        vec = avg_feature_vector(res)
        tmp = 1 - spatial.distance.cosine(vec, vec2)
        if(math.isnan(tmp)):
            continue
        i1 = df['id'][x]
        li.append((tmp,i1))
    li = sorted(li, reverse=True)
    li = li[:3]
    print cs[0], "->", result
    for f,s in li:
        print df['video_name'][df['id']==s].values
    print '\n'

Staff was untrained -> Staff untrained
['Pest Control (Rodent management, cockroach management, ant management, fly management & mosquito man']
['soliciting feedback Cafe']
['Bathtub cleaning- Bathtub & corners scrubbing']


Did not receive linen on time from the vendor -> not receive linen time vendor
['Reception Area cleaning (Paintings, switch plate & coffee table cleaning)']
['order taking repeating & closing an order']
['Preparing to clean the room process']


Fresh linen not available -> Fresh linen not available
['Reception Area cleaning (Paintings, switch plate & coffee table cleaning)']
['Preparing to clean the room process']
['Room Layout & Amenities (Writing table contents & amenities basket)']


Did not receive the right quantity of linen back from the vendor -> not receive right quantity linen back vendor
['order taking repeating & closing an order']
['Reception Area cleaning (Paintings, switch plate & coffee table cleaning)']
['Preparing to clean the room process']


Room