In [1]:
import csv
import os
import pandas as pd

In [2]:
# Pasul 1 - incarcare date

crtDir =  os.getcwd()
fileName = os.path.join(crtDir, 'data', 'reviews_mixed.csv')

data = []
with open(fileName) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            dataNames = row
        else:
            data.append(row)
        line_count += 1

text = [data[i][0] for i in range(len(data))][:100]
sentiment = [data[i][1] for i in range(len(data))][:100]
labels = list(set(sentiment))

print(text[:2])
print(labels[:2])

['The rooms are extremely small, practically only a bed.', 'Room safe did not work.']
['positive', 'negative']


In [3]:
# Pasul 2 - impartire date (antrenament si test)
import numpy as np

np.random.seed(5)
indexes = [i for i in range(len(text))]
trainSample = np.random.choice(indexes, int(0.8 * len(text)), replace=False)
testSample = [i for i in indexes if not i in trainSample]

trainInputs = [text[i] for i in trainSample]
trainOutputs = [sentiment[i] for i in trainSample]
testInputs = [text[i] for i in testSample]
testOutputs = [sentiment[i] for i in testSample]

print(' - train')
print(trainInputs[:3])
print(trainOutputs[:3])
print(' - test')
print(testInputs[:3])
print(testOutputs[:3])

 - train
['Just to give you an idea: the shutters of the windows were not working, did not go neither up or down - just hanging down only one side and the other up....', 'and hip and CLEAN!', "Toilet paper wasn't replaced everyday!"]
['negative', 'positive', 'negative']
 - test
['The bed is very comfortable.', 'Very spacious rooms, quiet and very comfortable.', 'Corridors filthy\nRoom filthy\nElectrical cables in room not safe\nWhole building smelly\nShower repulsive']
['positive', 'positive', 'negative']


In [5]:
# Pas 3: embedded features extracted by a pre-train model (in fact, word2vec pretrained model)

import gensim 

# Load Google's pre-trained Word2Vec 
crtDir =  os.getcwd()
modelPath = os.path.join(crtDir, 'models', 'GoogleNews-vectors-negative300.bin')

word2vecModel300 = gensim.models.KeyedVectors.load_word2vec_format(modelPath, binary=True) 
print(word2vecModel300.most_similar('support'))
print("vec for house: ", word2vecModel300["house"])

[('supporting', 0.6251285076141357), ('suport', 0.6071150302886963), ('suppport', 0.6053199768066406), ('Support', 0.6044272780418396), ('supported', 0.6009396314620972), ('backing', 0.6007589101791382), ('supports', 0.5269277691841125), ('assistance', 0.5207138061523438), ('sup_port', 0.5192490220069885), ('supportive', 0.5110024809837341)]
vec for house:  [ 1.57226562e-01 -7.08007812e-02  5.39550781e-02 -1.89208984e-02
  9.17968750e-02  2.55126953e-02  7.37304688e-02 -5.68847656e-02
  1.79687500e-01  9.27734375e-02  9.03320312e-02 -4.12109375e-01
 -8.30078125e-02 -1.45507812e-01 -2.37304688e-01 -3.68652344e-02
  8.74023438e-02 -2.77099609e-02  1.13677979e-03  8.30078125e-02
  3.57421875e-01 -2.61718750e-01  7.47070312e-02 -8.10546875e-02
 -2.35595703e-02 -1.61132812e-01 -4.78515625e-02  1.85546875e-01
 -3.97949219e-02 -1.58203125e-01 -4.37011719e-02 -1.11328125e-01
 -1.05957031e-01  9.86328125e-02 -8.34960938e-02 -1.27929688e-01
 -1.39648438e-01 -1.86523438e-01 -5.71289062e-02 -1.176

In [6]:
def featureComputation(model, data):
    features = []
    phrases = [ phrase.split() for phrase in data]
    for phrase in phrases:
        # compute the embeddings of all the words from a phrase (words of more than 2 characters) known by the model
        # vectors = [model[word] for word in phrase if (len(word) > 2) and (word in model.vocab.keys())]
        vectors = [model[word] for word in phrase if (len(word) > 2) and (word in model.index_to_key)]
        if len(vectors) == 0:
            result = [0.0] * model.vector_size
        else:
            result = np.sum(vectors, axis=0) / len(vectors)
        features.append(result)
    return features

trainFeatures = featureComputation(word2vecModel300, trainInputs)
testFeatures = featureComputation(word2vecModel300, testInputs)

In [7]:
# Pasul 4 - antrenare model de invatare nesupervizata (clustering)

# unsupervised classification ( = clustering) of data

from sklearn.cluster import AgglomerativeClustering

# unsupervisedClassifier = KMeans(n_clusters=2, random_state=0)
# unsupervisedClassifier.fit(trainFeatures)

# Perform Agglomerative Hierarchical Clustering
clustering = AgglomerativeClustering(n_clusters=2, linkage='ward')
cluster_labels = clustering.fit_predict(trainFeatures)

In [8]:
# Pasul 5 - testare model

computedTestIndexes = clustering.fit_predict(testFeatures)
computedTestOutputs = [labels[value] for value in computedTestIndexes]
for i in range(0, len(testInputs)):
    print(testInputs[i], " -> ", computedTestOutputs[i])

The bed is very comfortable.  ->  positive
Very spacious rooms, quiet and very comfortable.  ->  positive
Corridors filthy
Room filthy
Electrical cables in room not safe
Whole building smelly
Shower repulsive  ->  positive
walls seem to have no sound insulation  ->  positive
The building was under renovation,  ->  positive
no elevator might be a challenge for some people  ->  positive
The bed was highly uncomfortable, although the engineer fixed it  ->  positive
bed, smell.  ->  positive
Detest the glass "door" if shower/tub .. with?  ->  positive
this was expected, clean towels and room cleaned every day.  ->  positive
More plug outlets with surge protectors.  ->  positive
Room was very spacious  ->  positive
Roof terrace great  ->  positive
No tea or coffee making facilities in the rooms  ->  positive
the room had aircon and we had earplugs and slept soundly.  ->  positive
Also, when the bright bathroom lights are turned on, it lights up the whole hotel room, shining thru the frosted

In [9]:
# Pasul 6 - calcul metrici de performanta

from sklearn.metrics import accuracy_score

# just supposing that we have the true labels
print("acc: ", accuracy_score(testOutputs, computedTestOutputs))

acc:  0.35


In [10]:
# Pasul 4 - antrenare model de invatare nesupervizata (clustering)

# unsupervised classification ( = clustering) of data

from sklearn.cluster import SpectralClustering

# Perform Agglomerative Hierarchical Clustering
spectral_clustering = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', n_neighbors=10)
cluster_labels = clustering.fit_predict(trainFeatures)

In [11]:
# Pasul 5 - testare model

computedTestIndexes = clustering.fit_predict(testFeatures)
computedTestOutputs = [labels[value] for value in computedTestIndexes]
for i in range(0, len(testInputs)):
    print(testInputs[i], " -> ", computedTestOutputs[i])

The bed is very comfortable.  ->  positive
Very spacious rooms, quiet and very comfortable.  ->  positive
Corridors filthy
Room filthy
Electrical cables in room not safe
Whole building smelly
Shower repulsive  ->  positive
walls seem to have no sound insulation  ->  positive
The building was under renovation,  ->  positive
no elevator might be a challenge for some people  ->  positive
The bed was highly uncomfortable, although the engineer fixed it  ->  positive
bed, smell.  ->  positive
Detest the glass "door" if shower/tub .. with?  ->  positive
this was expected, clean towels and room cleaned every day.  ->  positive
More plug outlets with surge protectors.  ->  positive
Room was very spacious  ->  positive
Roof terrace great  ->  positive
No tea or coffee making facilities in the rooms  ->  positive
the room had aircon and we had earplugs and slept soundly.  ->  positive
Also, when the bright bathroom lights are turned on, it lights up the whole hotel room, shining thru the frosted

In [14]:
# Pasul 6 - calcul metrici de performanta

from sklearn.metrics import accuracy_score

# just supposing that we have the true labels
print("acc: ", accuracy_score(testOutputs, computedTestOutputs))

acc:  0.65
