In [1]:
import csv
import os

def load_data(fileName, maxSamples=100):
    """Load (text, label) pairs from a CSV file stored under ``./data``.

    The file is looked up in the ``data`` directory of the current working
    directory and decoded as latin1. The first row is treated as a header and
    skipped. Column 0 of every remaining row is the input text and column 1
    is its label.

    Args:
        fileName: name of the CSV file inside the ``data`` directory.
        maxSamples: maximum number of rows to keep (the first ``maxSamples``
            usable rows). Defaults to 100; pass ``None`` to keep every row.

    Returns:
        A tuple ``(inputs, outputs, labelNames)`` where ``inputs`` and
        ``outputs`` are parallel lists of strings and ``labelNames`` is the
        sorted list of distinct labels found in ``outputs``.
    """
    crtDir = os.getcwd()
    filePath = os.path.join(crtDir, 'data', fileName)

    inputs = []
    outputs = []
    with open(filePath, mode='r', encoding='latin1') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        # The first row holds the column names, not a sample.
        next(csv_reader, None)
        for row in csv_reader:
            if maxSamples is not None and len(inputs) >= maxSamples:
                break
            # csv.reader yields [] for blank lines; rows without both a text
            # and a label column cannot be used and would break indexing.
            if len(row) < 2:
                continue
            inputs.append(row[0])
            outputs.append(row[1])

    # Sorting makes the label order stable across runs; a bare set() iterates
    # strings in an order that depends on the interpreter's hash seed.
    labelNames = sorted(set(outputs))

    return inputs, outputs, labelNames

In [2]:
# prepare data for training and testing

import numpy as np

def separate_data(inputs, outputs, trainRatio=0.8, seed=5):
    """Split parallel input/output sequences into train and test subsets.

    A random sample of ``int(trainRatio * len(inputs))`` indexes (drawn
    without replacement) forms the training subset; every remaining index,
    in ascending order, forms the test subset.

    Note: this seeds NumPy's global random generator with ``seed`` on every
    call, so the split is reproducible but the global random state is reset
    as a side effect.

    Args:
        inputs: indexable sequence of samples.
        outputs: indexable sequence of labels, parallel to ``inputs``.
        trainRatio: fraction of samples used for training, in [0, 1].
        seed: seed passed to ``np.random.seed`` before sampling.

    Returns:
        ``(trainInputs, trainOutputs, testInputs, testOutputs)`` as lists.

    Raises:
        ValueError: if ``trainRatio`` is outside [0, 1].
    """
    if not 0 <= trainRatio <= 1:
        raise ValueError("trainRatio must be between 0 and 1")

    np.random.seed(seed)
    noSamples = len(inputs)
    indexes = list(range(noSamples))
    trainSample = np.random.choice(indexes, int(trainRatio * noSamples), replace=False)

    # Membership tests against a set are O(1); testing against the NumPy
    # array directly rescans the whole array for every index.
    trainIndexSet = set(trainSample.tolist())
    testSample = [i for i in indexes if i not in trainIndexSet]

    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]
    testInputs = [inputs[i] for i in testSample]
    testOutputs = [outputs[i] for i in testSample]
    return trainInputs, trainOutputs, testInputs, testOutputs


In [5]:
# Read the review data set, then split it into training and test subsets.
inputs, outputs, labelNames = load_data('reviews_mixed.csv')
(trainInputs, trainOutputs,
 testInputs, testOutputs) = separate_data(inputs, outputs)

# A free-standing sample sentence that later cells encode alongside the test data.
sentence = (
    "By choosing a bike over a car, I’m reducing my environmental footprint. "
    "Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement.."
)

In [47]:
# Extract some features from the raw text.

# Representation 1: Bag of Words (one token-count column per vocabulary word).
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

# The vocabulary is learned from the training texts only; the test texts and
# the sample sentence are encoded with that same vocabulary.
trainFeatures = vectorizer.fit_transform(trainInputs)
testFeatures = vectorizer.transform(testInputs)
testSentence = vectorizer.transform([sentence])

# Vocabulary size.
print("vocab size: ", len(vectorizer.vocabulary_),  " words")
# Number of training samples.
print("traindata size: ", len(trainInputs), " emails")
# Shape of the feature matrix (samples x vocabulary words).
print("trainFeatures shape: ", trainFeatures.shape)

# Vocabulary learned from the training data.
print('some words of the vocab: ', vectorizer.get_feature_names_out()[-20:])
# Extracted features: slice the sparse matrix first so that only the three
# displayed rows are converted to a dense array.
print('some features: ', trainFeatures[:3].toarray())

vocab size:  341  words
traindata size:  80  emails
trainFeatures shape:  (80, 341)
some words of the vocab:  ['was' 'wasn' 'water' 'we' 'wear' 'well' 'were' 'wet' 'which' 'whole'
 'window' 'windows' 'winter' 'with' 'work' 'working' 'workout' 'would'
 'you' 'your']
some features:  [[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [51]:
# Representation 2: tf-idf features at word granularity.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=50)

# The vocabulary and idf weights are learned from the training texts only;
# the test texts and the sample sentence reuse them.
trainFeatures = vectorizer.fit_transform(trainInputs)
testFeatures = vectorizer.transform(testInputs)
testSentence = vectorizer.transform([sentence])

# Vocabulary learned from the training data.
print('vocab: ', vectorizer.get_feature_names_out()[:10])
# Extracted features: slice the sparse matrix first so that only the three
# displayed rows are converted to a dense array.
print('features: ', trainFeatures[:3].toarray())

vocab:  ['all' 'and' 'are' 'area' 'bathroom' 'bed' 'bit' 'clean' 'cold'
 'comfortable']
features:  [[0.         0.14603507 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.51211449 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.34685183 0.19759403 0.20555329 0.20555329 0.         0.
  0.         0.         0.         0.         0.         0.
  0.3238264  0.         0.17861231 0.         0.         0.
  0.45121804 0.         0.         0.         0.19759403 0.
  0.22560902 0.20555329]
 [0.         0.81777684 0.         0.         0.         0.
  0.         0.57553543 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0

In [8]:
import spacy
# spaCy pipeline used below to turn texts into document vectors.
nlp = spacy.load('en_core_web_md')

# Sample text kept at module level.
text = (
    "By choosing a bike over a car, I’m reducing my environmental footprint. "
    "Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement.."
)


def text_to_vector_spacy(text):
    """Return the spaCy document vector for ``text``.

    The text is processed with the module-level ``nlp`` pipeline and the
    ``vector`` attribute of the resulting document is returned.
    """
    return nlp(text).vector

# Encode every training and test text as a document vector, one row per text.
trainFeatures = np.array(list(map(text_to_vector_spacy, trainInputs)))
testFeatures = np.array(list(map(text_to_vector_spacy, testInputs)))

# Sanity check: one feature row per training text.
print('train data size: ', len(trainInputs))
print('train features shape: ', trainFeatures.shape)


train data size:  80
train features shape:  (80, 300)


In [9]:
# Unsupervised classification (i.e. clustering) of the data.

from sklearn.cluster import KMeans

# Two clusters; the fixed random_state keeps the clustering repeatable.
unsupervisedClassifier = KMeans(random_state=0, n_clusters=2)
unsupervisedClassifier.fit(trainFeatures)

In [12]:
from collections import Counter

# KMeans numbers its clusters arbitrarily, so a cluster id cannot be used
# directly as an index into labelNames (that pairing is a coin flip).
# Instead, name each cluster after the most frequent training label among
# the training samples assigned to it.
# NOTE(review): assumes trainOutputs is aligned row-for-row with the features
# unsupervisedClassifier was fitted on — confirm if the fitting cell changes.
trainClusterIndexes = unsupervisedClassifier.labels_.tolist()
clusterToLabel = {}
for clusterIndex in set(trainClusterIndexes):
    memberLabels = [
        label
        for label, assigned in zip(trainOutputs, trainClusterIndexes)
        if assigned == clusterIndex
    ]
    clusterToLabel[clusterIndex] = Counter(memberLabels).most_common(1)[0][0]

computedTestIndexes = unsupervisedClassifier.predict(testFeatures)
computedTestOutputs = [clusterToLabel[int(value)] for value in computedTestIndexes]
for testInput, computedOutput in zip(testInputs, computedTestOutputs):
    print(testInput, " -> ", computedOutput)

# Disabled: classifying `sentence` requires testSentence to be encoded with
# the same feature extractor as the features KMeans was fitted on.
# sentenceOutput = clusterToLabel[int(unsupervisedClassifier.predict(testSentence)[0])]
# print()
# print(sentence," -> ",sentenceOutput)

The bed is very comfortable.  ->  positive
Very spacious rooms, quiet and very comfortable.  ->  positive
Corridors filthy
Room filthy
Electrical cables in room not safe
Whole building smelly
Shower repulsive  ->  negative
walls seem to have no sound insulation  ->  positive
The building was under renovation,  ->  positive
no elevator might be a challenge for some people  ->  positive
The bed was highly uncomfortable, although the engineer fixed it  ->  positive
bed, smell.  ->  positive
Detest the glass "door" if shower/tub .. with?  ->  negative
this was expected, clean towels and room cleaned every day.  ->  positive
More plug outlets with surge protectors.  ->  positive
Room was very spacious  ->  negative
Roof terrace great  ->  negative
No tea or coffee making facilities in the rooms  ->  positive
the room had aircon and we had earplugs and slept soundly.  ->  positive
Also, when the bright bathroom lights are turned on, it lights up the whole hotel room, shining thru the frosted

In [13]:
from sklearn.metrics import accuracy_score

# Clustering is unsupervised; the true test labels are used here only to
# score how well the predicted labels agree with them.
print("acc: ", accuracy_score(y_true=testOutputs, y_pred=computedTestOutputs))

acc:  0.45
