In [81]:
## Load data from a csv file, pre-process the content of it and then generate the training and test dataset
## Cargar datos desde un fichero csv, preprocesar sus contenidos y generar el dataset de entreno y testing
import pandas as pd
from pandas import DataFrame

df = pd.read_csv('/home/zan/Downloads/deceptive-opinion.csv')

print(df)

      deceptive             hotel  polarity       source  \
0      truthful            conrad  positive  TripAdvisor   
1      truthful             hyatt  positive  TripAdvisor   
2      truthful             hyatt  positive  TripAdvisor   
3      truthful              omni  positive  TripAdvisor   
4      truthful             hyatt  positive  TripAdvisor   
...         ...               ...       ...          ...   
1595  deceptive  intercontinental  negative        MTurk   
1596  deceptive            amalfi  negative        MTurk   
1597  deceptive  intercontinental  negative        MTurk   
1598  deceptive            palmer  negative        MTurk   
1599  deceptive            amalfi  negative        MTurk   

                                                   text  
0     We stayed for a one night getaway with family ...  
1     Triple A rate with upgrade to view room was le...  
2     This comes a little late as I'm finally catchi...  
3     The Omni Chicago really delivers on all f

In [82]:
df.head(5)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [83]:
## Replace missing values & eliminate duplicated values/etc.
## Sustituir valores nulos/missing y eliminar valores duplicados 
## 1.Data Cleaning: missing data, noisy data
missing_values = ["n/a", "na", "--"]
df.isnull().values.any()
df.isnull().sum() ## No hay valores de tipo missing
df[df.duplicated(keep=False)] ## 803-853, 847-862,  995-1014, 1085-1109
#df = df.drop_duplicates() ## Hay cuatro duplicados en el dataset
## Eliminar datos que no nos interesan para nuestro entrenamiento
## df = df.drop(df.columns[[1, 2, 3]], axis = 1)


#df.iloc[803] == df.iloc[853]
#df.iloc[803].equals(df.iloc[853])

Unnamed: 0,deceptive,hotel,polarity,source,text
803,truthful,omni,negative,Web,My daughter and I woke in the morning wanting ...
847,truthful,omni,negative,Web,The Omni was chosen for it's location whichwor...
853,truthful,omni,negative,Web,My daughter and I woke in the morning wanting ...
862,truthful,omni,negative,Web,The Omni was chosen for it's location whichwor...
995,truthful,affinia,negative,Web,"I'd been searching for a cool, non-chain hotel..."
1014,truthful,affinia,negative,Web,"I'd been searching for a cool, non-chain hotel..."
1085,truthful,monaco,negative,Web,Very disappointed in our stay in Chicago Monoc...
1109,truthful,monaco,negative,Web,Very disappointed in our stay in Chicago Monoc...


In [84]:
## 2.Data transformation: normalization, attribute selection, discretization, hierarchy generation

## Normalizacion de los datos dentro de un cierto intervalo: no es necesario al no tener campos con valores continuos
## Seleccionamos solo aquellas caracteristicas que nos interesen
## Eliminate columns -- 1-3 as they won't contribute to the model
## Eliminar columnas del 1 al 3 ya que no aportan informacion adiccional al modelo

df = df.drop(df.columns[[1, 2, 3]], axis = 1)
df.head(5)


Unnamed: 0,deceptive,text
0,truthful,We stayed for a one night getaway with family ...
1,truthful,Triple A rate with upgrade to view room was le...
2,truthful,This comes a little late as I'm finally catchi...
3,truthful,The Omni Chicago really delivers on all fronts...
4,truthful,I asked for a high floor away from the elevato...


In [85]:
## The deceptive and truthful registers are balanced: 800 - 800
## La clase deceptive esta balaneceada, tiene 800 registros deceptive y 800 registros truthful
print(len(df[df['deceptive'] == 'truthful']))
print(len(df[df['deceptive'] == 'deceptive']))

800
800


In [210]:
## 3.Data Reduction: Data Cube aggregation, attribute subset selection, numerosity reduction, dimensionality reduction

## Esta parte nos interesaria para modificar y convertir las opiniones de texto a un vectores de pesos(int) 

In [86]:
df['text'][1500]

"I stayed this hotel for 2 nights. I had high hopes seeing as this is a 4 star hotel and seems quite elegant based on pictures I've seen. Upon entering the building, you can easily see that it is clean and high class. The lobby was nice, and everything seemed as it should be. However, as soon as I made it into my room, it was completely different than everything previous to it. It was dirty, visible yellow stains on the walls and all around the restroom. The room too was not completely organized as all other hotel rooms I've been to. There was also a lingering smell from the last tenants. When I called the front desk about these issues, they sent someone up to fix it. I left the hotel about two hours and upon returning, they were still at it! This is an unreasonable amount of time to fix these glaring errors. The smell also did not completely disappear when they were done. As I was too tired, I just decided to deal with it for the rest of my stay here and focus on enjoying my vacation 

In [261]:
#new_row = {'deceptive': 'truthful', 'text': 'This game action takes place in the medieval era where knights and kings fought along!'}
#df = df.append(new_row, ignore_index=True)
#df2 = pd.DataFrame(["truthful", "We liked this hotel a lot we thoroughly recommend it"], columns=["deceptive", "text"], index=[1600])
#df = df.append(df2)

In [87]:
df

Unnamed: 0,deceptive,text
0,truthful,We stayed for a one night getaway with family ...
1,truthful,Triple A rate with upgrade to view room was le...
2,truthful,This comes a little late as I'm finally catchi...
3,truthful,The Omni Chicago really delivers on all fronts...
4,truthful,I asked for a high floor away from the elevato...
...,...,...
1595,deceptive,Problems started when I booked the InterContin...
1596,deceptive,The Amalfi Hotel has a beautiful website and i...
1597,deceptive,The Intercontinental Chicago Magnificent Mile ...
1598,deceptive,"The Palmer House Hilton, while it looks good i..."


## Limpiar texto

In [88]:
from collections import Counter
Counter(" ".join(df["text"]).split()).most_common(7)

[('the', 12772),
 ('and', 7735),
 ('to', 6671),
 ('a', 6312),
 ('I', 5941),
 ('was', 5777),
 ('in', 3587)]

In [89]:
from collections import Counter
from gensim.parsing.preprocessing import strip_multiple_whitespaces

for x in Counter(" ".join(df["text"]).split()).most_common(7):
    df['text'] = df['text'].str.replace(x[0], '')

In [90]:
## Limpiar texto, opinion del usuario para luego poder convertir la secuencia a vector

from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation, 
           gsp.strip_multiple_whitespaces, 
           gsp.strip_numeric, 
           #gsp.strip_short,
           #gsp.remove_stopwords, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

df['text'] = df['text'].apply(clean_text)

In [35]:
## Es posible que no sea necesario quitar los stopwords porque convirtiria: I will not be returning(negativo). en --> return(positivo)

In [91]:
df['text'][1596]

'the amlfi hotel hs beuti websit terior decortg but tht s bout it when my wife got here we were given kei room tht hd not even been clene the nternet ccess promis on hotel s websit ws down so couldn t ctch up on ny of busess hd tend do my wife thought tht drk design detil room mde her feel clustrophob like she ws sleepg side slvdor dli ptg all ll thi hotel ws not worth monei especilli sce we spent most of our time enjoyg citi somethg more csul comfortbl would hve been better it would probbl hve been clener o'

In [92]:
## Generate training and testing dataset
training_split = 0.75
testing_split = 0.25
pos_testing_split = 0.9 ## truthful + deceptive, los deceptive entre 5-10% de esta suma.
neg_deceptive_split = 0.1

deceptive_training = df[df.deceptive == 'truthful'].sample(frac = training_split)
newdf = df.drop(deceptive_training.index.values)
pos_deceptive_testing = newdf[newdf.deceptive == 'truthful'].sample(frac = pos_testing_split)
neg_deceptive_testing = newdf[newdf.deceptive == 'deceptive'].sample(int(neg_deceptive_split * (testing_split * len(df[df.deceptive == 'truthful']))))

frames = [pos_deceptive_testing, neg_deceptive_testing]
deceptive_testing = pd.concat(frames)
deceptive_testing = deceptive_testing.sample(frac = 1)

In [268]:
"""
Usada solo para ver el comportamiento del doc2vec
if deceptive_training['text'].str.contains('thi game action take place  mediev era where knight  king fought along').any() == False:
    new_row = {'deceptive': 'truthful', 'text': 'thi game action take place  mediev era where knight  king fought along'}
    deceptive_training = deceptive_training.append(new_row, ignore_index=True)
"""

In [270]:
#deceptive_training['text'].str.contains('thi game action take place  mediev era where knight  king fought along').any()

True

In [93]:
deceptive_training

Unnamed: 0,deceptive,text
906,truthful,gret bed but when first cme request room with ...
1099,truthful,hve sty hotel ll over world thi is probbl wors...
912,truthful,just hd confer re thei hve bed bug hve ll se b...
21,truthful,we went chicgo see n exhibit t art nstitut sel...
938,truthful,from check deprtur poorli run hotel t ok mute ...
...,...,...
74,truthful,from moment step up front entrnc my luxuri ser...
255,truthful,sty t monco on my wy skig t ester love it stff...
1075,truthful,our non smokg room smell veri bdly of stle cig...
1013,truthful,ani trveler who is loyl fvorit hotel hs experi...


In [94]:
deceptive_training['text'][1050]

'over hype over price the fct tht y hve complimentri we recept everi night t doesnt tke wy from fct tht y hve terribl bed plce isnt s clen s it should be for str hotel the hllwy smell like dog prlor crpet my room look like y hdn t been clene well ever the decort pillow on my bed were so filthi y were ctulli sted brown look like y hd been sittg outsid for everyon us n put bck on my bed the decor of hotel ws nice eclect but it wsnt enough convc me ever sty here g the bed ws most uncomfortbl thg hve everi tri sleep on couldnt sleep whole week tht sty re t ws hrd lumpi pillow were ty my fl sy is tht it is n over price holidi nn'

In [128]:
"""
from gensim.utils import simple_preprocess
deceptive_training['text'] = [simple_preprocess(line, deacc=True) for line in deceptive_training['text']]
print(deceptive_training['text'].head(10))
"""

0    [on, night, getai, with, famili, thursdai, tri...
1    [tripl, rate, with, upgrad, view, less, than, ...
2    [thi, come, littl, late, final, catch, review,...
3    [omni, chicago, realli, deliv, all, front, fro...
4    [ask, high, floor, ai, from, elev, that, what,...
5    [omni, on, night, follow, busi, meet, anoth, d...
6    [conrad, night, just, be, thanksgiv, had, corn...
7    [just, got, back, from, dai, chicago, shop, wi...
8    [arriv, omni, septemb, dai, took, ill, when, l...
9    [our, visit, chicago, chose, hyatt, due, it, l...
Name: text, dtype: object


In [95]:
## NLTK tokenizer

import nltk
import gensim
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2: ## solo las palabras con longitud >= 2 se tokenizan
                continue
            tokens.append(word.lower())
    return tokens
train_dec = deceptive_training.apply(
    lambda x: TaggedDocument(words=tokenize_text(x['text']), tags=[x.deceptive]), axis=1)
test_dec = deceptive_testing.apply(
    lambda x: TaggedDocument(words=tokenize_text(x['text']), tags=[x.deceptive]), axis=1)

In [70]:
"""
dbow = Doc2Vec(dm=1, vector_size = 200, negative=5, window = 3, min_count = 2, sample = 0, workers = cores)
vec = dbow.infer_vector(df['text'][50])
print(df['text'][50])
print(len(vec))
print("Top 10 values in Doc2Vec inferred vector:")
print(vec[:10])
"""

'\ndbow = Doc2Vec(dm=1, vector_size = 200, negative=5, window = 3, min_count = 2, sample = 0, workers = cores)\nvec = dbow.infer_vector(df[\'text\'][50])\nprint(df[\'text\'][50])\nprint(len(vec))\nprint("Top 10 values in Doc2Vec inferred vector:")\nprint(vec[:10])\n'

In [96]:
train_dec[1108]

TaggedDocument(words=['my', 'sister', 'sty', 'on', 'nd', 'floor', 're', 'ws', 'refrigerr', 'hllwy', 'for', 'first', 'three', 'night', 'bet', 'up', 'old', 'dorm', 'style', 'ws', 'not', 'until', 'we', 'ld', 'concierg', 'tht', 'we', 'were', 'gog', 'post', 'pictur', 'we', 'ok', 'of', 'it', 'on', 'tripdvisor', 'did', 'thg', 'get', 'remov', 'brekfst', 'dish', 'left', 'outsid', 'room', 'will', 'sty', 'on', 'floor', 'for', 'dy', 'or', 'two', 'rout', 'all', 'night', 're', 're', 'chime', 'tht', 'mke', 'nois', 'tht', 're', 'suppos', 'be', 'comformtg', 'or', 'relxg', 'rr', 're', 'just', 'nnoyg', 'there', 'is', 'no', 'wifi', 'no', 'mtter', 'wht', 'tell', 'you', 'pid', 'for', 'wifi', 'still', 'we', 'were', 'unbl', 'get', 'on', 'lstly', 'my', 'sister', 'split', 'bill', 'hd', 'lredi', 'pid', 'for', 'first', 'night', 'form', 'of', 'deposit', 'gve', 'her', 'credit', 'room', 'cost', 'with', 'citi', 'txe', 'etc', 'ws', 'night', 'know', 'comfort', 'nn', 'just', 'few', 'block', 'wy', 'is', 'significntli', '

In [72]:
"""
Utilizado solo en el ejemplo en el que se ha anadido una nueva entrada para ver que entrada le corresponde
count = 0
for x in train_dec.index:
    if x != 1596:
        count +=1
    else:
        break
count
"""

'\nUtilizado solo en el ejemplo en el que se ha anadido una nueva entrada para ver que entrada le corresponde\ncount = 0\nfor x in train_dec.index:\n    if x != 1596:\n        count +=1\n    else:\n        break\ncount\n'

In [97]:
## Build vocabulary using distributed bag of words
from tqdm import tqdm
import multiprocessing
cores = multiprocessing.cpu_count()
dbow = Doc2Vec(dm=1, vector_size = 300, window = 7, min_count = 2, sample = 0, workers = cores)
dbow.build_vocab([x for x in tqdm(train_dec.values)])

100%|██████████| 600/600 [00:00<00:00, 1230001.17it/s]


In [98]:
%%time
from sklearn import utils
for epoch in range(50):
    dbow.train(utils.shuffle([x for x in tqdm(train_dec.values)]), total_examples=len(train_dec.values), epochs=1)
    dbow.alpha -= 0.002
    dbow.min_alpha = dbow.alpha

100%|██████████| 600/600 [00:00<00:00, 1111564.66it/s]
100%|██████████| 600/600 [00:00<00:00, 2130891.11it/s]
100%|██████████| 600/600 [00:00<00:00, 1729609.90it/s]
100%|██████████| 600/600 [00:00<00:00, 1448809.67it/s]
100%|██████████| 600/600 [00:00<00:00, 1409850.08it/s]
100%|██████████| 600/600 [00:00<00:00, 1915207.31it/s]
100%|██████████| 600/600 [00:00<00:00, 1739172.36it/s]
100%|██████████| 600/600 [00:00<00:00, 1605987.49it/s]
100%|██████████| 600/600 [00:00<00:00, 2021351.33it/s]
100%|██████████| 600/600 [00:00<00:00, 1395001.33it/s]
100%|██████████| 600/600 [00:00<00:00, 1737971.27it/s]
100%|██████████| 600/600 [00:00<00:00, 690989.13it/s]
100%|██████████| 600/600 [00:00<00:00, 1657827.67it/s]
100%|██████████| 600/600 [00:00<00:00, 794877.57it/s]
100%|██████████| 600/600 [00:00<00:00, 1447143.42it/s]
100%|██████████| 600/600 [00:00<00:00, 1233014.40it/s]
100%|██████████| 600/600 [00:00<00:00, 658963.71it/s]
100%|██████████| 600/600 [00:00<00:00, 1723686.58it/s]
100%|████████

In [99]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [100]:
y_train, x_train = vec_for_learning(dbow, train_dec)
y_test, x_test = vec_for_learning(dbow, test_dec)

In [77]:
dbow.most_similar('like')

[('pretty', 0.6873006224632263),
 ('done', 0.6788420677185059),
 ('sub-par', 0.6762683391571045),
 ('almost', 0.6740660667419434),
 ('freshly', 0.6658573150634766),
 ('which', 0.6627627015113831),
 ('although', 0.6531490087509155),
 ('what', 0.6520970463752747),
 ('stupid', 0.6519065499305725),
 ('favor', 0.6517404317855835)]

In [101]:
len(x_train[260])

300

In [102]:
import numpy as np

training_list = []
for tag, text_val in zip(y_train, x_train):
    listToStr = ' '.join([str(elem) for elem in text_val]) 
    training_list.append([tag,listToStr])

testing_list = []
for tag, text_val in zip(y_test, x_test):
    listToStr = ' '.join([str(elem) for elem in text_val]) 
    testing_list.append([tag,listToStr])

deceptive_training = pd.DataFrame(data=training_list, columns=["deceptive","text"])
deceptive_testing = pd.DataFrame(data=testing_list, columns=["deceptive","text"])
type(deceptive_training['text'])

pandas.core.series.Series

In [49]:
deceptive_training.to_csv(r'/home/zan/Desktop/dl_autoencoder/docs/autoencoder/deceptive_opinion_autoencoder/deceptive_training.csv', index=False, sep=',')
deceptive_testing.to_csv(r'/home/zan/Desktop/dl_autoencoder/docs/autoencoder/deceptive_opinion_autoencoder/deceptive_testing.csv', index=False, sep=',')

## Files without/partially preprocessing

In [103]:
deceptive_training.to_csv(r'/home/zan/Desktop/dl_autoencoder/docs/autoencoder/deceptive_opinion_autoencoder/deceptive_training_sin_proc.csv', index=False, sep=',')
deceptive_testing.to_csv(r'/home/zan/Desktop/dl_autoencoder/docs/autoencoder/deceptive_opinion_autoencoder/deceptive_testing_sin_proc.csv', index=False, sep=',')