In [356]:
df2 = pd.DataFrame({"deceptive": "truthful", "text": "This hotel has been great! We recommend it a lot."}, index = [1600])
df = df.append(df2)

In [366]:
## Load data from a csv file, pre-process the content of it and then generate the training and test dataset
## Cargar datos desde un fichero csv, preprocesar sus contenidos y generar el dataset de entreno y testing
import pandas as pd
from pandas import DataFrame

df = pd.read_csv('/home/zan/Downloads/deceptive-opinion.csv')

print(df)

      deceptive             hotel  polarity       source  \
0      truthful            conrad  positive  TripAdvisor   
1      truthful             hyatt  positive  TripAdvisor   
2      truthful             hyatt  positive  TripAdvisor   
3      truthful              omni  positive  TripAdvisor   
4      truthful             hyatt  positive  TripAdvisor   
...         ...               ...       ...          ...   
1595  deceptive  intercontinental  negative        MTurk   
1596  deceptive            amalfi  negative        MTurk   
1597  deceptive  intercontinental  negative        MTurk   
1598  deceptive            palmer  negative        MTurk   
1599  deceptive            amalfi  negative        MTurk   

                                                   text  
0     We stayed for a one night getaway with family ...  
1     Triple A rate with upgrade to view room was le...  
2     This comes a little late as I'm finally catchi...  
3     The Omni Chicago really delivers on all f

In [367]:
df.head(5)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [368]:
## Replace missing values & eliminate duplicated values/etc.
## Sustituir valores nulos/missing y eliminar valores duplicados 
## 1.Data Cleaning: missing data, noisy data
missing_values = ["n/a", "na", "--"]
df.isnull().sum() ## No hay valores de tipo missing
df[df.duplicated(keep=False)] ## 803-853, 847-862,  995-1014, 1085-1109
df = df.drop_duplicates() ## Hay cuatro duplicados en el dataset
## Eliminar datos que no nos interesan para nuestro entrenamiento
## df = df.drop(df.columns[[1, 2, 3]], axis = 1)


#df.iloc[803] == df.iloc[853]
#df.iloc[803].equals(df.iloc[853])

In [369]:
## 2.Data transformation: normalization, attribute selection, discretization, hierarchy generation

## Normalizacion de los datos dentro de un cierto intervalo: no es necesario al no tener campos con valores continuos
## Seleccionamos solo aquellas caracteristicas que nos interesen
## Eliminate columns -- 1-3 as they won't contribute to the model
## Eliminar columnas del 1 al 3 ya que no aportan informacion adiccional al modelo

df = df.drop(df.columns[[1, 2, 3]], axis = 1)
df.head(5)


Unnamed: 0,deceptive,text
0,truthful,We stayed for a one night getaway with family ...
1,truthful,Triple A rate with upgrade to view room was le...
2,truthful,This comes a little late as I'm finally catchi...
3,truthful,The Omni Chicago really delivers on all fronts...
4,truthful,I asked for a high floor away from the elevato...


In [370]:
## The deceptive and truthful registers are balanced: 800 - 800
## La clase deceptive esta balaneceada, tiene 800 registros deceptive y 800 registros truthful
print(len(df[df['deceptive'] == 'truthful']))
print(len(df[df['deceptive'] == 'deceptive']))

796
800


In [117]:
## 3.Data Reduction: Data Cube aggregation, attribute subset selection, numerosity reduction, dimensionality reduction

## Esta parte nos interesaria para modificar y convertir las opiniones de texto a un vectores de pesos(int) 

In [371]:
df['text'][1598]

"The Palmer House Hilton, while it looks good in pictures, and the outside, is actually a disaster of a hotel. When I went through, the lobby was dirty, my room hadn't been cleaned, and smelled thoroughly of smoke. When I requested more pillows, the lady on the phone scoffed at me and said she'd send them up. It took over an hour for 2 pillows. This hotel is a good example that what you pay for isn't always what you get. I will not be returning.\n"

In [372]:
## Limpiar texto, opinion del usuario para luego poder convertir la secuencia a vector

from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.strip_short,
           #gsp.remove_stopwords, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

df['text'] = df['text'].apply(clean_text)

In [373]:
Counter(" ".join(df["text"]).split()).most_common(7)

[('the', 15965),
 ('and', 7905),
 ('wa', 5828),
 ('hotel', 3642),
 ('room', 3494),
 ('for', 2886),
 ('stai', 2261)]

In [374]:
from collections import Counter
for x in Counter(" ".join(df["text"]).split()).most_common(7):
    df['text'] = df['text'].str.replace(x[0], '')


In [None]:
## Es posible que no sea necesario quitar los stopwords porque convirtiria: I will not be returning(negativo). en --> return(positivo)

In [375]:
df['text'][1598]

' palmer hous hilton while look good pictur   outsid actual disast  when went through  lobbi  dirti  hadn been clean  smell thoroughli smoke when request more pillow  ladi  phone scof  said she send m took over hour  pillow thi  good exampl that what you pai  isn ali what you get will not return'

In [376]:
## Generate training and testing dataset
training_split = 0.75
testing_split = 0.25
pos_testing_split = 0.9 ## truthful + deceptive, los deceptive entre 5-10% de esta suma.
neg_deceptive_split = 0.1

deceptive_training = df[df.deceptive == 'truthful'].sample(frac = training_split)
newdf = df.drop(deceptive_training.index.values)
pos_deceptive_testing = newdf[newdf.deceptive == 'truthful'].sample(frac = pos_testing_split)
neg_deceptive_testing = newdf[newdf.deceptive == 'deceptive'].sample(int(neg_deceptive_split * (testing_split * len(df[df.deceptive == 'truthful']))))

frames = [pos_deceptive_testing, neg_deceptive_testing]
deceptive_testing = pd.concat(frames)
deceptive_testing = deceptive_testing.sample(frac = 1)

In [377]:
from gensim.utils import simple_preprocess
df['text'] = [simple_preprocess(line, deacc=True) for line in df['text']] 
print(df['text'].head(10))

0    [on, night, getai, with, famili, thursdai, tri...
1    [tripl, rate, with, upgrad, view, less, than, ...
2    [thi, come, littl, late, final, catch, review,...
3    [omni, chicago, realli, deliv, all, front, fro...
4    [ask, high, floor, ai, from, elev, that, what,...
5    [omni, on, night, follow, busi, meet, anoth, d...
6    [conrad, night, just, be, thanksgiv, had, corn...
7    [just, got, back, from, dai, chicago, shop, wi...
8    [arriv, omni, septemb, dai, took, ill, when, l...
9    [our, visit, chicago, chose, hyatt, due, it, l...
Name: text, dtype: object


In [378]:
## NLTK tokenizer

import nltk
import gensim
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2: ## solo las palabras con longitud > 2 se tokenizan
                continue
            tokens.append(word.lower())
    return tokens
train_dec = deceptive_training.apply(
    lambda x: TaggedDocument(words=tokenize_text(x['text']), tags=[x.deceptive]), axis=1)
test_dec = deceptive_testing.apply(
    lambda x: TaggedDocument(words=tokenize_text(x['text']), tags=[x.deceptive]), axis=1)

In [379]:
train_dec[958]

TaggedDocument(words=['fianc', 'travel', 'chicago', 'first', 'time', 'thi', 'decemb', 'were', 'not', 'happi', 'with', 'thi', 'friendliest', 'peopl', 'staff', 'were', 'bell', 'boi', 'greet', 'with', 'smile', 'gave', 'lot', 'great', 'advic', 'throughout', 'our', 'rest', 'staff', 'howev', 'terribl', 'when', 'first', 'check', 'instantli', 'discourag', 'front', 'desk', 'person', 'not', 'good', 'mood', 'couldn', 'it', 'leav', 'lobbi', 'have', 'not', 'travel', 'here', 'be', 'were', 'look', 'excit', 'place', 'eat', 'sightse', 'ask', 'multipl', 'staff', 'member', 'front', 'desk', 'place', 'see', 'never', 'got', 'ani', 'good', 'recommend', 'felt', 'were', 'inconveni', 'staff', 'on', 'point', 'front', 'desk', 'person', 'phone', 'while', 'were', 'it', 'talk', 'her', 'she', 'phone', 'famili', 'four', 'came', 'behind', 'it', 'talk', 'her', 'well', 'even', 'though', 'she', 'saw', 'st', 're', 'after', 'she', 'finish', 'with', 'phone', 'call', 'she', 'ignor', 'help', 'famili', 'first', 'ye', 'ar', 'our

In [380]:
dbow = Doc2Vec(dm=1, vector_size = 200, negative=5, window = 3, min_count = 2, sample = 0, workers = cores)
vec = dbow.infer_vector(df['text'][50])
print(df['text'][50])
print(len(vec))
print("Top 10 values in Doc2Vec inferred vector:")
print(vec[:10])

AttributeError: 'Doc2VecTrainables' object has no attribute 'vectors_lockf'

In [381]:
## Build vocabulary using distributed bag of words
from tqdm import tqdm
import multiprocessing
cores = multiprocessing.cpu_count()
dbow = Doc2Vec(dm=1, vector_size = 200, negative=5, window = 3, min_count = 2, sample = 0, workers = cores)
dbow.build_vocab([x for x in tqdm(train_dec.values)])

100%|██████████| 597/597 [00:00<00:00, 1130984.41it/s]


In [382]:
%%time
from sklearn import utils
for epoch in range(30):
    dbow.train(utils.shuffle([x for x in tqdm(train_dec.values)]), total_examples=len(train_dec.values), epochs=1)
    dbow.alpha -= 0.002
    dbow.min_alpha = dbow.alpha

100%|██████████| 597/597 [00:00<00:00, 1369053.85it/s]
100%|██████████| 597/597 [00:00<00:00, 1313056.89it/s]
100%|██████████| 597/597 [00:00<00:00, 1802735.41it/s]
100%|██████████| 597/597 [00:00<00:00, 901367.71it/s]
100%|██████████| 597/597 [00:00<00:00, 1601022.69it/s]
100%|██████████| 597/597 [00:00<00:00, 1657180.34it/s]
100%|██████████| 597/597 [00:00<00:00, 1313745.80it/s]
100%|██████████| 597/597 [00:00<00:00, 1574842.45it/s]
100%|██████████| 597/597 [00:00<00:00, 1932098.37it/s]
100%|██████████| 597/597 [00:00<00:00, 1071000.64it/s]
100%|██████████| 597/597 [00:00<00:00, 1620711.64it/s]
100%|██████████| 597/597 [00:00<00:00, 1329086.78it/s]
100%|██████████| 597/597 [00:00<00:00, 830789.48it/s]
100%|██████████| 597/597 [00:00<00:00, 1132518.99it/s]
100%|██████████| 597/597 [00:00<00:00, 1521263.36it/s]
100%|██████████| 597/597 [00:00<00:00, 1590851.01it/s]
100%|██████████| 597/597 [00:00<00:00, 1815808.19it/s]
100%|██████████| 597/597 [00:00<00:00, 2127442.22it/s]
100%|███████

In [383]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [384]:
y_train, x_train = vec_for_learning(dbow, train_dec)
y_test, x_test = vec_for_learning(dbow, test_dec)

In [389]:
dbow.most_similar('eat')

[('where', 0.9833047986030579),
 ('ton', 0.9828348159790039),
 ('star', 0.9827905893325806),
 ('or', 0.981451153755188),
 ('thank', 0.9809910655021667),
 ('mean', 0.9799057841300964),
 ('expect', 0.979249119758606),
 ('workout', 0.9789624214172363),
 ('regardless', 0.9786323308944702),
 ('requir', 0.9778539538383484)]

In [158]:
len(x_train[597])

200

In [144]:
import numpy as np

training_list = []
for tag, text_val in zip(y_train, x_train):
    listToStr = ' '.join([str(elem) for elem in text_val]) 
    training_list.append([tag,listToStr])

testing_list = []
for tag, text_val in zip(y_test, x_test):
    listToStr = ' '.join([str(elem) for elem in text_val]) 
    testing_list.append([tag,listToStr])

deceptive_training = pd.DataFrame(data=training_list, columns=["deceptive","text"])
deceptive_testing = pd.DataFrame(data=testing_list, columns=["deceptive","text"])
type(deceptive_training['text'])

pandas.core.series.Series

In [145]:
deceptive_training.to_csv(r'/home/zan/Desktop/dl_autoencoder/docs/autoencoder/deceptive_opinion_autoencoder/deceptive_training.csv', index=False, sep=',')
deceptive_testing.to_csv(r'/home/zan/Desktop/dl_autoencoder/docs/autoencoder/deceptive_opinion_autoencoder/deceptive_testing.csv', index=False, sep=',')