In [59]:
## Load data from a csv file, pre-process the content of it and then generate the training and test dataset
## Cargar datos desde un fichero csv, preprocesar sus contenidos y generar el dataset de entreno y testing
import pandas as pd
from pandas import DataFrame

df = pd.read_csv('/home/zan/Downloads/deceptive-opinion.csv')

print(df)

      deceptive             hotel  polarity       source  \
0      truthful            conrad  positive  TripAdvisor   
1      truthful             hyatt  positive  TripAdvisor   
2      truthful             hyatt  positive  TripAdvisor   
3      truthful              omni  positive  TripAdvisor   
4      truthful             hyatt  positive  TripAdvisor   
...         ...               ...       ...          ...   
1595  deceptive  intercontinental  negative        MTurk   
1596  deceptive            amalfi  negative        MTurk   
1597  deceptive  intercontinental  negative        MTurk   
1598  deceptive            palmer  negative        MTurk   
1599  deceptive            amalfi  negative        MTurk   

                                                   text  
0     We stayed for a one night getaway with family ...  
1     Triple A rate with upgrade to view room was le...  
2     This comes a little late as I'm finally catchi...  
3     The Omni Chicago really delivers on all f

In [60]:
df.head(5)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [61]:
## Replace missing values & eliminate duplicated values/etc.
## Sustituir valores nulos/missing y eliminar valores duplicados 


In [62]:
## Eliminate columns -- 1-3 as they won't contribute to the model
## Eliminar columnas del 1 al 3 ya que no aportan informacion adiccional al modelo

df = df.drop(df.columns[[1, 2, 3]], axis = 1)
df.head(5)

Unnamed: 0,deceptive,text
0,truthful,We stayed for a one night getaway with family ...
1,truthful,Triple A rate with upgrade to view room was le...
2,truthful,This comes a little late as I'm finally catchi...
3,truthful,The Omni Chicago really delivers on all fronts...
4,truthful,I asked for a high floor away from the elevato...


In [63]:
## The deceptive and truthful registers are balanced: 800 - 800
## La clase deceptive esta balaneceada, tiene 800 registros deceptive y 800 registros truthful
print(len(df[df['deceptive'] == 'truthful']))
print(len(df[df['deceptive'] == 'deceptive']))

800
800


In [64]:
## Limpiar texto, opinion del usuario para luego poder convertir la secuencia a vector

from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

df['text'] = df['text'].apply(clean_text)

In [65]:
## Generate training and testing dataset
training_split = 0.75
testing_split = 0.25
pos_testing_split = 0.9 ## truthful + deceptive, los deceptive entre 5-10% de esta suma.
neg_deceptive_split = 0.1

deceptive_training = df[df.deceptive == 'truthful'].sample(frac = training_split)
newdf = df.drop(deceptive_training.index.values)
pos_deceptive_testing = newdf[newdf.deceptive == 'truthful'].sample(frac = pos_testing_split)
neg_deceptive_testing = newdf[newdf.deceptive == 'deceptive'].sample(int(neg_deceptive_split * (testing_split * len(df[df.deceptive == 'truthful']))))

frames = [pos_deceptive_testing, neg_deceptive_testing]
deceptive_testing = pd.concat(frames)
deceptive_testing = deceptive_testing.sample(frac = 1)

In [80]:
## NLTK tokenizer

import nltk
import gensim
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2: ## solo las palabras con longitud > 2
                continue
            tokens.append(word.lower())
    return tokens
train_dec = deceptive_training.apply(
    lambda x: TaggedDocument(words=tokenize_text(x['text']), tags=[x.deceptive]), axis=1)
test_dec = deceptive_testing.apply(
    lambda x: TaggedDocument(words=tokenize_text(x['text']), tags=[x.deceptive]), axis=1)

In [84]:
train_dec[0]

TaggedDocument(words=['stai', 'night', 'getawai', 'famili', 'thursdai', 'tripl', 'aaa', 'rate', 'steal', 'floor', 'room', 'complet', 'plasma', 'bose', 'stereo', 'voss', 'evian', 'water', 'gorgeou', 'bathroom', 'tub', 'fine', 'concierg', 'help', 'beat', 'locat', 'flaw', 'breakfast', 'pricei', 'servic', 'slow', 'hour', 'kid', 'adult', 'fridai', 'morn', 'tabl', 'restaur', 'food', 'good', 'worth', 'wait', 'return', 'heartbeat', 'gem', 'chicago'], tags=['truthful'])

In [85]:
## Build vocabulary using distributed bag of words
import multiprocessing
cores = multiprocessing.cpu_count()
dbow = Doc2Vec(dm=0, vector_size = 200, negative=5, hs=0, min_count = 2, sample = 0, workers = cores)
dbow.build_vocab(train_dec)

In [90]:
dbow['room']

array([-1.8037278e-03,  2.1953136e-03,  1.1495170e-03,  8.8310236e-04,
       -1.6926166e-03,  1.6877520e-03, -1.5254492e-03,  2.4682661e-03,
        9.6701650e-04, -2.0535148e-03,  3.6703848e-05,  1.1426355e-03,
        1.7509202e-03,  1.7010007e-03,  2.3059329e-04, -2.2608542e-03,
       -1.5779366e-03,  2.2881601e-03,  3.9826182e-04, -1.6058704e-03,
       -4.0598362e-04, -3.5867936e-04,  8.4698251e-05, -1.3729396e-03,
       -6.2670338e-04, -1.0128714e-03, -4.8350001e-04, -4.5406350e-04,
       -2.1785991e-03,  3.6262529e-04,  1.7407745e-03, -2.8794174e-04,
       -7.6318672e-04,  2.2440574e-03, -2.0945820e-03,  8.0177031e-04,
        1.4793742e-03,  2.4855782e-03, -1.6187216e-03,  2.2508982e-03,
       -2.3572525e-04, -1.7201689e-03, -2.3671582e-03,  1.1281503e-04,
        1.2804486e-03,  1.3086082e-03, -4.3787647e-04,  2.3927134e-03,
       -2.0401319e-03, -9.1459433e-04, -7.4702065e-04,  1.8326370e-03,
        9.9530863e-04, -1.5859428e-03,  1.0500132e-04, -1.8105486e-03,
      

In [None]:
%%time
from sklearn import utils
for epoch in range(30):
    dbow.train(utils.shuffle([x for x in tqdm(train_dec.values)]), total_examples=len(train_dec.values), epochs=1)
    dbow.alpha -= 0.002
    dbow.min_alpha = dbow.alpha

In [None]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [None]:
y_train, x_train = vec_for_learning(dbow, train_dec)
y_test, x_test = vec_for_learning(dbow, test_dec)

In [None]:
import numpy as np

summ = y_train + x_train
summ
#x_train, y_train = np.asarray(x_train).ravel(), np.asarray(y_train).ravel()
#x_test, y_test = np.asarray(x_test), np.asarray(y_test)
#x_train, y_train = x_train.tolist(), y_train.tolist()
#deceptive_training = pd.DataFrame({'deceptive': y_train, 'text':x_train}, columns = ['deceptive','text'])
#deceptive_training

In [None]:
deceptive_training.to_csv(index=False)
deceptive_testing.to_csv(index=False)