In [2]:
## Load the data from a CSV file, pre-process its content and then generate the training and test datasets.
## (This first cell only performs the loading step.)
import pandas as pd
from pandas import DataFrame

# Read the raw review corpus; the path is relative to the notebook's working directory.
df = pd.read_csv('../../Deceptive_Opinion_Spam_Corpus_Datasets/deceptive-opinion.csv')

# Print the frame's plain-text repr (pandas abbreviates long frames and long text cells).
print(df)

      deceptive             hotel  polarity       source  \
0      truthful            conrad  positive  TripAdvisor   
1      truthful             hyatt  positive  TripAdvisor   
2      truthful             hyatt  positive  TripAdvisor   
3      truthful              omni  positive  TripAdvisor   
4      truthful             hyatt  positive  TripAdvisor   
...         ...               ...       ...          ...   
1595  deceptive  intercontinental  negative        MTurk   
1596  deceptive            amalfi  negative        MTurk   
1597  deceptive  intercontinental  negative        MTurk   
1598  deceptive            palmer  negative        MTurk   
1599  deceptive            amalfi  negative        MTurk   

                                                   text  
0     We stayed for a one night getaway with family ...  
1     Triple A rate with upgrade to view room was le...  
2     This comes a little late as I'm finally catchi...  
3     The Omni Chicago really delivers on all f

In [35]:
df.head(5)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [36]:
## 1. Data cleaning: missing data, noisy data.
## Look for missing values and for duplicated rows.
## (Original intent: replace missing values and eliminate duplicated values.)
missing_values = ["n/a", "na", "--"]  # NOTE(review): assigned but not used in any visible cell
df.isnull().values.any()  # result is discarded: only the cell's last expression is displayed
df.isnull().sum() ## Author's note: there are no missing values (this result is not displayed either)
df[df.duplicated(keep=False)] ## duplicated pairs: 803-853, 847-862,  995-1014, 1085-1109
#df = df.drop_duplicates() ## There are four duplicates in the dataset (drop left disabled, so they stay in df)
## Remove the data that is of no interest for training
## df = df.drop(df.columns[[1, 2, 3]], axis = 1)


#df.iloc[803] == df.iloc[853]
#df.iloc[803].equals(df.iloc[853])

Unnamed: 0,deceptive,hotel,polarity,source,text
803,truthful,omni,negative,Web,My daughter and I woke in the morning wanting ...
847,truthful,omni,negative,Web,The Omni was chosen for it's location whichwor...
853,truthful,omni,negative,Web,My daughter and I woke in the morning wanting ...
862,truthful,omni,negative,Web,The Omni was chosen for it's location whichwor...
995,truthful,affinia,negative,Web,"I'd been searching for a cool, non-chain hotel..."
1014,truthful,affinia,negative,Web,"I'd been searching for a cool, non-chain hotel..."
1085,truthful,monaco,negative,Web,Very disappointed in our stay in Chicago Monoc...
1109,truthful,monaco,negative,Web,Very disappointed in our stay in Chicago Monoc...


In [37]:
## 2.Data transformation: normalization, attribute selection, discretization, hierarchy generation

## Normalising the data into a fixed interval is not needed: there are no continuous-valued fields.
## Attribute selection: keep only the features of interest.
## Columns 1-3 (hotel, polarity, source) are removed because they add no extra information to the model.

## Drop by column name instead of by position, and ignore columns that are already gone:
## the positional form df.columns[[1, 2, 3]] raises IndexError when this cell is run a second
## time, because only two columns are left by then.
df = df.drop(columns=['hotel', 'polarity', 'source'], errors='ignore')
df.head(5)


Unnamed: 0,deceptive,text
0,truthful,We stayed for a one night getaway with family ...
1,truthful,Triple A rate with upgrade to view room was le...
2,truthful,This comes a little late as I'm finally catchi...
3,truthful,The Omni Chicago really delivers on all fronts...
4,truthful,I asked for a high floor away from the elevato...


In [38]:
## Class balance check: print the number of truthful rows, then the number of deceptive rows.
## The corpus is balanced (800 truthful / 800 deceptive).
for label in ('truthful', 'deceptive'):
    print(int((df['deceptive'] == label).sum()))

800
800


In [39]:
## 3.Data Reduction: Data Cube aggregation, attribute subset selection, numerosity reduction, dimensionality reduction

## This stage is of interest for transforming the text reviews into vectors of numeric weights (int)

In [40]:
df['text'][239]

"The Hard Rock Hotel Chicago has become my favorite hotel. I've stayed there at least 5 times now and have never had anything other than a wonderful experience. As you might have guessed - it has a super Rock & Roll theme, with some music paraphanelia in the lobby and on each floor's elevator lobby. The rooms all have large photo murals that are themed to different musicians. You can request being on the floor of your favorite band (like KISS, Aerosmith, etc...) The rooms are GREAT. Well appointed. SUPER Comfortable beds and luxurious sheets and wonderous pillows. The large TV has a cool sound system that ramps up the viewing experience. There's a spacious desk in each room with a mini bar. I've always had a room with big windows and a view down Michigan Ave (the hotel sits right on Michigan - a block from the canal). The bathrooms are furnished with absolutely great fixtures sporting a great design... some of the showers have windows in side of them (you can shower AND enjoy the view 

In [41]:
#new_row = {'deceptive': 'truthful', 'text': 'This game action takes place in the medieval era where knights and kings fought along!'}
#df = df.append(new_row, ignore_index=True)
#df2 = pd.DataFrame(["truthful", "We liked this hotel a lot we thoroughly recommend it"], columns=["deceptive", "text"], index=[1600])
#df = df.append(df2)

## Limpiar texto

In [42]:
from collections import Counter
Counter(" ".join(df["text"]).split()).most_common(7)

[('the', 12772),
 ('and', 7735),
 ('to', 6671),
 ('a', 6312),
 ('I', 5941),
 ('was', 5777),
 ('in', 3587)]

In [55]:
from collections import Counter
from gensim.parsing.preprocessing import strip_multiple_whitespaces

## Remove the 7 most frequent whitespace-delimited tokens from every review.
## They are dropped as whole tokens. Series.str.replace() performs substring (and, on older
## pandas, regex) replacement, so replacing e.g. 'a' or 'the' would also delete those letters
## inside 'available' or 'there' and corrupt the remaining words.
## Side effect: runs of whitespace inside a review collapse to single spaces.
top_tokens = {token for token, _ in Counter(" ".join(df["text"]).split()).most_common(7)}
df['text'] = df['text'].apply(
    lambda review: " ".join(tok for tok in review.split() if tok not in top_tokens))

In [56]:
## Among the 100 most frequent whitespace-delimited tokens, remove those longer than 4 characters.
## They are dropped as whole tokens: substring replacement via Series.str.replace() would also
## mangle longer words that merely contain one of them (and, on older pandas, would interpret
## tokens holding punctuation such as '.' as regular expressions).
frequent_long_tokens = {
    token
    for token, _ in Counter(" ".join(df["text"]).split()).most_common(100)
    if len(token) > 4
}
df['text'] = df['text'].apply(
    lambda review: " ".join(tok for tok in review.split() if tok not in frequent_long_tokens))


In [122]:
## Another way of pre-processing the data
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import string
from nltk.stem.porter import *

def text_cleaning(text):
    """Normalise a raw review into a string of lower-cased, Porter-stemmed word tokens.

    Steps, in order: lower-case; drop square-bracketed spans; replace URLs and
    angle-bracket tags with a space; turn every remaining non-word character
    into a space; drop underscores (the only punctuation the non-word pass
    leaves behind); drop tokens that contain a digit; stem each token and
    re-join the tokens with single spaces.

    URLs and tags are handled *before* the generic non-word pass. That pass
    removes the '://', '.', '<' and '>' characters those patterns rely on, so
    running it first leaves the URL and tag patterns with nothing to match.

    Args:
        text: the review text as a ``str``.

    Returns:
        The cleaned text: stemmed tokens separated by single spaces.
    """
    text = text.lower()
    # Raw strings throughout: '\[', '\S', '\w' are invalid escapes in normal string literals.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub(r'\W', ' ', text)  # remove special chars (newlines included)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in text.split())

df['text']=df['text'].apply(text_cleaning)

In [43]:
## Clean the text (the user's review) so that the sequence can later be converted into a vector

from gensim import utils
import gensim.parsing.preprocessing as gsp

# Ordered list of gensim preprocessing functions; clean_text() applies them one after another.
# The per-filter notes follow the gensim.parsing.preprocessing documentation.
filters = [
           gsp.strip_tags,                  # remove <...> markup tags
           gsp.strip_punctuation,           # replace punctuation characters with spaces
           gsp.strip_multiple_whitespaces,  # collapse runs of whitespace
           gsp.strip_numeric,               # remove digits
           gsp.strip_short,                 # drop very short words (gensim's default minimum length applies)
           gsp.remove_stopwords,            # drop gensim's built-in stop words
           gsp.stem_text                    # Porter-stem every remaining word
          ]

def clean_text(s):
    """Lower-case a review and run it through every preprocessing step in ``filters``.

    The input is converted to ``str`` *before* it is lower-cased:
    ``bytes.lower()`` only folds ASCII letters, so lower-casing first would
    leave non-ASCII upper-case characters of a byte-string input untouched.

    Args:
        s: review text, as ``str`` or as bytes accepted by ``utils.to_unicode``.

    Returns:
        The text after all callables in the module-level ``filters`` list
        have been applied in order.
    """
    s = utils.to_unicode(s).lower()
    for f in filters:
        s = f(s)
    return s

df['text'] = df['text'].apply(clean_text)

In [44]:
## Removing the stop words may not be desirable: it would turn "I will not be returning" (negative) into "return" (positive)

In [45]:
df['text'][239]

'hard rock hotel chicago favorit hotel stai time wonder experi guess super rock roll theme music paraphanelia lobbi floor elev lobbi room larg photo mural theme differ musician request floor favorit band like kiss aerosmith room great appoint super comfort bed luxuri sheet wonder pillow larg cool sound ramp view experi spaciou desk room mini bar room big window view michigan av hotel sit right michigan block canal bathroom furnish absolut great fixtur sport great design shower window shower enjoi view michigan close waterproof drape great experi staff desk doorman great lobbi bar feel intim nice energi big screen catch latest score cours rock roll music pump place stai plain hotel great design fun memor hotel want return concern crazi charg park car overnight daili car park rate close night sorri go rate loop stai cool'

In [11]:
## Training - testing split for classifier
SPLIT_SEED = 42  ## fixed seed: without it every run draws different rows and no result can be reproduced
pos_testing_split = 0.90 ## test set = truthful + deceptive; the deceptive share is meant to be 5-10% of that total
neg_deceptive_split = 0.10

## Training set: 75% of each class, concatenated and then shuffled.
deceptive_training_pos = df[df.deceptive == 'truthful'].sample(frac = 0.75, random_state = SPLIT_SEED)
deceptive_training_neg = df[df.deceptive == 'deceptive'].sample(frac = 0.75, random_state = SPLIT_SEED)

frames = [deceptive_training_pos, deceptive_training_neg]
deceptive_training = pd.concat(frames)
deceptive_training = deceptive_training.sample(frac = 1, random_state = SPLIT_SEED)

## Held-out rows: everything that was not drawn into the training set.
newdf = df.drop(deceptive_training.index.values)

## Testing set: mostly truthful reviews plus a small share of deceptive ones, shuffled.
deceptive_testing_pos = newdf[newdf.deceptive == 'truthful'].sample(frac = pos_testing_split, random_state = SPLIT_SEED)
deceptive_testing_neg = newdf[newdf.deceptive == 'deceptive'].sample(frac = neg_deceptive_split, random_state = SPLIT_SEED)

frames = [deceptive_testing_pos, deceptive_testing_neg]
deceptive_testing = pd.concat(frames)
deceptive_testing = deceptive_testing.sample(frac = 1, random_state = SPLIT_SEED)


In [46]:
## Generate training and testing dataset
SPLIT_SEED = 42  ## fixed seed: without it every run draws different rows and no result can be reproduced
training_split = 0.75
testing_split = 0.25
pos_testing_split = 0.95 ## test set = truthful + deceptive; the deceptive share is meant to be 5-10% of that total
neg_deceptive_split = 0.05

truthful_reviews = df[df.deceptive == 'truthful']

## Train on truthful reviews only; every row not drawn here is held out in newdf.
deceptive_training = truthful_reviews.sample(frac = training_split, random_state = SPLIT_SEED)
newdf = df.drop(deceptive_training.index.values)
pos_deceptive_testing = newdf[newdf.deceptive == 'truthful'].sample(frac = pos_testing_split, random_state = SPLIT_SEED)

## Number of deceptive test reviews, sized relative to the truthful test share.
## round() instead of int(): int() truncates, so a product that lands just below a whole number
## because of floating-point error would silently lose one sample.
neg_testing_count = round(neg_deceptive_split * (testing_split * len(truthful_reviews)))
neg_deceptive_testing = newdf[newdf.deceptive == 'deceptive'].sample(neg_testing_count, random_state = SPLIT_SEED)

frames = [pos_deceptive_testing, neg_deceptive_testing]
deceptive_testing = pd.concat(frames)
deceptive_testing = deceptive_testing.sample(frac = 1, random_state = SPLIT_SEED)

In [None]:
## Try with the whole dataset: train on truthful (positive) data only and test on data
## balanced 50-50 between truthful and deceptive reviews.
SPLIT_SEED = 42  ## fixed seed: without it every run draws different rows and no result can be reproduced

deceptive_training = df[df.deceptive == 'truthful'].sample(frac = 0.75, random_state = SPLIT_SEED)
#deceptive_training_neg = df[df.deceptive == 'deceptive'].sample(frac = 0.75)

## Held-out rows: every deceptive review plus the truthful ones not used for training.
newdf = df.drop(deceptive_training.index.values)

## Half of the held-out truthful reviews and one eighth of all deceptive reviews.
## The training set holds no deceptive rows, so sampling them from df cannot leak training data.
deceptive_testing_pos = newdf[newdf.deceptive == 'truthful'].sample(frac = 0.5, random_state = SPLIT_SEED)
deceptive_testing_neg = df[df.deceptive == 'deceptive'].sample(frac = 0.125, random_state = SPLIT_SEED)

frames = [deceptive_testing_pos, deceptive_testing_neg]
deceptive_testing = pd.concat(frames)
deceptive_testing = deceptive_testing.sample(frac = 1, random_state = SPLIT_SEED)


In [47]:
deceptive_training

Unnamed: 0,deceptive,text
323,truthful,wife spent night getawai excurs choos pricelin...
368,truthful,spent night amalfi hotel chicago busi book kin...
825,truthful,great awesom servic serious peopl amaz nice ba...
264,truthful,recent trip chicago attend major trade pleasur...
223,truthful,wife spent night talbott room spaciou comfort ...
...,...,...
1148,truthful,book hotel busi trip abl rate travelzoo figur ...
249,truthful,arriv sofitel joint leisur busi fantast hotel ...
352,truthful,stai octob novemb cconfer beauti hotel locat p...
1080,truthful,week long stai hilton south michigan attend me...


In [48]:
## Show review 239 when it was drawn into the training sample. The split is random, so that
## label is not guaranteed to be present; fall back to the first sampled review instead of
## raising KeyError.
sample_label = 239 if 239 in deceptive_training.index else deceptive_training.index[0]
deceptive_training.loc[sample_label, 'text']

'hard rock hotel chicago favorit hotel stai time wonder experi guess super rock roll theme music paraphanelia lobbi floor elev lobbi room larg photo mural theme differ musician request floor favorit band like kiss aerosmith room great appoint super comfort bed luxuri sheet wonder pillow larg cool sound ramp view experi spaciou desk room mini bar room big window view michigan av hotel sit right michigan block canal bathroom furnish absolut great fixtur sport great design shower window shower enjoi view michigan close waterproof drape great experi staff desk doorman great lobbi bar feel intim nice energi big screen catch latest score cours rock roll music pump place stai plain hotel great design fun memor hotel want return concern crazi charg park car overnight daili car park rate close night sorri go rate loop stai cool'

In [49]:
## NLTK tokenizer

import nltk
import gensim
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens.

    The text is first split into sentences and then into words with NLTK's
    tokenizers; tokens shorter than two characters are discarded.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 2
    ]
# Wrap every training review in a TaggedDocument: `words` are the review's tokens and the single
# tag is the row's class label (the `deceptive` column), not a per-document identifier.
train_dec = deceptive_training.apply(
    lambda x: TaggedDocument(words=tokenize_text(x['text']), tags=[x.deceptive]), axis=1)
# Same wrapping for the testing reviews.
test_dec = deceptive_testing.apply(
    lambda x: TaggedDocument(words=tokenize_text(x['text']), tags=[x.deceptive]), axis=1)

In [50]:
## Print the tagged document of review 239 when it is part of the random training sample;
## otherwise print the first tagged document instead of raising KeyError.
tracked_label = 239 if 239 in train_dec.index else train_dec.index[0]
print(train_dec.loc[tracked_label])

TaggedDocument(['hard', 'rock', 'hotel', 'chicago', 'favorit', 'hotel', 'stai', 'time', 'wonder', 'experi', 'guess', 'super', 'rock', 'roll', 'theme', 'music', 'paraphanelia', 'lobbi', 'floor', 'elev', 'lobbi', 'room', 'larg', 'photo', 'mural', 'theme', 'differ', 'musician', 'request', 'floor', 'favorit', 'band', 'like', 'kiss', 'aerosmith', 'room', 'great', 'appoint', 'super', 'comfort', 'bed', 'luxuri', 'sheet', 'wonder', 'pillow', 'larg', 'cool', 'sound', 'ramp', 'view', 'experi', 'spaciou', 'desk', 'room', 'mini', 'bar', 'room', 'big', 'window', 'view', 'michigan', 'av', 'hotel', 'sit', 'right', 'michigan', 'block', 'canal', 'bathroom', 'furnish', 'absolut', 'great', 'fixtur', 'sport', 'great', 'design', 'shower', 'window', 'shower', 'enjoi', 'view', 'michigan', 'close', 'waterproof', 'drape', 'great', 'experi', 'staff', 'desk', 'doorman', 'great', 'lobbi', 'bar', 'feel', 'intim', 'nice', 'energi', 'big', 'screen', 'catch', 'latest', 'score', 'cours', 'rock', 'roll', 'music', 'pump

In [51]:
## Create the Doc2Vec model and build its vocabulary from the tagged training documents.
## NOTE(review): the variable is named `dbow` (distributed bag of words), but per the gensim
## documentation dm=1 selects the distributed-memory (PV-DM) algorithm and dm=0 selects DBOW.
## Confirm which of the two is intended.
from tqdm import tqdm
import multiprocessing
cores = multiprocessing.cpu_count()  # passed below as the number of training workers
dbow = Doc2Vec(dm=1, vector_size = 300, window = 5, min_count = 3, negative=5, workers = cores, alpha=0.025, min_alpha=0.001)
#dbow = Doc2Vec(vector_size = 1000, min_count = 0, alpha = 0.025, min_alpa = 0.025)
# tqdm only reports progress while the documents are copied into a list.
dbow.build_vocab([x for x in tqdm(train_dec.values)])

100%|██████████| 600/600 [00:00<00:00, 2529228.54it/s]


In [52]:
%%time
## Import shuffle() directly. `from sklearn import utils` would rebind the notebook-level name
## `utils`, which clean_text() needs to be gensim's `utils` (it calls utils.to_unicode), so
## re-running the cleaning cell after this one would fail with AttributeError.
from sklearn.utils import shuffle

## Train for 200 epochs on the training documents, presented in shuffled order.
dbow.train(shuffle([x for x in tqdm(train_dec.values)]), total_examples=len(train_dec.values), epochs=200)


100%|██████████| 600/600 [00:00<00:00, 891772.64it/s]


CPU times: user 46.3 s, sys: 3.35 s, total: 49.7 s
Wall time: 21.9 s


In [53]:
def vec_for_learning(model, input_docs):
    """Infer a feature vector for every tagged document and pair it with the document's label.

    Args:
        model: trained Doc2Vec-style model exposing ``infer_vector(words, epochs=...)``.
        input_docs: iterable of documents with ``tags`` and ``words`` attributes;
            the first tag of each document is used as its target label.

    Returns:
        ``(targets, feature_vectors)``: two equally long tuples, in input order.
        Both are empty tuples when ``input_docs`` yields no documents.
    """
    # `epochs` is the supported name for the number of inference passes; the older
    # `steps` keyword is rejected by gensim 4.x.
    pairs = [(doc.tags[0], model.infer_vector(doc.words, epochs=50)) for doc in input_docs]
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise ValueError.
        return (), ()
    targets, feature_vectors = zip(*pairs)
    return targets, feature_vectors

In [54]:
y_train, x_train = vec_for_learning(dbow, train_dec)
y_test, x_test = vec_for_learning(dbow, test_dec)

In [18]:
## Query word neighbours through the model's word vectors (`.wv`): calling most_similar() on
## the model itself is deprecated and no longer available in gensim 4.x.
dbow.wv.most_similar('night')

  dbow.most_similar('night')


[('afternoon', 0.2921980321407318),
 ('expedia', 0.2491123378276825),
 ('supposedli', 0.23475536704063416),
 ('wasnt', 0.23198136687278748),
 ('monei', 0.22694852948188782),
 ('meal', 0.22358886897563934),
 ('channel', 0.2217407524585724),
 ('lamp', 0.21949948370456696),
 ('com', 0.2192206233739853),
 ('central', 0.2185852825641632)]

In [19]:
len(x_train[260])

300

In [20]:
import numpy as np

## Serialise every inferred vector as one space-separated string so that it fits in a single
## CSV column, keeping the label next to it.
training_list = [[label, ' '.join(map(str, vector))] for label, vector in zip(y_train, x_train)]
testing_list = [[label, ' '.join(map(str, vector))] for label, vector in zip(y_test, x_test)]

## The frame names are re-bound here: from this point on they hold the serialised vectors,
## not the cleaned review text.
deceptive_training = pd.DataFrame(data=training_list, columns=["deceptive","text"])
deceptive_testing = pd.DataFrame(data=testing_list, columns=["deceptive","text"])
type(deceptive_training['text'])

pandas.core.series.Series

In [None]:
## Write the current deceptive_training / deceptive_testing frames to comma-separated CSV files,
## without the index column.
deceptive_training.to_csv(r'../../Deceptive_Opinion_Spam_Corpus_Datasets/deceptive_training.csv', index=False, sep=',')
deceptive_testing.to_csv(r'../../Deceptive_Opinion_Spam_Corpus_Datasets/deceptive_testing.csv', index=False, sep=',')

## Files produced without preprocessing or with only partial preprocessing

In [21]:
## Write the same two frames under the "_sin_proc" file names.
## NOTE(review): "sin_proc" presumably stands for "sin procesar" (unprocessed); what gets written
## depends on which preprocessing cells were run before this one — confirm.
deceptive_training.to_csv(r'../../Deceptive_Opinion_Spam_Corpus_Datasets/deceptive_training_sin_proc.csv', index=False, sep=',')
deceptive_testing.to_csv(r'../../Deceptive_Opinion_Spam_Corpus_Datasets/deceptive_testing_sin_proc.csv', index=False, sep=',')

In [45]:
## Write the same two frames under the "_sin_proc_v2" file names.
## NOTE(review): the difference from the "_sin_proc" files is not visible in the code; it
## presumably comes from the preprocessing cells executed beforehand — confirm.
deceptive_training.to_csv(r'../../Deceptive_Opinion_Spam_Corpus_Datasets/deceptive_training_sin_proc_v2.csv', index=False, sep=',')
deceptive_testing.to_csv(r'../../Deceptive_Opinion_Spam_Corpus_Datasets/deceptive_testing_sin_proc_v2.csv', index=False, sep=',')
    

In [66]:
## Data for the classifier: write the whole `df` frame (not the train/test split) to a
## comma-separated CSV file, without the index column.

df.to_csv(r'../../Deceptive_Opinion_Spam_Corpus_Datasets/deceptive_data_clasf.csv', index=False, sep=',')