In [2]:
import os,re, sys
from collections import Counter


#for dev only
import pandas as pd

sys.path.append(os.path.abspath('/Users/mq1/IU/csci_b551/bwcooley-mckquinn-a2/part3'))
import SeekTruth


In [3]:
# Relative paths of the training and test files.
train_path = '../deceptive.train.txt' 
test_path = '../deceptive.test.txt'
# Read both files with the project's SeekTruth.load_file helper.
train_data = SeekTruth.load_file(train_path)
test_data = SeekTruth.load_file(test_path)

In [4]:
# Check which Python type load_file returned for each data set.
print('Train data type: ', type(train_data))
print('Test data type: ', type(test_data))

Train data type:  <class 'dict'>
Test data type:  <class 'dict'>


In [5]:
# Show the class names stored under the 'classes' key of the training data.
train_data['classes']

['truthful', 'deceptive']

In [6]:
# Tally how many training entries carry each label.
label_count = Counter(train_data['labels'])

In [7]:
def word_in_dict(word_dict, word):
    '''Add one occurrence of word to word_dict.

    A word that is not yet a key starts at a count of 1. The dictionary is
    updated in place and the same object is returned.
    '''
    word_dict[word] = word_dict.get(word, 0) + 1
    return word_dict

In [58]:
def get_clean_text(words):
    '''Normalise a list of tokens and keep only purely alphabetic ones.

    Each token is lower-cased and has the characters in strip_chars removed
    from both ends. Tokens that are not entirely alphabetic afterwards
    (numbers, dates, words with inner apostrophes, empty strings) are dropped.

    Input:
        words(iterable of str): raw tokens.
    Return:
        list of str: the cleaned tokens, in their original order.
    '''
    strip_chars = ".,!;?'$)#(/"
    normalised = (token.lower().strip(strip_chars) for token in words)
    return [token for token in normalised if token.isalpha()]

In [59]:
def get_vocab(train_data):
    '''Count word occurrences in the training data, overall and per label.

    Every entry of train_data['objects'] is split on single spaces and the
    resulting tokens are passed through get_clean_text before being counted.

    Input:
        train_data(dict): training data with parallel lists under 'objects'
            (strings) and 'labels' (the label of the object at the same index).
    Return (in this order):
        words_dictionary(dict): word -> occurrences across all objects.
        truthful_words_dict(dict): word -> occurrences in objects whose label
            is anything other than 'deceptive'.
        deceptive_words_dict(dict): word -> occurrences in objects labeled
            'deceptive'.
        class_dict(dict): label -> number of entries in train_data['labels'].
    '''
    words_dictionary = dict()
    deceptive_words_dict = dict()
    truthful_words_dict = dict()
    for index, text in enumerate(train_data['objects']):
        words = get_clean_text(text.split(' '))
        if not words:
            # nothing to count for this object
            continue
        # The label belongs to the whole object, so choose the per-label
        # dictionary once instead of re-checking the label for every word.
        if train_data['labels'][index] == 'deceptive':
            label_words_dict = deceptive_words_dict
        else:
            label_words_dict = truthful_words_dict
        for w in words:
            # word_in_dict updates the dictionary it is given in place
            word_in_dict(words_dictionary, w)
            word_in_dict(label_words_dict, w)
    class_dict = dict(Counter(train_data['labels']))
    return words_dictionary, truthful_words_dict, deceptive_words_dict, class_dict

In [68]:


# Build the overall and per-label word counts, then report how many distinct
# words each dictionary holds.
vocab, truth_vocab, decep_vocab, class_dict = get_vocab(train_data)
print('Training data has: ',len(vocab), 'unique words')
print('Training data has: ',len(decep_vocab), 'deceptive unique words')
print('Training data has: ',len(truth_vocab), 'truthful unique words')


Training data has:  7532 unique words
Training data has:  5086 deceptive unique words
Training data has:  5552 truthful unique words


In [72]:
# For each vocabulary word, count how many of the two per-label
# vocabularies (deceptive, truthful) contain it.
words_in_class = {
    word: int(word in decep_vocab) + int(word in truth_vocab)
    for word in vocab
}


In [73]:
def get_tfidf(class_dict, words_in_class):
    '''Divide each word's words_in_class value by its count in class_dict.

    NOTE(review): despite the name, the value computed here is the plain
    ratio words_in_class[word] / class_dict[word], not the textbook tf-idf
    formula.

    Input:
        class_dict(dict): word -> count; these keys drive the result.
        words_in_class(dict): word -> numerator; needs an entry for every key
            of class_dict, otherwise a KeyError is raised.
    Return:
        dict: word -> words_in_class[word] / class_dict[word].
    '''
    return {
        word: words_in_class[word] / count
        for word, count in class_dict.items()
    }

In [74]:
# Score every word of the truthful vocabulary with get_tfidf.
tfidf_truthful = get_tfidf(truth_vocab, words_in_class)


In [75]:
# Display the scores. NOTE(review): this renders the whole dictionary;
# consider showing only a slice to keep the notebook output short.
tfidf_truthful

{'we': 0.0015174506828528073,
 'stayed': 0.00749063670411985,
 'for': 0.0017825311942959,
 'a': 0.0007945967421533572,
 'one': 0.00819672131147541,
 'night': 0.009569377990430622,
 'getaway': 0.15384615384615385,
 'with': 0.0030816640986132513,
 'family': 0.047619047619047616,
 'on': 0.00291970802919708,
 'thursday': 0.4,
 'triple': 1.0,
 'aaa': 0.6666666666666666,
 'rate': 0.02857142857142857,
 'of': 0.0017316017316017316,
 'was': 0.0010111223458038423,
 'steal': 0.5,
 'floor': 0.014184397163120567,
 'room': 0.0021008403361344537,
 'complete': 0.18181818181818182,
 'plasma': 0.2222222222222222,
 'tv': 0.0425531914893617,
 'bose': 0.5,
 'stereo': 1.0,
 'voss': 1.0,
 'and': 0.0006668889629876625,
 'evian': 0.5,
 'water': 0.024096385542168676,
 'gorgeous': 0.18181818181818182,
 'tub': 0.06060606060606061,
 'but': 0.0038910505836575876,
 'fine': 0.09523809523809523,
 'us': 0.007751937984496124,
 'concierge': 0.029850746268656716,
 'very': 0.0035026269702276708,
 'helpful': 0.0165289256198

In [62]:
def get_basic_prob(word, vocab):
    '''Return the fraction of the total count in vocab that belongs to word.

    Input:
        word: key to look up; it must be present in vocab (a missing key
            raises KeyError).
        vocab(dict): key -> count.
    Return:
        float: vocab[word] divided by the sum of all counts in vocab.
    '''
    occurrences = vocab[word]
    total = sum(vocab.values())
    return occurrences / total

In [63]:
def get_bayes_components(train_data):
    '''Collect, for every vocabulary word, the ratios used for a Bayes decision.

    Input:
        train_data(dict): training data in the form accepted by get_vocab.
    Return:
        bayes_components(dict): word -> list of five floats, in this order:
            [0] count of the word / total count in the whole vocabulary
            [1] count of the word / total count in the truthful vocabulary
            [2] count of the word / total count in the deceptive vocabulary
            [3] share of 'truthful' among the labels
            [4] share of 'deceptive' among the labels
            When a word does not occur in one of the per-label vocabularies,
            that entry alone is replaced by a tiny placeholder probability.
    Raises:
        KeyError: if the vocabulary is non-empty and 'truthful' or
            'deceptive' is missing from the label counts.
    '''
    vocab, truth_vocab, decep_vocab, class_dict = get_vocab(train_data)
    bayes_components = dict()
    if not vocab:
        # no words, nothing to compute (and no totals to divide by)
        return bayes_components

    # placeholder for a word never seen under a label
    unseen_prob = .0000000000000001

    # Totals and class priors are the same for every word; computing them once
    # avoids re-summing the dictionaries on each iteration.
    total_words = sum(vocab.values())
    total_truth = sum(truth_vocab.values())
    total_decep = sum(decep_vocab.values())
    total_labels = sum(class_dict.values())
    p_of_truth = class_dict['truthful'] / total_labels
    p_of_decept = class_dict['deceptive'] / total_labels

    for word, count in vocab.items():
        # The two lookups are independent: a word missing from one label's
        # vocabulary must not wipe out its probability under the other label.
        if word in truth_vocab:
            p_word_truth = truth_vocab[word] / total_truth
        else:
            p_word_truth = unseen_prob
        if word in decep_vocab:
            p_word_decep = decep_vocab[word] / total_decep
        else:
            p_word_decep = unseen_prob
        bayes_components[word] = [
            count / total_words,
            p_word_truth,
            p_word_decep,
            p_of_truth,
            p_of_decept,
        ]

    return bayes_components


In [64]:
# Show the class names stored under the 'classes' key of the training data.
train_data['classes']

['truthful', 'deceptive']

In [65]:
# Per-word probability components computed from the training data.
bayes_comp = get_bayes_components(train_data)

In [66]:
# Quick check: a date-like token is not alphabetic, so get_clean_text's
# isalpha() filter drops it.
test = '06/04/05'
test.isalpha()

False

In [79]:
# Fetch NLTK's stop-word corpus and keep the English list as a set
# for membership tests.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /Users/mq1/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [81]:
# Stop words that also occur in the training vocabulary.
in_data = [sw for sw in stop_words if sw in vocab]

In [86]:
# Sort the matched stop words in place so the listing is alphabetical.
in_data.sort()

In [87]:
# Display the stop words found in the training vocabulary.
in_data

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'did',
 'do',
 'does',
 'doing',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'has',
 'have',
 'haven',
 'having',
 'he',
 'her',
 'here',
 'herself',
 'him',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'it',
 'its',
 'itself',
 'just',
 'm',
 'me',
 'more',
 'most',
 'my',
 'myself',
 'no',
 'nor',
 'not',
 'now',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 's',
 'same',
 'she',
 'should',
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 'these',
 'they',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 'very',
 'was',
 'we',
 'were',
 'what',
 'when',
 'where',
 