In [1]:
import pandas as pd

In [2]:
train_path = '../../data/raw/train.csv'
val_path = '../../data/raw/dev.csv'

train = pd.read_csv(train_path)
validation = pd.read_csv(val_path)

base = '/Users/chuamelia/Google Drive/Spring 2020/Machine Learning/fake-review-detection-project/data/processed/dev/'

In [3]:
validation.groupby(['label']).count()

Unnamed: 0_level_0,ex_id,user_id,prod_id,rating,date,review
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,32270,32270,32270,32270,32270,32270
1,3648,3648,3648,3648,3648,3648


In [6]:
(3648+32270)

35918

### Balancing Dataset
**Methodology:**

1. Separate out the negative examples (dominant class)
2. Determine the number of dataframes (`num_splits`) needed to incorporate all negative examples.
3. Create a list of dataframes containing the different splits of negative examples.
4. Concat the positive and negative examples back together.
    - For each new training set, include a 80% random sample of the positive examples to aviod overfitting to the
    postive examples

In [3]:
# Setting frac = 1 to shuffle all the data
full_negative_examples = train[train['label']==0].sample(frac=1)

In [4]:
# Obtaining the number of positive and negative examples 
# to determine the number of splits  
positive_examples = train[train['label']==1]
num_pos_examples = positive_examples.count()[0]
num_neg_examples = full_negative_examples.count()[0]

num_splits = int(round(num_neg_examples / num_pos_examples))

In [5]:
neg_train_data = [full_negative_examples[ i * num_pos_examples : min((i + 1) * num_pos_examples, num_neg_examples)] for i in range(num_splits)]

In [6]:
training_sets = []
for i, negative_examples in enumerate(neg_train_data):
    train_set_fname = 'ac4119_train_set_{0}'.format(i)
    positive_examples = train[train['label']==1].sample(frac=.8)
    # Unioning the positive and negative examples 
    # Then shuffling so that not all negative examples are at the end
    train_set = pd.concat([negative_examples, positive_examples], ignore_index=True).sample(frac=1)
    training_sets.append(train_set)
    #train_set.to_csv(base + train_set_fname, index=False, sep=',')

In [7]:
return_text = "There are {0} training sets.".format(len(training_sets))
print(return_text )

There are 9 training sets.


#### Cleaning Data

In [6]:
!python -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 2.3 MB/s eta 0:00:01
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.5-py3-none-any.whl size=12012547 sha256=f6b1a5323f058e31c0a5a7f96875c002d5ccf8cf9a09bb6771737cd5e5f796d4
  Stored in directory: /private/var/folders/r1/8cxq0ypj6gxcdyyxfb6yyjmj051lpw/T/pip-ephem-wheel-cache-2oe8668v/wheels/b5/94/56/596daa677d7e91038cbddfcf32b591d0c915a1b3a3e3d3c79d
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/Users/chuamelia/anaco

In [3]:
import string
from spellchecker import SpellChecker
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [1]:
import pickle

def load_obj(fname):
    directory = '../../data/processed/dev/'
    # This writes out a python object as a pickle.
    with open(directory + fname + '.pkl', 'rb') as f:
        return pickle.load(f)
    
# load_obj(word_freq , 'word_freq_ac4119')
word_freq_real = load_obj('word_freq_real_ac4119')
word_freq_fake = load_obj('word_freq_fake_ac4119')

sorted_freq_fake = {k: v for k, v in sorted(word_freq_fake.items(), key=lambda item: item[1], reverse=True)}
sorted_freq_real = {k: v for k, v in sorted(word_freq_real.items(), key=lambda item: item[1], reverse=True)}

top_fake_words = list(sorted_freq_fake.keys())[:100]
top_real_words = list(sorted_freq_real.keys())[:100]

top_overlapping_words = list(set(top_fake_words).intersection(set(top_real_words)))

In [4]:
top_overlapping_words = ['', 'around', 'time', 'staff', 'wait', 'went', 'worth', 'made', 've', 'bit', 'pork', 'new', 'm', 'chicken', 'way', 'well', 'fresh', 'good', 'delicious', 'pretty', 'could', 'service', 'friendly', 'amazing', 'come', 'us', 'friend', 'like', 'night', 'go', 'place', 'also', 'bar', 'better', 'always', 'one', 'salad', 'make', 'got', 'food', 'great', 'long', 'never', 'say', 'dinner', 'try', 'two', 'know', 'nice', 'restaurant', 'definitely', 'order', 'people', 'would', 'eat', 'back', 'much', 'ever', 'really', 'sauce', 'table', 'even', 'menu', 'experience', 'ordered', 'perfect', 'want', 're', 'love', 'wine', 'cheese', 'came', 'meal', 'ca', 'nt', 's', 'right', 'small', 'everything', 'get', 'pizza', 'best', 'little', 'think', 'first', 'brunch', 'friends']

In [7]:
punctuations = string.punctuation

nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

parser = English()
spell = SpellChecker()

def custom_parser(sentence):
    pre_token = nlp(sentence)
    tokens = [token.text for token in pre_token ]
    # removing punctuation

    # correcting spelling mistakes 
    misspelled = spell.unknown(tokens)
    stg_tokens_1 = [spell.correction(word) if word in misspelled  else word for word in tokens ]

    # removing stop words and top overlapping words
    stg_tokens_2 = [ word for word in stg_tokens_1 if word not in stop_words and word not in top_overlapping_words ]
    new_sentence = ' '.join(stg_tokens_2)

    # retaining lemma
    parser = English()
    stg_tokens_3 = parser(new_sentence)
    stg_tokens_4 = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in stg_tokens_3 ]

    # removing punctuation
    custom_tokens = [ word for word in stg_tokens_4 if word not in punctuations]

    return custom_tokens

In [17]:
example = "IF YOU ARE A SELF RESPECTING WOMAN WHO IS LOOKING TO EAT SOME DELICIOUS FOOD OR RESERVE A TABLE FOR A BIRTHDAY PARTY LOOK ELSEWHERE PLEASE!!!!!! I made a reservation for a birthday dinner several days before the actual day. They only offered me two different times that were way apart. And by this I mean the interval was ridiculous. Something like 6:45 or 11:30. I opted for the earlier one, even though my friends worked until 7 and could not get there on time. Upon arriving, I took a seat at the bar and was kindly offered a cocktail by the AMAZINGLY helpful and welcoming bartenders. They had taken my card and put it on file to reserve the table, and told me my table would be mine for two hours. While I waited for my friends to arrive at the bar, the MANAGER came up to me and told me I was gorgeous, but he would need to personally take in my dress (since it was a flowing, shirt dress with no belt) because my ""dress did not do justice"". When I exclaimed, ""EXCUSE ME?!"" he proceeded to add that I needed to show more leg and my figure because the dress was unflattering. HE CLEARLY HAD A RING ON HIS FINGER AND IS A MARRIED MAN. When my friend interjected and made him know we saw his ring, he simply shrugged and told her she was mad because he didn't flirt with her. :0 Once my friends arrived at about 7:30, I went up to the maitre d and informed him. He looked at me and told me he sat down others in MY TABLE, WHICH WAS RESERVED FOR 2 HOURS. He then continued to add that walk-ins take precedence, which completely baffles me because, JEEZ what is then the point of making a reservation?! IT IS MEANT TO RESERVE A SPOT FOR AN ALLOTTED PERIOD OF TIME. Unfortunately these two massively uncomfortable instances were enough to ruin my perception of the restaurant. The drinks were delicious. The food was as well. But POOR POOR POOR POOR POOR and PUTRID SERVICE. I AM DISGUSTED. The owner would do well in firing their entire management staff. Like I said the food was good, but the quantity was a joke, as well as the pricing for such menial meals. The most surprising thing out of all of this was after I expressed my annoyance I was not even offered an apology. Simple. I will never return again. If you appreciate honesty at its best, you'd believe this review for all its worth, because I give credit where it's due."

pre_token = nlp(example)
tokens = [token.text for token in pre_token ]

# correcting spelling mistakes 
misspelled = spell.unknown(tokens)
stg_tokens_1 = [spell.correction(word) if word in misspelled  else word for word in tokens ]

# removing stop words and top overlapping words
stg_tokens_2 = [ word for word in stg_tokens_1 if word.lower() not in stop_words and word.lower() not in top_overlapping_words ]
new_sentence = ' '.join(stg_tokens_2)

# retaining lemma
parser = English()
stg_tokens_3 = parser(new_sentence)
stg_tokens_4 = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in stg_tokens_3 ]

# removing punctuation
custom_tokens = [ word for word in stg_tokens_4 if word not in punctuations]


In [18]:
print(stg_tokens_2)

['SELF', 'RESPECTING', 'WOMAN', 'LOOKING', 'RESERVE', 'BIRTHDAY', 'PARTY', 'LOOK', '!', '!', '!', '!', '!', '!', 'reservation', 'birthday', 'days', 'actual', 'day', '.', 'offered', 'different', 'times', 'apart', '.', 'mean', 'interval', 'ridiculous', '.', 'p45', '11:30', '.', 'opted', 'earlier', ',', 'worked', '7', '.', 'arriving', ',', 'took', 'seat', 'kindly', 'offered', 'cocktail', 'AMAZINGLY', 'helpful', 'welcoming', 'bartenders', '.', 'taken', 'card', 'file', 'reserve', ',', 'told', 'hours', '.', 'waited', 'arrive', ',', 'MANAGER', 'told', 'gorgeous', ',', 'need', 'personally', 'dress', '(', 'flowing', ',', 'shirt', 'dress', 'belt', ')', 'dress', 'justice', '.', 'exclaimed', ',', 'EXCUSE', '?', '!', 'proceeded', 'add', 'needed', 'leg', 'figure', 'dress', 'unflattering', '.', 'CLEARLY', 'RING', 'FINGER', 'MARRIED', 'MAN', '.', 'interjected', 'saw', 'ring', ',', 'simply', 'shrugged', 'told', 'mad', 'flirt', '.', 'm0', 'arrived', 'ac30', ',', 'maitre', 'd', 'informed', '.', 'looked',

In [17]:
%timeit clean_tokens(word_tokenize(train_set['review'][1]))

4.69 s ± 141 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%timeit clean_tokens(word_tokenize(train_set['review'][1]))

4.49 s ± 74 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
%timeit clean_tokens(word_tokenize(train_set['review'][1]))

4.56 s ± 54.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
train_set['token_review'] = train_set['review'].map(lambda x: clean_tokens(word_tokenize(x)))

In [None]:
import pickle

def load_obj(fname):
    directory = '../../data/processed/dev/'
    # This writes out a python object as a pickle.
    with open(directory + fname + '.pkl', 'rb') as f:
        return pickle.load(f)

In [None]:
print("Out of the top 100 words for both real and fake reviews {} overlapped".format(len(top_overlapping_words)))

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cnt_vectorizer = CountVectorizer( tokenizer=clean_tokens, binary=True, min_df=5)
tfidf_vectorizer = TfidfVectorizer( tokenizer=clean_tokens, binary=True, min_df=5)
cnt_vectorizer.fit(train_set['review'])
tfidf_vectorizer.fit(train_set['review'])

cnt_X_train = cnt_vectorizer.transform(train_set['review'])
tfidf_X_train = tfidf_vectorizer.transform(train_set['review'])

# cnt_X_dev = cnt_vectorizer.transform(validation['review'])
# tfidf_X_dev = tfidf_vectorizer.transform(validation['review'])

In [19]:
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, roc_curve

In [21]:
from sklearn.linear_model import LogisticRegression
params = {'solver':'liblinear', 'max_iter':1000, 'class_weight': 'balanced', 'random_state': 519}
tfidf_lr = LogisticRegression(**params )
# tfidf_lr = LogisticRegression()
fitted_tfidf_lr = tfidf_lr.fit(tfidf_X_train, Y_train)
# fitted_model = 

NameError: name 'tfidf_X_train' is not defined

In [None]:
fitted_model.score(X_train, Y_train)