In [3]:
import pandas as pd

In [4]:
# Raw inputs, addressed relative to the notebook's working directory.
train_path = '../../data/raw/train.csv'
val_path = '../../data/raw/dev.csv'

train = pd.read_csv(train_path)
validation = pd.read_csv(val_path)

# Output directory for the generated training sets. Kept as a plain string
# ending in '/' because file names are appended to it with `+`.
# Project-relative instead of an absolute, user-specific path so the notebook
# runs on any checkout.
# NOTE(review): assumes the working directory sits two levels below the
# project root, exactly as the raw-data paths above already do.
base = '../../data/processed/dev/'

### Balancing Dataset
**Methodology:**

1. Separate out the negative examples (dominant class)
2. Determine the number of dataframes (`num_splits`) needed to incorporate all negative examples.
3. Create a list of dataframes containing the different splits of negative examples.
4. Concat the positive and negative examples back together.
    - For each new training set, include an 80% random sample of the positive examples to avoid overfitting to the
    positive examples

In [6]:
# Select the negative rows (label == 0) and shuffle them: frac=1 returns every
# row in random order. The fixed random_state makes the shuffle, and anything
# sliced from it afterwards, reproducible across kernel restarts.
full_negative_examples = train[train['label']==0].sample(frac=1, random_state=42)

In [9]:
# Obtaining the number of positive and negative examples
# to determine the number of splits.
# len() counts rows directly; `.count()[0]` counted the non-null values of the
# first column only and relied on positional indexing of a label-indexed Series.
positive_examples = train[train['label']==1]
num_pos_examples = len(positive_examples)
num_neg_examples = len(full_negative_examples)

# NOTE(review): round() goes to the nearest integer, so when the leftover
# negatives amount to less than half a split they end up in no training set.
# Confirm this is intended, given the stated goal of using all negative examples.
num_splits = int(round(num_neg_examples / num_pos_examples))

In [10]:
# Cut the shuffled negatives into consecutive chunks of `num_pos_examples` rows;
# the final chunk is capped at the end of the frame.
neg_train_data = [
    full_negative_examples.iloc[start:min(start + num_pos_examples, num_neg_examples)]
    for start in (split_idx * num_pos_examples for split_idx in range(num_splits))
]

In [11]:
training_sets = []
# The positive-row filter is identical for every split, so evaluate it once.
all_positive_examples = train[train['label']==1]
for i, negative_examples in enumerate(neg_train_data):
    train_set_fname = 'ac4119_train_set_{0}'.format(i)
    # Seeding with the split index keeps runs reproducible while still drawing
    # a different 80% sample of the positives for each training set.
    positive_examples = all_positive_examples.sample(frac=.8, random_state=i)
    # Unioning the positive and negative examples,
    # then shuffling so the two classes are interleaved rather than grouped.
    train_set = pd.concat([negative_examples, positive_examples], ignore_index=True).sample(frac=1, random_state=i)
    training_sets.append(train_set)
    # NOTE(review): the file name carries no '.csv' extension although the
    # content is comma-separated; left unchanged in case readers expect this name.
    train_set.to_csv(base + train_set_fname, index=False, sep=',')

In [13]:
# Report how many training sets the previous cell produced.
num_training_sets = len(training_sets)
return_text = "There are {0} training sets.".format(num_training_sets)
print(return_text)

There are 9 training sets.


### Cleaning Data

In [24]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from spellchecker import SpellChecker

In [31]:
# NLTK's English stop-word list, stored as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

def rm_stop_words(tokens):
    """Return the tokens whose lower-cased form is not in ``stop_words``.

    The surviving tokens keep their original casing and order.
    """
    return [token for token in tokens if token.lower() not in stop_words]

def rm_puctuation(tokens):
    """Strip ASCII punctuation from every token and drop tokens left empty.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list of the tokens with every character of ``string.punctuation``
        removed. Tokens that become ``''`` (they were pure punctuation or
        already empty) and tokens equal to ``' '`` are discarded. Order is
        preserved.
    """
    table = str.maketrans('', '', string.punctuation)
    # No separate "is this token punctuation?" pre-check is needed: a token made
    # only of punctuation characters translates to '' and is dropped below.
    stripped = (token.translate(table) for token in tokens)
    return [token for token in stripped if token not in ('', ' ')]

def filter_tokens(tokens):
    """Remove stop words from ``tokens``, then strip punctuation from what remains."""
    return rm_puctuation(rm_stop_words(tokens))

def spellcheck_tokens(tokens, spell=None):
    """Replace tokens flagged as unknown by the spell checker with its correction.

    Args:
        tokens: Iterable of word strings.
        spell: Optional checker object exposing ``unknown(words)`` and
            ``correction(word)``, such as a ``SpellChecker`` instance. Passing
            one in avoids constructing a new checker on every call. When
            omitted, a fresh ``SpellChecker()`` is created.

    Returns:
        A list with one entry per input token, in the original order. A token
        is replaced only when it appears in the checker's unknown set and the
        checker offers a correction; otherwise the original token is kept.
    """
    # Materialise once: the tokens are traversed twice below, which would
    # silently yield nothing on the second pass for a generator.
    tokens = list(tokens)
    if spell is None:
        spell = SpellChecker()

    # NOTE(review): pyspellchecker's unknown() appears to return lower-cased
    # words, so tokens containing capitals would never match here. Confirm
    # whether leaving them uncorrected is intended.
    misspelled = spell.unknown(tokens)

    cleaned_tokens = []
    for word in tokens:
        if word in misspelled:
            # correction() may return None when it has no candidate; fall back
            # to the original token instead of emitting None.
            cleaned_tokens.append(spell.correction(word) or word)
        else:
            cleaned_tokens.append(word)
    return cleaned_tokens

In [32]:
# Sample text used to benchmark both Spell Checkers.
# The passage contains misspellings ("accidenlly", "appropos"), contractions,
# quoted phrases and irregular spacing, so it exercises tokenisation,
# punctuation stripping and spelling correction together.
sample_text = """Typing this place into Yelp's search bar, I accidenlly typed "Brooklyn Crap" and thought to myself, how appropos.     It's a shame really. Brooklyn Crap has an amazing location and is really, really fun. Mini golf and other games and time wasters on the first floor really do make for a good experience. The problem is the food stinks, it's way overpriced and the wait staff seem like total novices.  The wait times are not only atrocious, but inaccurate.     It's such a shame - this place could've been one of the greats. Instead, it's one of the worst."""
print(sample_text)

Typing this place into Yelp's search bar, I accidenlly typed "Brooklyn Crap" and thought to myself, how appropos.     It's a shame really. Brooklyn Crap has an amazing location and is really, really fun. Mini golf and other games and time wasters on the first floor really do make for a good experience. The problem is the food stinks, it's way overpriced and the wait staff seem like total novices.  The wait times are not only atrocious, but inaccurate.     It's such a shame - this place could've been one of the greats. Instead, it's one of the worst.


In [33]:
# Full cleaning pipeline on the sample text: tokenize, filter, then spell-check.
tokens = word_tokenize(sample_text)
filtered_tokens = filter_tokens(tokens)
corrected_tokens = spellcheck_tokens(filtered_tokens)
print(corrected_tokens)

['Typing', 'place', 'Yelp', 's', 'search', 'bar', 'accidently', 'typed', 'Brooklyn', 'Crap', 'thought', 'apropos', 's', 'shame', 'really', 'Brooklyn', 'Crap', 'amazing', 'location', 'really', 'really', 'fun', 'Mini', 'golf', 'games', 'time', 'wasters', 'first', 'floor', 'really', 'make', 'good', 'experience', 'problem', 'food', 'stinks', 's', 'way', 'overpriced', 'wait', 'staff', 'seem', 'like', 'total', 'novices', 'wait', 'times', 'atrocious', 'inaccurate', 's', 'shame', 'place', 'could', 've', 'one', 'greats', 'Instead', 's', 'one', 'worst']
