In [1]:
import pandas as pd

In [2]:
# Raw inputs, addressed relative to the notebook's working directory.
train_path = '../../data/raw/train.csv'
val_path = '../../data/raw/dev.csv'

train = pd.read_csv(train_path)
validation = pd.read_csv(val_path)

# Output directory for the balanced training sets written further down.
# Expressed relative to the working directory, like every other path in this
# notebook, instead of pointing at one user's absolute home folder, so the
# notebook runs on any checkout of the project.
base = '../../data/processed/dev/'

### Balancing Dataset
**Methodology:**

1. Separate out the negative examples (dominant class)
2. Determine the number of dataframes (`num_splits`) needed to incorporate all negative examples.
3. Create a list of dataframes containing the different splits of negative examples.
4. Concat the positive and negative examples back together.
    - For each new training set, include an 80% random sample of the positive examples to avoid overfitting to the
    positive examples

In [3]:
# frac=1 returns every negative row in random order, i.e. a shuffle.
# random_state pins that order so the splits cut from this frame are the
# same on every Restart & Run All.
full_negative_examples = train[train['label']==0].sample(frac=1, random_state=42)

In [4]:
# Class sizes decide how many balanced training sets are needed.
positive_examples = train[train['label']==1]

# len() counts rows directly. The previous `.count()[0]` counted the non-null
# cells of the first column only and relied on positional indexing of a
# label-indexed Series, which recent pandas versions deprecate.
num_pos_examples = len(positive_examples)
num_neg_examples = len(full_negative_examples)

# Ceiling division (negated floor division, no float round-trip) so that a
# trailing partial chunk of negatives still gets its own split. round()
# silently dropped that chunk whenever the remainder was less than half of
# num_pos_examples, leaving some negatives out of every training set.
num_splits = -(-num_neg_examples // num_pos_examples)

In [5]:
# Cut the shuffled negatives into consecutive chunks of at most
# num_pos_examples rows, one chunk per split.
neg_train_data = []
for split_idx in range(num_splits):
    chunk_start = split_idx * num_pos_examples
    chunk_stop = min(chunk_start + num_pos_examples, num_neg_examples)
    neg_train_data.append(full_negative_examples[chunk_start:chunk_stop])

In [6]:
# Build one training set per chunk of negatives and write each one to disk.
SEED = 42  # base seed; offset by the split index so each set draws a different positive sample
all_positive_examples = train[train['label']==1]  # loop-invariant, so filter once

training_sets = []
for i, negative_examples in enumerate(neg_train_data):
    train_set_fname = 'ac4119_train_set_{0}'.format(i)
    # 80% sample of the positives for this split. Stored under its own name so
    # the full `positive_examples` frame from the earlier cell is not rebound
    # to a sample.
    sampled_positives = all_positive_examples.sample(frac=.8, random_state=SEED + i)
    # Unioning the positive and negative examples
    # Then shuffling so that not all negative examples are at the end
    train_set = pd.concat([negative_examples, sampled_positives], ignore_index=True).sample(frac=1, random_state=SEED + i)
    training_sets.append(train_set)
    train_set.to_csv(base + train_set_fname, index=False, sep=',')

# NOTE: `train_set` stays bound to the last set built above; the tokenising
# and vectorising cells further down read it.

In [7]:
# Report how many balanced training sets were produced.
num_training_sets = len(training_sets)
return_text = "There are {0} training sets.".format(num_training_sets)
print(return_text)

There are 9 training sets.


#### Cleaning Data

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from spellchecker import SpellChecker

In [9]:
import pickle

def load_obj(fname, directory='../../data/processed/dev/'):
    """Load a pickled Python object from ``<directory>/<fname>.pkl``.

    Parameters
    ----------
    fname : str
        File name without the ``.pkl`` extension.
    directory : str, optional
        Folder containing the pickle. Defaults to the relative
        ``../../data/processed/dev/`` folder this function always read from.

    Returns
    -------
    object
        The object stored in the pickle file.

    Raises
    ------
    FileNotFoundError
        If ``<directory>/<fname>.pkl`` does not exist.
    """
    import os  # local import: keeps this cell runnable without touching the import cells

    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by this project or come from a trusted source.
    with open(os.path.join(directory, fname + '.pkl'), 'rb') as f:
        return pickle.load(f)
    
# Word -> frequency tables loaded from pickles (names suggest one table per
# review class: real and fake).
word_freq_real = load_obj('word_freq_real_ac4119')
word_freq_fake = load_obj('word_freq_fake_ac4119')

# Rebuild each table ordered from most to least frequent
# (dicts preserve insertion order).
sorted_freq_fake = dict(sorted(word_freq_fake.items(), key=lambda pair: pair[1], reverse=True))
sorted_freq_real = dict(sorted(word_freq_real.items(), key=lambda pair: pair[1], reverse=True))

# The 100 most frequent words of each table.
top_fake_words = list(sorted_freq_fake)[:100]
top_real_words = list(sorted_freq_real)[:100]

# Words that appear in both top-100 lists.
top_overlapping_words = list(set(top_fake_words) & set(top_real_words))

In [10]:
# NLTK's English stop-word list, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def rm_stop_words(tokens):
    """Return the tokens whose lower-cased form is not in ``stop_words``.

    The surviving tokens keep their original casing and order.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

def rm_puctuation(tokens):
    """Strip punctuation characters from tokens and drop tokens left empty.

    A token that is itself a substring of ``string.punctuation`` is skipped
    outright. Every other token has all punctuation characters deleted, and
    is kept unless the result is ``''`` or ``' '``.
    """
    table = str.maketrans('', '', string.punctuation)
    clean = []
    for token in tokens:
        # Substring test against the punctuation string, e.g. ',' or '!'
        if token in string.punctuation:
            continue
        stripped = token.translate(table)
        if stripped in ['', ' ']:
            continue
        clean.append(stripped)
    return clean

def clean_tokens(tokens):
    """Drop stop words, then strip punctuation, and return the remaining tokens.

    NOTE: a second ``clean_tokens`` defined further down in this cell rebinds
    the name, so this lighter version is not the one later cells end up calling.
    """
    return rm_puctuation(rm_stop_words(tokens))

# Lazily-created SpellChecker shared by every call. Building a new instance
# per call (i.e. per review) repeats its dictionary loading each time.
_spell_checker = None


def spellcheck_tokens(tokens):
    """Replace tokens the spell checker does not know with its best correction.

    Parameters
    ----------
    tokens : list of str
        Tokens to check.

    Returns
    -------
    list of str
        Same length and order as ``tokens``. Unknown tokens are replaced by
        the suggested correction; all other tokens, and unknown tokens with
        no available correction, are returned unchanged.
    """
    global _spell_checker
    if _spell_checker is None:
        _spell_checker = SpellChecker()
    spell = _spell_checker

    misspelled = spell.unknown(tokens)
    # NOTE(review): `unknown` appears to lower-case its input, in which case
    # capitalised tokens never match `misspelled` below and are passed
    # through uncorrected (same as before this change). Confirm.

    # Correct each distinct unknown word once instead of once per occurrence.
    # `or word` keeps the original token when correction() has nothing to
    # offer (it can return None), so no None tokens reach later stages.
    corrections = {word: spell.correction(word) or word for word in misspelled}
    return [corrections.get(word, word) for word in tokens]

def rm_overlap_words(tokens):
    """Return the tokens that are not listed in ``top_overlapping_words``."""
    kept = []
    for candidate in tokens:
        if candidate in top_overlapping_words:
            continue
        kept.append(candidate)
    return kept

def clean_tokens(tokens):
    """Run the full token-cleaning pipeline and return the cleaned tokens.

    Stages, applied in order: stop-word removal, punctuation stripping,
    spell correction, removal of the top overlapping words.
    """
    cleaned = tokens
    for stage in (rm_stop_words, rm_puctuation, spellcheck_tokens, rm_overlap_words):
        cleaned = stage(cleaned)
    return cleaned

In [None]:
# Tokenise each review with NLTK, clean the tokens, and store the result
# alongside the review text in a new 'token_review' column.
review_tokens = train_set['review'].apply(lambda review_text: clean_tokens(word_tokenize(review_text)))
train_set['token_review'] = review_tokens

In [11]:
import pickle

def load_obj(fname, directory='../../data/processed/dev/'):
    """Load a pickled Python object from ``<directory>/<fname>.pkl``.

    NOTE: this repeats the ``load_obj`` definition from an earlier cell.

    Parameters
    ----------
    fname : str
        File name without the ``.pkl`` extension.
    directory : str, optional
        Folder containing the pickle. Defaults to the relative
        ``../../data/processed/dev/`` folder this function always read from.

    Returns
    -------
    object
        The object stored in the pickle file.

    Raises
    ------
    FileNotFoundError
        If ``<directory>/<fname>.pkl`` does not exist.
    """
    import os  # local import: keeps this cell runnable without touching the import cells

    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by this project or come from a trusted source.
    with open(os.path.join(directory, fname + '.pkl'), 'rb') as f:
        return pickle.load(f)

In [12]:
# Report how many of the top-100 words the two frequency tables share.
num_overlapping = len(top_overlapping_words)
print("Out of the top 100 words for both real and fake reviews {} overlapped".format(num_overlapping))

Out of the top 100 words for both real and fake reviews 87 overlapped


In [16]:
%timeit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cnt_vectorizer = CountVectorizer( tokenizer=clean_tokens, binary=True, min_df=5)
tfidf_vectorizer = TfidfVectorizer( tokenizer=clean_tokens, binary=True, min_df=5)
cnt_vectorizer.fit(train_set['review'])
tfidf_vectorizer.fit(train_set['review'])

cnt_X_train = cnt_vectorizer.transform(train_set['review'])
tfidf_X_train = tfidf_vectorizer.transform(train_set['review'])

# cnt_X_dev = cnt_vectorizer.transform(validation['review'])
# tfidf_X_dev = tfidf_vectorizer.transform(validation['review'])

KeyboardInterrupt: 