In [1]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [2]:
# Run configuration, collected on a single namespace object.
args = Namespace()

# Input: raw Yelp review polarity CSVs (no header row).
args.raw_train_dataset_csv = "D:/data/yelp/yelp_review_polarity_csv/train.csv"
args.raw_test_dataset_csv = "D:/data/yelp/yelp_review_polarity_csv/test.csv"

# Fraction of each rating class kept from the raw training file.
args.proportion_subset_of_train = 0.1

# Per-class proportions used when assigning the train / valid / test split.
args.train_proportion = 0.7
args.val_proportion = 0.15
args.test_proportion = 0.15

# Output: destination of the munged CSV carrying the split column.
args.output_munged_csv = "D:/data/yelp/reviews_with_splits_lite.csv"

# Seed handed to numpy before shuffling.
args.seed = 1337

In [3]:
# Load the raw training CSV; the file has no header row, so name the two columns here.
train_reviews = pd.read_csv(
    args.raw_train_dataset_csv,
    header=None,
    names=['rating', 'review'],
)

In [4]:
# Build a class-balanced subset: keep the same leading fraction
# (args.proportion_subset_of_train) of every rating class, classes in ascending order.
#
# groupby/head selects each class's leading rows directly, instead of converting
# every raw row to a dict via iterrows() just to slice the first few per class.
subset_frames = []
for _, rating_rows in train_reviews.groupby('rating', sort=True):
    n_subset = int(args.proportion_subset_of_train * len(rating_rows))
    # head() keeps the rows in their original file order within the class
    subset_frames.append(rating_rows.head(n_subset))

if subset_frames:
    # ignore_index gives a fresh 0..n-1 index, as building from a list of records would
    review_subset = pd.concat(subset_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the same columns
    review_subset = train_reviews.iloc[0:0].copy()

In [5]:
# Number of reviews per rating class in the full raw training set
train_reviews["rating"].value_counts()

1    280000
2    280000
Name: rating, dtype: int64

In [6]:
# Number of reviews per rating class after subsetting
review_subset["rating"].value_counts()

1    28000
2    28000
Name: rating, dtype: int64

In [7]:
# Distinct rating values present in the subset
set(review_subset["rating"])

{1, 2}

In [8]:
# Assign every review in the subset to a train / valid / test split.
# The split is done separately within each rating, so each split keeps the
# same class balance as the subset.
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())

final_list = []
# Seed once, before any shuffling, so the resulting split is reproducible.
np.random.seed(args.seed)

# int() truncates each count, so the three counts can add up to fewer than
# n_total and leave trailing items without a 'split' key (NaN in the final
# frame). When the proportions are meant to cover everything, the remainder
# goes to the test split instead.
proportions_cover_all = np.isclose(
    args.train_proportion + args.val_proportion + args.test_proportion, 1.0
)

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_valid = int(args.val_proportion * n_total)
    if proportions_cover_all:
        n_test = n_total - n_train - n_valid
    else:
        # Proportions deliberately sum to something other than 1: keep the
        # requested test size; items past it stay unlabelled.
        n_test = int(args.test_proportion * n_total)

    valid_end = n_train + n_valid
    test_end = valid_end + n_test

    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'

    for item in item_list[n_train:valid_end]:
        item['split'] = 'valid'

    for item in item_list[valid_end:test_end]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [9]:
# Collect the labelled records into a DataFrame (nothing is written to disk here)
final_reviews = pd.DataFrame(data=final_list)

In [10]:
# Number of reviews assigned to each split
final_reviews["split"].value_counts()

train    39200
valid     8400
test      8400
Name: split, dtype: int64

In [11]:
# Preprocess the reviews
def preprocess_text(text):
    """Normalize one review string for tokenization on whitespace.

    Steps, in order:
      1. lowercase the text;
      2. surround each of ``. , ! ?`` with a space on both sides;
      3. collapse every run of characters other than letters and ``. , ! ?``
         (digits, whitespace, other symbols) into a single space.

    Args:
        text: the raw review string.

    Returns:
        The normalized string. Leading/trailing spaces produced by the
        substitutions are not stripped.
    """
    lowered = text.lower()
    punctuation_spaced = re.sub(r"([.,!?])", r" \1 ", lowered)
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", punctuation_spaced)
    
# Normalize every review in place of the raw text
final_reviews["review"] = final_reviews["review"].apply(preprocess_text)

In [12]:
# Preview the first rows after text preprocessing
final_reviews.head(5)

Unnamed: 0,rating,review,split
0,1,terrible place to work for i just heard a stor...,train
1,1,"hours , minutes total time for an extremely s...",train
2,1,my less than stellar review is for service . w...,train
3,1,i m granting one star because there s no way t...,train
4,1,the food here is mediocre at best . i went aft...,train


In [13]:
# Translate the numeric ratings into string labels.
# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on already-labelled data is a no-op; a bare dict.get would turn
# every existing label into None on a second run.
rating_labels = {1: 'negative', 2: 'positive'}
final_reviews['rating'] = final_reviews.rating.apply(
    lambda rating: rating_labels.get(rating, rating)
)

In [14]:
# Preview the first rows after relabelling the ratings
final_reviews.head(5)

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extremely s...",train
2,negative,my less than stellar review is for service . w...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went aft...,train


In [15]:
# Persist the processed reviews (rating, review, split) without the index column
output_path = args.output_munged_csv
final_reviews.to_csv(output_path, index=False)

In [19]:
# Spot-check a single row, selected by integer position
final_reviews.iloc[23279, :]

rating                                             negative
review    am captive in corporate cafeteria they cater t...
split                                                 valid
Name: 23279, dtype: object