## Preprocessing Steps ##

#### Data Cleaning
* Remove punctuations from `review_text`
* Lowercase all text 

#### Match review text with location
* For each piece of review text in `yelp_review_cleaned.csv`, we extract `business_id` to match it with `city` and `state` from the business data 
* Costruct a new csv file with columns `business_id`, `text`, `city`, `state`
* Group cities/states into more general regions (e.g. Southwest, Northeast)
* Add a new column to our csv file: `region`

In [10]:
import pandas as pd
import numpy as np
import string
import csv

In [11]:
def clean_text(text):
    # remove punctuations
    clean_text = [char for char in text if char not in string.punctuation]
    clean_text = ''.join(clean_text)
    # lower case all text
    clean_text = clean_text.lower()
    
    return clean_text

In [12]:
chunk_size = 200000
raw_data_path = 'yelp_academic_dataset_review.csv'
clean_data_path = 'yelp_review_cleaned.csv'
column_names = ['business_id', 'text']

def read_csv(ds_name, chunk_size):
    
    for chunk in pd.read_csv(ds_name, chunksize=chunk_size):
        return chunk

review_text = read_csv(raw_data_path, chunk_size)

with open(clean_data_path, 'w') as csv_file:
    csv_file = csv.writer(csv_file)
    csv_file.writerow(column_names)
    
    for idx in range(chunk_size):
        cleaned_text = clean_text(review_text['text'][idx])
        business_id = review_text['business_id'][idx].split("'")[1]
        csv_file.writerow([business_id, cleaned_text])

In [37]:
business_data_path = "yelp_academic_dataset_business.csv"
clean_data_path = 'yelp_review_cleaned.csv'

clean_text = pd.read_csv(clean_data_path)
business_info = pd.read_csv(business_data_path)
clean_text.head()

Unnamed: 0,business_id,text
0,buF9druCkbuXLX526sGELQ,bapparently prides osteria had a rough summer ...
1,RA4V8pr014UyUbDvI-LW2A,bthis store is pretty good not as great as wal...
2,_sS2LBIGNT5NQb6PD1Vtjw,bi called wvm on the recommendation of a coupl...
3,0AzLzHfOJgL7ROwhdww2ew,bive stayed at many marriott and renaissance m...
4,8zehGz9jnxPqXtOc7KaJxA,bthe food is always great here the service fro...


In [39]:
from itertools import *

ensemble_data_path = 'ensemble.csv'
column_names = ["city", "state", "text"]

text_business_ids = np.array(clean_text['business_id'])
all_business_ids = np.array(business_info['business_id'])
all_cities = np.array(business_info['city'])
all_states = np.array(business_info['state'])

# write a new csv file with column names: city, state, text
with open(ensemble_data_path, 'w') as csv_file:
    csv_file = csv.writer(csv_file)
    csv_file.writerow(column_names)
    
    for i in islice(count(), 0, len(text_business_ids)-1):
        item_loc = np.where(all_business_ids == text_business_ids[i])[0][0]
        city = all_cities[item_loc]
        state = all_states[item_loc]
        review_text = clean_text['text'][item_loc]
        csv_file.writerow([city, state, review_text])