In [1]:
import os.path
import pandas as pd
from pathlib import Path

def read_data(debug=False):
    '''Helper procedure to load dataset.
    
    Returns
    -------
    Four pandas dataframes:
        train_X, val_X, train_y, val_y
    '''
#     train = "/content/drive/My Drive/1003 project/train.csv"
#     val = "/content/drive/My Drive/1003 project/dev.csv"
    train = 'train.csv'
    val = 'dev.csv'
    
    if debug:
        print("inside read_data")
    
    try:
        train_df = pd.read_csv(train)
        val_df = pd.read_csv(val)
    except:
        try:
            if debug:
                print("try")
            data_folder = Path(os.path.dirname(__file__).replace('src', 'data'))
        except:
            if debug:
                print("except")
            data_folder = Path(os.path.abspath('').replace('src', 'data'))
        finally:
            if debug: 
                print("finally")
            train = data_folder / "train.csv"
            val = data_folder / "dev.csv"
            train_df = pd.read_csv(train)
            val_df = pd.read_csv(val)
    
    if debug:
        print("past try-except")
    
    X_col = ['ex_id', 'user_id', 'prod_id', 'rating', 'date', 'review']
    y_col = ['label']
    
    train_X = train_df.filter(X_col, axis='columns')
    val_X = val_df.filter(X_col, axis='columns')
    train_y = train_df.filter(y_col, axis='columns')
    val_y = val_df.filter(y_col, axis='columns')
    
    if debug:
        print("return")
    
    return train_X, val_X, train_y, val_y

In [2]:
# load the data into dataframes
train_X, val_X, train_y, val_y = read_data()

In [2]:
test_X = pd.read_csv('test_no_label.csv')

In [4]:
test_X.head()

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,6,929,0,4.0,,2009-08-25,Let me start with a shout-out to everyone who ...
1,9,932,0,5.0,,2014-05-09,Stopped in for lunch today and couldn't believ...
2,14,937,0,4.0,,2014-10-15,"Tiny little place, but very good food. Pastits..."
3,22,945,0,5.0,,2014-04-10,Food was delicious and service was great. Good...
4,23,946,0,5.0,,2014-03-29,Awesome hole in the wall place to grab a quick...


In [5]:
# what is ex_id? why does it not match row id? can it be removed due to no significance?
train_X['ex_id'][35]

51

In [5]:
#Functions for cleaning HTML tags and punctuation 
import re
def clean_html(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def clean_punc(word):
    cleaned = re.sub(r'[?|!|\'|#]', r'', word)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r' ', cleaned)
    return cleaned

In [6]:
#Getting Stopwords 
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
stop = stopwords.words('english') # list of stop words
sno = SnowballStemmer('english') # object with methods to get root words
print (stop)
print('***************************************')
print(sno.stem('tastful'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/w849277/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
final_string = []
s = ''
for sentence in train_X['review'].values:
    filtered_sentence = []
    sentence = clean_html(sentence)
    for word in sentence.split():
        for cleaned_word in clean_punc(word).split():
            if (cleaned_word.isalpha() and (len(cleaned_word) > 2) and cleaned_word not in stop):
                s = (sno.stem(cleaned_word.lower())).encode('utf8')
                filtered_sentence.append(s)
            else:
                continue
                 
    strl = b' '.join(filtered_sentence)
    final_string.append(strl)

train_X['cleaned review'] = final_string

In [18]:
val_final_string = []
s = ''
for sentence in val_X['review'].values:
    filtered_sentence = []
    sentence = clean_html(sentence)
    for word in sentence.split():
        for cleaned_word in clean_punc(word).split():
            if (cleaned_word.isalpha() and (len(cleaned_word) > 2) and cleaned_word not in stop):
                s = (sno.stem(cleaned_word.lower())).encode('utf8')
                filtered_sentence.append(s)
            else:
                continue
                 
    strl = b' '.join(filtered_sentence)
    val_final_string.append(strl)

val_X['cleaned review'] = val_final_string

In [7]:
test_final_string = []
s = ''
for sentence in test_X['review'].values:
    filtered_sentence = []
    sentence = clean_html(sentence)
    for word in sentence.split():
        for cleaned_word in clean_punc(word).split():
            if (cleaned_word.isalpha() and (len(cleaned_word) > 2) and cleaned_word not in stop):
                s = (sno.stem(cleaned_word.lower())).encode('utf8')
                filtered_sentence.append(s)
            else:
                continue
                 
    strl = b' '.join(filtered_sentence)
    test_final_string.append(strl)

test_X['cleaned review'] = test_final_string

In [19]:
train_X.head()

Unnamed: 0,ex_id,user_id,prod_id,rating,date,review,cleaned review
0,0,923,0,3.0,2014-12-08,The food at snack is a selection of popular Gr...,b'the food snack select popular greek dish the...
1,1,924,0,3.0,2013-05-16,This little place in Soho is wonderful. I had ...,b'this littl place soho wonder lamb sandwich g...
2,2,925,0,4.0,2013-07-01,ordered lunch for 15 from Snack last Friday. ...,b'order lunch snack last friday time noth miss...
3,3,926,0,4.0,2011-07-28,This is a beautiful quaint little restaurant o...,b'this beauti quaint littl restaur pretti stre...
4,4,927,0,4.0,2010-11-01,Snack is great place for a casual sit down lu...,b'snack great place casual sit especi cold win...


In [8]:
test_X.head()

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review,cleaned review
0,6,929,0,4.0,,2009-08-25,Let me start with a shout-out to everyone who ...,b'let start everyon boost mint lemonad that st...
1,9,932,0,5.0,,2014-05-09,Stopped in for lunch today and couldn't believ...,b'stop lunch today couldnt believ delici every...
2,14,937,0,4.0,,2014-10-15,"Tiny little place, but very good food. Pastits...",b'tini littl place good food pastitsio especi ...
3,22,945,0,5.0,,2014-04-10,Food was delicious and service was great. Good...,b'food delici servic great good atmospher quic...
4,23,946,0,5.0,,2014-03-29,Awesome hole in the wall place to grab a quick...,b'awesom hole wall place grab quick bite great...


In [23]:
val_X['cleaned review'][0]

b'around good place cozi came didnt huge appetit stuck appet friend combo platter full'

In [11]:
# save results
train_X.to_csv('train_X.csv', index=False)
train_y.to_csv('train_y.csv', index=False)
val_X.to_csv('val_X.csv', index=False)
val_y.to_csv('val_y.csv', index=False)

In [9]:
test_X.to_csv('test_X.csv', index=False)