In [1]:
import os.path
import pandas as pd
from pathlib import Path

def read_data(debug=False):
    '''Load the train/dev CSV files and split them into features and labels.

    ``train.csv`` and ``dev.csv`` are first read relative to the current
    working directory.  If a file is not found there, both files are re-read
    from a fallback folder: the directory of this file (or, when ``__file__``
    is undefined, e.g. inside a notebook, the current working directory) with
    every occurrence of ``'src'`` in that path replaced by ``'data'``.

    Only a missing file triggers the fallback.  Any other failure (empty or
    malformed CSV, permission problems, a keyboard interrupt) propagates to
    the caller instead of being masked by a silent retry elsewhere.

    Parameters
    ----------
    debug : bool, default False
        When true, print progress markers to stdout.

    Returns
    -------
    Four pandas dataframes:
        train_X, val_X, train_y, val_y

    Raises
    ------
    FileNotFoundError
        If the files exist in neither the working directory nor the
        fallback folder.
    '''
    # Alternative locations (commented out):
    #     train = "/content/drive/My Drive/1003 project/train.csv"
    #     val = "/content/drive/My Drive/1003 project/dev.csv"
    train = 'train.csv'
    val = 'dev.csv'

    if debug:
        print("inside read_data")

    try:
        train_df = pd.read_csv(train)
        val_df = pd.read_csv(val)
    except FileNotFoundError:
        try:
            if debug:
                print("try")
            data_folder = Path(os.path.dirname(__file__).replace('src', 'data'))
        except NameError:
            # __file__ is not defined in this execution context; fall back to
            # the current working directory.
            if debug:
                print("except")
            data_folder = Path(os.path.abspath('').replace('src', 'data'))

        if debug:
            print("finally")
        train_df = pd.read_csv(data_folder / "train.csv")
        val_df = pd.read_csv(data_folder / "dev.csv")

    if debug:
        print("past try-except")

    X_col = ['ex_id', 'user_id', 'prod_id', 'rating', 'date', 'review']
    y_col = ['label']

    # DataFrame.filter keeps the listed columns that exist and silently skips
    # any that are absent, so a missing column does not raise here.
    train_X = train_df.filter(X_col, axis='columns')
    val_X = val_df.filter(X_col, axis='columns')
    train_y = train_df.filter(y_col, axis='columns')
    val_y = val_df.filter(y_col, axis='columns')

    if debug:
        print("return")

    return train_X, val_X, train_y, val_y

In [2]:
# Load the train/dev splits: feature frames (train_X, val_X) and
# single-column 'label' frames (train_y, val_y).
train_X, val_X, train_y, val_y = read_data()

In [3]:
# TODO: what is ex_id? Why does it not match the row id? Can it be dropped
# as having no significance?
# Peek at the ex_id stored under index label 0.
train_X.loc[0, 'ex_id']

0

In [4]:
#Functions for cleaning HTML tags and punctuation 
import re
def clean_html(sentence):
    '''Replace every ``<...>`` span in ``sentence`` with a single space.

    The match is non-greedy, so each tag is replaced on its own.  ``.`` does
    not match a newline, so a tag broken across lines is left untouched.

    Parameters
    ----------
    sentence : str
        Text that may contain HTML-style tags.

    Returns
    -------
    str
        The text with each matched tag replaced by ``' '``.
    '''
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, ' ', sentence)
def clean_punc(word):
    r'''Remove or space out punctuation in ``word``.

    Question marks, exclamation marks, apostrophes, hashes and pipes are
    deleted.  Periods, commas, parentheses, backslashes and forward slashes
    are each replaced by a single space, so the caller can split the result
    into separate tokens.

    Parameters
    ----------
    word : str
        A whitespace-delimited token (any string is accepted).

    Returns
    -------
    str
        The cleaned text; it may contain spaces where punctuation was.
    '''
    # Inside [...] a `|` is a literal pipe, not alternation.  The previous
    # pattern used it as a separator and so deleted pipes as a side effect;
    # the pipe is listed explicitly here to keep that behaviour.
    cleaned = re.sub(r"[?!'#|]", '', word)
    # The previous class ended in `\|/`, which escapes the pipe rather than
    # matching a backslash, so backslashes were never replaced.  `\\` matches
    # a literal backslash.
    cleaned = re.sub(r'[.,()\\/]', ' ', cleaned)
    return cleaned

In [5]:
# Set up the NLTK resources used for text cleaning: the English stopword list
# and a Snowball stemmer.
# NOTE(review): `string` and `WordNetLemmatizer` are imported but not used in
# the cells shown here.
import string
import nltk
# Fetch the stopwords corpus so stopwords.words() below can read it; the log
# output reports when the package is already up to date.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
stop = stopwords.words('english') # list of stop words (entries are lowercase, see printed output)
sno = SnowballStemmer('english') # object with methods to get root words
# Sanity check: show the stopword list and stem one sample word.
print (stop)
print('***************************************')
print(sno.stem('tastful'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/isabelzhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Build a cleaned, stemmed version of every training review.
# Per review: strip HTML tags, split on whitespace, strip punctuation, keep
# purely alphabetic tokens longer than two characters that are not stopwords,
# then stem the lowercased token.
stop_words = set(stop)  # set membership instead of scanning the list for every token
final_string = []
for sentence in train_X['review'].values:
    filtered_sentence = []
    for word in clean_html(sentence).split():
        for cleaned_word in clean_punc(word).split():
            lowered = cleaned_word.lower()
            # Compare the lowercased token: the stopword list is lowercase, so
            # testing the raw token let capitalised stopwords ("The") through.
            if cleaned_word.isalpha() and len(cleaned_word) > 2 and lowered not in stop_words:
                # Stems are stored UTF-8 encoded, so the column holds bytes.
                # NOTE(review): confirm downstream code expects bytes, not str.
                filtered_sentence.append(sno.stem(lowered).encode('utf8'))
    final_string.append(b' '.join(filtered_sentence))

train_X['cleaned review'] = final_string

In [14]:
# Build a cleaned, stemmed version of every validation review.
# Per review: strip HTML tags, split on whitespace, strip punctuation, keep
# purely alphabetic tokens longer than two characters that are not stopwords,
# then stem the lowercased token.
stop_words = set(stop)  # set membership instead of scanning the list for every token
val_final_string = []
for sentence in val_X['review'].values:
    filtered_sentence = []
    for word in clean_html(sentence).split():
        for cleaned_word in clean_punc(word).split():
            lowered = cleaned_word.lower()
            # Compare the lowercased token: the stopword list is lowercase, so
            # testing the raw token let capitalised stopwords ("The") through.
            if cleaned_word.isalpha() and len(cleaned_word) > 2 and lowered not in stop_words:
                # Stems are stored UTF-8 encoded, so the column holds bytes.
                # NOTE(review): confirm downstream code expects bytes, not str.
                filtered_sentence.append(sno.stem(lowered).encode('utf8'))
    val_final_string.append(b' '.join(filtered_sentence))

val_X['cleaned review'] = val_final_string

In [15]:
# # save results
# train_X.to_csv('train_X.csv', index=False)
# train_y.to_csv('train_y.csv', index=False)
# val_X.to_csv('val_X.csv', index=False)
# val_y.to_csv('val_y.csv', index=False)