In [1]:
import pandas as pd
import nltk

## Load the CSV

In [2]:
def load_data(path):
    '''
    Read a CSV file into a pandas data frame.

    path: location of the csv (str), handed unchanged to pd.read_csv
    Returns the pandas data frame produced by pd.read_csv
    '''
    return pd.read_csv(path)

In [3]:
# load the reviews csv into the working data frame
reviews_csv = "./Musical_instruments_reviews.csv"
df = load_data(reviews_csv)
# visually check the dataframe looks like the csv
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [4]:
# verify the summary column is present and holds only Python strings
assert "summary" in df.columns, "expected a 'summary' column in the loaded data"
# pandas only reports the generic object dtype here, so the dtype alone
# does not prove that every value is a str
assert df["summary"].dtype == object, "unexpected dtype for 'summary': %s" % df["summary"].dtype
# check every value in one pass over the column instead of materialising
# a full row with df.iloc[i] for each row
is_str = df["summary"].map(lambda value: isinstance(value, str)).astype(bool)
assert is_str.all(), "%d value(s) in 'summary' are not str" % int((~is_str).sum())

In [5]:
# display the first rows of the data frame again after the checks above
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


## Perform Tokenization

In [6]:
def token(colName,df):
    '''
    Tokenize one column of a pandas data frame.

    colName: name (string) of the column to tokenize
    df: pandas data frame that contains the column
    Returns a series in which each value of the column has been
    replaced by the result of nltk.word_tokenize
    '''
    # request the 'punkt' resource from nltk before tokenizing
    nltk.download('punkt')
    # df[colName] is a series; word_tokenize is applied to each of its values
    return df[colName].apply(nltk.word_tokenize)

In [7]:
# tokenize the "summary" column; the result is a series of token lists
summary_token = token("summary",df) # tokenize the column
# show the first 10 rows to eyeball the tokenization
summary_token.head(10) # visually check the data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0                                               [good]
1                                               [Jake]
2                           [It, Does, The, Job, Well]
3                  [GOOD, WINDSCREEN, FOR, THE, MONEY]
4     [No, more, pops, when, I, record, my, vocals, .]
5                                   [The, Best, Cable]
6    [Monster, Standard, 100, -, 21, ', Instrument,...
7        [Did, n't, fit, my, 1996, Fender, Strat, ...]
8                                       [Great, cable]
9          [Best, Instrument, Cables, On, The, Market]
Name: summary, dtype: object

In [8]:
# verify the column was transformed
summary_col = df["summary"]
assert summary_token[0] != summary_col[0] # verify theres a change
assert summary_token[20] != summary_col[20] # verify the cols differ
# row 3 ("GOOD WINDSCREEN FOR THE MONEY") has no punctuation, so a plain
# whitespace split gives the expected tokens
assert summary_token[3] == summary_col[3].split() # verify the tokenization performed as expected
# iterate over the values directly rather than looking each row up by an
# integer label, which would only work with the default 0..n-1 index
assert all(isinstance(tokens, list) for tokens in summary_token), "every tokenized summary should be a list"

## Perform Stemming


In [9]:
def stemming_col(token_col):
    '''
    Given a tokenized series, returns a series that is stemmed

    token_col: series (or other iterable) whose items are lists of tokens
    Returns a series holding, for each row, the list of stems produced by
    nltk's English Snowball stemmer (one stem per token, same order).
    When token_col is a pandas series the result carries the same index,
    so it lines up row by row when assigned back to the data frame.
    '''
    stemmer = nltk.stem.snowball.SnowballStemmer("english")

    # stem each token/word, keeping one output list per input row
    stemmed_rows = [[stemmer.stem(word) for word in words] for words in token_col]

    # reuse the input's index instead of a fresh 0..n-1 one; plain iterables
    # have no index, so they fall back to the default
    row_index = token_col.index if isinstance(token_col, pd.Series) else None
    return pd.Series(stemmed_rows, index=row_index)

In [10]:
# stem every token of the tokenized summaries
summary_stem = stemming_col(summary_token)
# show the first 10 rows to eyeball the stems
summary_stem.head(10) # visually check the data

0                                               [good]
1                                               [jake]
2                            [it, doe, the, job, well]
3                  [good, windscreen, for, the, money]
4       [no, more, pop, when, i, record, my, vocal, .]
5                                    [the, best, cabl]
6    [monster, standard, 100, -, 21, ', instrument,...
7        [did, n't, fit, my, 1996, fender, strat, ...]
8                                        [great, cabl]
9            [best, instrument, cabl, on, the, market]
dtype: object

In [11]:
# spot-check the stemmed values
expected_stems = ['best', 'instrument', 'cabl', 'on', 'the', 'market']
assert summary_stem[9] == expected_stems # "Cables" should come out as "cabl"
assert len(summary_token[4]) == len(summary_stem[4]) # stemming must not drop any token

## Perform Lemmatization

In [12]:
def lemmatize_col(token_col):
    '''
    Given a tokenized series, returns a series
    with lemmatization performed

    token_col: series (or other iterable) whose items are lists of tokens
    Returns a series holding, for each row, the list produced by passing
    every token through nltk's WordNetLemmatizer (one output per token,
    same order). The lemmatizer is called without a part-of-speech
    argument, so nltk's default applies.
    When token_col is a pandas series the result carries the same index,
    so it lines up row by row when assigned back to the data frame.
    '''
    # request the 'wordnet' resource from nltk before lemmatizing
    nltk.download('wordnet')
    lem = nltk.stem.WordNetLemmatizer()

    # lemmatize each token/word, keeping one output list per input row
    lemmatized_rows = [[lem.lemmatize(word) for word in words] for words in token_col]

    # reuse the input's index instead of a fresh 0..n-1 one; plain iterables
    # have no index, so they fall back to the default
    row_index = token_col.index if isinstance(token_col, pd.Series) else None
    return pd.Series(lemmatized_rows, index=row_index)

In [13]:
# lemmatize every token of the tokenized summaries
summary_lem = lemmatize_col(summary_token)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# show the first rows of the lemmatized series
summary_lem.head() # visually check the data

0                                            [good]
1                                            [Jake]
2                        [It, Does, The, Job, Well]
3               [GOOD, WINDSCREEN, FOR, THE, MONEY]
4    [No, more, pop, when, I, record, my, vocal, .]
dtype: object

In [15]:
# spot-check the lemmatized values
expected_lemmas = ['No', 'more', 'pop', 'when', 'I', 'record', 'my', 'vocal', '.']
assert summary_lem[4] == expected_lemmas # "pops" should come out as "pop"
assert len(summary_token[4]) == len(summary_lem[4]) # lemmatization must not drop any token

## Part of Speech Tagging

In [16]:
def pst(token_col):
    '''
    Given a tokenized series,
    returns a series with part of speech tagging

    token_col: series (or other iterable) whose items are lists of tokens
    Returns a series holding, for each row, the result of nltk.pos_tag
    on that row's token list.
    When token_col is a pandas series the result carries the same index,
    so it lines up row by row when assigned back to the data frame.
    '''
    # request the tagger resource from nltk before tagging
    nltk.download('averaged_perceptron_tagger')

    # tag one whole row of tokens at a time
    tagged_rows = [nltk.pos_tag(words) for words in token_col]

    # reuse the input's index instead of a fresh 0..n-1 one; plain iterables
    # have no index, so they fall back to the default
    row_index = token_col.index if isinstance(token_col, pd.Series) else None
    return pd.Series(tagged_rows, index=row_index)

In [17]:
# part-of-speech tag the tokenized summaries
summary_pst = pst(summary_token)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [18]:
# show the first 10 rows of the tagged series
summary_pst.head(10) #visualize the data

0                                         [(good, JJ)]
1                                         [(Jake, NN)]
2    [(It, PRP), (Does, VBZ), (The, DT), (Job, NNP)...
3    [(GOOD, JJ), (WINDSCREEN, NNP), (FOR, NNP), (T...
4    [(No, DT), (more, RBR), (pops, NNS), (when, WR...
5               [(The, DT), (Best, NNP), (Cable, NNP)]
6    [(Monster, NNP), (Standard, NNP), (100, CD), (...
7    [(Did, NNP), (n't, RB), (fit, VB), (my, PRP$),...
8                          [(Great, NNP), (cable, NN)]
9    [(Best, RBS), (Instrument, NNP), (Cables, NNP)...
dtype: object

## Join All Transforms into DF

In [19]:
def add_cols(df, cols, names):
    '''
    Given a pandas data frame (df), list of series (cols)
    and a list of column names (names)
    Updates the data frame to have all the new columns from the series

    The data frame is modified in place and nothing is returned.
    cols[i] is stored under names[i], so both lists must have the same
    length.
    Raises ValueError, before touching df, when the lengths differ.
    '''
    cols = list(cols)
    names = list(names)
    # validate first so a mismatch never leaves df with only some of the
    # columns added, and surplus names are never silently ignored
    if len(cols) != len(names):
        raise ValueError(
            "cols and names must have the same length (got %d and %d)"
            % (len(cols), len(names))
        )
    for name, col in zip(names, cols):
        df[name] = col

In [20]:
# pair each new column name with the series that fills it
new_columns = {
    "summary_tokens": summary_token,
    "summary_stemming": summary_stem,
    "summary_lemmatization": summary_lem,
    "summary_pst": summary_pst,
}
series = list(new_columns.values())
names = list(new_columns.keys())
add_cols(df, series, names)
# visualize the data
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,summary_tokens,summary_stemming,summary_lemmatization,summary_pst
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014",[good],[good],[good],"[(good, JJ)]"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013",[Jake],[jake],[Jake],"[(Jake, NN)]"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013","[It, Does, The, Job, Well]","[it, doe, the, job, well]","[It, Does, The, Job, Well]","[(It, PRP), (Does, VBZ), (The, DT), (Job, NNP)..."
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014","[GOOD, WINDSCREEN, FOR, THE, MONEY]","[good, windscreen, for, the, money]","[GOOD, WINDSCREEN, FOR, THE, MONEY]","[(GOOD, JJ), (WINDSCREEN, NNP), (FOR, NNP), (T..."
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014","[No, more, pops, when, I, record, my, vocals, .]","[no, more, pop, when, i, record, my, vocal, .]","[No, more, pop, when, I, record, my, vocal, .]","[(No, DT), (more, RBR), (pops, NNS), (when, WR..."
