In [1]:
import pandas as pd
import matplotlib as plt


In [52]:
df = pd.read_csv("data/car-reviews.csv")
df.head()
df["Sentiment"].unique()

array(['Neg', 'Pos'], dtype=object)

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrejwork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andrejwork/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def tokenize(text):
    from nltk.tokenize import word_tokenize
    return word_tokenize(text)  
    
def remove_stop_words(word_list):
    from nltk.corpus import stopwords     
    stop_words = set(stopwords.words('english'))   
    filtered = [w for w in word_list if not w in stop_words]  
    return filtered

def lower_case(word_list):
    return [w.lower() for w in word_list]

def stem_words(list_of_words):
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    stemmed = [ps.stem(w) for w in list_of_words]
    return stemmed    

In [5]:
# Test clean up on one review
tokenized = tokenize(df["Review"][0])
clean = remove_stop_words(tokenized)
print(clean)
print()
lowered = lower_case(clean)
stemmed = stem_words(lowered)
print(stemmed)

['In', '1992', 'bought', 'new', 'Taurus', 'really', 'loved', 'So', '1999', 'decided', 'try', 'new', 'Taurus', 'I', 'care', 'style', 'newer', 'version', 'bought', 'anyway', 'I', 'like', 'new', 'car', 'half', 'much', 'liked', 'one', 'Thee', 'dash', 'much', 'deep', 'takes', 'lot', 'room', 'I', 'find', 'seats', 'comfortable', 'way', 'sides', 'stick', 'strip', 'protect', 'card', 'denting', 'It', 'drives', 'nice', 'good', 'pick', 'But', 'see', 'hood', 'driver', 'seat', 'judging', 'parking', 'difficult', 'It', 'small', 'gas', 'tank', 'I', 'would', 'buy', 'Taurus', 'I', 'I', 'would', 'rather', '1992', 'back', 'I', 'dont', 'think', 'style', 'nice', '1992', 'mistake', 'change', 'style', 'In', 'less', 'month', 'dead', 'battery', 'flat', 'tire']

['in', '1992', 'bought', 'new', 'tauru', 'realli', 'love', 'so', '1999', 'decid', 'tri', 'new', 'tauru', 'i', 'care', 'style', 'newer', 'version', 'bought', 'anyway', 'i', 'like', 'new', 'car', 'half', 'much', 'like', 'one', 'thee', 'dash', 'much', 'deep'

In [45]:
# Function that generates bag of words and turns list of
def generate_bow(list_tokenized_reviews):
    import numpy as np
    
    bow = {}
    for r in list_tokenized_reviews:
        for w in r:
            if w in bow:
                bow[w] += 1
            else:
                bow[w] = 1
    
    list_features = []
    for review in list_tokenized_reviews:
        vector = np.zeros(len(bow))
        for index, word in enumerate(bow):
            if word in review:
                vector[index] += 1
        list_features.append(vector)
    return list_features, bow

# Function that generates bag of words and turns list of
def generate_bow_faster(list_tokenized_reviews):
    # This is 6.5x faster than the previous version
    from collections import OrderedDict
    
    bow = {}
    for r in list_tokenized_reviews:
        for w in r:
            if w in bow:
                bow[w] += 1
            else:
                bow[w] = 1
    #print("BOW ",len(bow))
    bow_ordered = OrderedDict(sorted(bow.items(), key=lambda t: t[0]))
    inx = 0
    for key in bow_ordered:
        bow_ordered[key] = (inx, bow_ordered[key])
        inx += 1
    #print("BOW ORDERED ",len(bow_ordered))
    #print(bow_ordered)
    list_features = []
    for review in list_tokenized_reviews:
        vector = [0] * len(bow)
        for i in range(len(review)):
            inx = bow_ordered[review[i]][0]
            #print("INX ",inx)
            vector[inx] += 1
        list_features.append(vector)
    return list_features, bow_ordered

In [50]:
# test bag of words
test_list = [['drive', 'nice', 'good', 'pick'], ['mistak', 'chang', 'style', 'nice', 'good', 'pick']]
features, bow = generate_bow(test_list)
print(features)
print(bow)

[array([1., 1., 1., 1., 0., 0., 0.]), array([0., 1., 1., 1., 1., 1., 1.])]
{'drive': 1, 'nice': 2, 'good': 2, 'pick': 2, 'mistak': 1, 'chang': 1, 'style': 1}


In [51]:
# Convert the dataframe into a list of features
import time

t1 = time.time()
trans = [remove_stop_words, lower_case, stem_words]
features = []
for r in df["Review"].to_list():
    v = tokenize(r)
    c = remove_stop_words(v)
    l = lower_case(c)
    s = stem_words(l)
    features.append(s)

features = generate_bow_faster(features)   
t2 = time.time()
print("Done in {:0.2f}sec".format(t2-t1))
print(features)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

