In [2]:
import pandas as pd
import numpy as np
import string

In [3]:
# Load the dataset from csv to pandas
dataDir = '/Users/sachinjain/personal/hdrive/data/courses/coursera-ml-foundations/amazon_product_reviews/'
dataPath = dataDir + 'amazon_baby.csv'
trainDataIndexFile = dataDir + 'train-idx.json'
testDataIndexFile = dataDir + 'test-idx.json'


products = pd.read_csv(dataPath)

products.count()

name      183213
review    182702
rating    183531
dtype: int64

In [4]:
def remove_punctuation(text):
    # This uses the 3-argument version of str.maketrans
    # with arguments (x, y, z) where 'x' and 'y'
    # must be equal-length strings and characters in 'x'
    # are replaced by characters in 'y'. 'z'
    # is a string (string.punctuation here)
    # where each character in the string is mapped
    # to None
    translator = str.maketrans('', '', string.punctuation)
    
    if type(text) is str:
        return text.translate(translator)
    else:
        return text

In [5]:
remove_punctuation(2)
remove_punctuation('sachin.jain@gmail.com')

'sachinjaingmailcom'

In [6]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [7]:
# Add a new column to products data by removing punctuation in the reviews
products['review_clean'] = products['review'].apply(remove_punctuation)

### Fill all N/A values with empty strings

In [37]:
products = products.fillna({'review_clean':''})
products = products.fillna({'review':''})

### We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment.

In [9]:
products = products[products['rating'] != 3]

In [10]:
products.count()

name            166456
review          166752
rating          166752
review_clean    165975
dtype: int64

In [11]:
products.head()

Unnamed: 0,name,review,rating,review_clean
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...


### Convert 4,5 rating to +1 label and 1,2 rating to -1 label

In [12]:
products['sentiment'] = products['rating'].apply(lambda rating: +1 if rating > 3 else -1)

In [13]:
products.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


### Split the data into train_data and test_data

In [26]:
def get_indices(filePath):
    with open(filePath, 'r') as fp:
        indices_as_strings = fp.read()[1:-1].split(', ')

    indices_as_numbers = [ int(x) for x in indices_as_strings ]
    
    return indices_as_numbers

In [27]:
train_indices = get_indices(trainDataIndexFile)
test_indices = get_indices(testDataIndexFile)

In [38]:
train_data = products.take(train_indices, is_copy=True)
test_data = products.take(test_indices, is_copy=True)

In [39]:
print(products.info())
print(train_data.info())
print(test_data.info())

train_data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 166752 entries, 1 to 183530
Data columns (total 5 columns):
name            166456 non-null object
review          166752 non-null object
rating          166752 non-null int64
review_clean    166752 non-null object
sentiment       166752 non-null int64
dtypes: int64(2), object(3)
memory usage: 7.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 133416 entries, 1 to 183529
Data columns (total 5 columns):
name            133174 non-null object
review          133416 non-null object
rating          133416 non-null int64
review_clean    133416 non-null object
sentiment       133416 non-null int64
dtypes: int64(2), object(3)
memory usage: 6.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 33336 entries, 9 to 183530
Data columns (total 5 columns):
name            33282 non-null object
review          33336 non-null object
rating          33336 non-null int64
review_clean    33336 non-null object
sentiment       33336 no

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


### Build sparse matrix of word count vectors from train data and test data

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

### 7. Learn a logistic regression classifier using the training data. If you are using scikit-learn, you should create an instance of the LogisticRegression class and then call the method fit() to train the classifier. This model should use the sparse word count matrix (train_matrix) as features and the column sentiment of train_data as the target. Use the default values for other parameters. Call this model sentiment_model.

In [50]:
from sklearn.linear_model import LogisticRegression

In [51]:
sentiment_model = LogisticRegression()

In [52]:
sentiment_model.fit(train_matrix, train_data.sentiment)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### 8. There should be over 100,000 coefficients in this sentiment_model. Recall from the lecture that positive weights w_j correspond to weights that cause positive sentiment, while negative weights correspond to negative sentiment. Calculate the number of positive (>= 0, which is actually nonnegative) coefficients.

In [128]:
coefficients = sentiment_model.coef_
coefficients = coefficients.tolist()[0]

positive_coefficients = 0
negative_coefficients = 0
for c in coefficients:
    if c > 0:
        positive_coefficients = positive_coefficients + 1
        
positive_coefficients

87243

## Quiz Answer: Number of positive coefficients = 87243