In [4]:
import pandas as pd
import numpy as np
import string

In [5]:
# Load the dataset from csv to pandas
dataDir = '/Users/sachinjain/personal/hdrive/data/courses/coursera-ml-foundations/amazon_product_reviews/'
dataPath = dataDir + 'amazon_baby.csv'
trainDataIndexFile = dataDir + 'train-idx.json'
testDataIndexFile = dataDir + 'test-idx.json'


products = pd.read_csv(dataPath)

products.count()

name      183213
review    182702
rating    183531
dtype: int64

In [6]:
def remove_punctuation(text):
    # This uses the 3-argument version of str.maketrans
    # with arguments (x, y, z) where 'x' and 'y'
    # must be equal-length strings and characters in 'x'
    # are replaced by characters in 'y'. 'z'
    # is a string (string.punctuation here)
    # where each character in the string is mapped
    # to None
    translator = str.maketrans('', '', string.punctuation)
    
    if type(text) is str:
        return text.translate(translator)
    else:
        return text

In [7]:
remove_punctuation(2)
remove_punctuation('sachin.jain@gmail.com')

'sachinjaingmailcom'

In [8]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [9]:
# Add a new column to products data by removing punctuation in the reviews
products['review_clean'] = products['review'].apply(remove_punctuation)

### Fill all N/A values with empty strings

In [10]:
products = products.fillna({'review_clean':''})
products = products.fillna({'review':''})

### We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment.

In [11]:
products = products[products['rating'] != 3]

In [12]:
products.count()

name            166456
review          166752
rating          166752
review_clean    166752
dtype: int64

In [13]:
products.head()

Unnamed: 0,name,review,rating,review_clean
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...


### Convert 4,5 rating to +1 label and 1,2 rating to -1 label

In [14]:
products['sentiment'] = products['rating'].apply(lambda rating: +1 if rating > 3 else -1)

In [15]:
products.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


### Split the data into train_data and test_data

In [16]:
def get_indices(filePath):
    with open(filePath, 'r') as fp:
        indices_as_strings = fp.read()[1:-1].split(', ')

    indices_as_numbers = [ int(x) for x in indices_as_strings ]
    
    return indices_as_numbers

In [17]:
train_indices = get_indices(trainDataIndexFile)
test_indices = get_indices(testDataIndexFile)

In [18]:
train_data = products.take(train_indices, is_copy=True)
test_data = products.take(test_indices, is_copy=True)

In [19]:
print(products.info())
print(train_data.info())
print(test_data.info())

train_data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 166752 entries, 1 to 183530
Data columns (total 5 columns):
name            166456 non-null object
review          166752 non-null object
rating          166752 non-null int64
review_clean    166752 non-null object
sentiment       166752 non-null int64
dtypes: int64(2), object(3)
memory usage: 7.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 133416 entries, 1 to 183529
Data columns (total 5 columns):
name            133174 non-null object
review          133416 non-null object
rating          133416 non-null int64
review_clean    133416 non-null object
sentiment       133416 non-null int64
dtypes: int64(2), object(3)
memory usage: 6.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 33336 entries, 9 to 183530
Data columns (total 5 columns):
name            33282 non-null object
review          33336 non-null object
rating          33336 non-null int64
review_clean    33336 non-null object
sentiment       33336 no

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


### Build sparse matrix of word count vectors from train data and test data

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

### 7. Learn a logistic regression classifier using the training data. If you are using scikit-learn, you should create an instance of the LogisticRegression class and then call the method fit() to train the classifier. This model should use the sparse word count matrix (train_matrix) as features and the column sentiment of train_data as the target. Use the default values for other parameters. Call this model sentiment_model.

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
sentiment_model = LogisticRegression()

In [23]:
sentiment_model.fit(train_matrix, train_data.sentiment)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### 8. There should be over 100,000 coefficients in this sentiment_model. Recall from the lecture that positive weights w_j correspond to weights that cause positive sentiment, while negative weights correspond to negative sentiment. Calculate the number of positive (>= 0, which is actually nonnegative) coefficients.

In [24]:
coefficients = sentiment_model.coef_
coefficients = coefficients.tolist()[0]

positive_coefficients = 0
negative_coefficients = 0
for c in coefficients:
    if c > 0:
        positive_coefficients = positive_coefficients + 1
    else:
        negative_coefficients = negative_coefficients + 1
        
print (sentiment_model.coef_.size)
print (positive_coefficients)
print (negative_coefficients)

121712
87243
34469


## Quiz Answer: Number of positive coefficients = 87243

In [25]:
sample_test_data = test_data[10:13]
sample_test_data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


### Python Pandas Tip: We use iloc in pandas dataframe to get the row at particular index 

In [26]:
print(sample_test_data.iloc[0]['review'])
print("\n")
print(sample_test_data.iloc[1]['review'])

Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.


Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.


###  10. We will now make a class prediction for the sample_test_data. The sentiment_model should predict +1 if the sentiment is positive and -1 if the sentiment is negative. Recall from the lecture that the score (sometimes called margin) for the logistic regression model is defined as:

> scorei=w⊺h(xi)

> where h(xi) represents the features for data point i. We will write some code to obtain the scores. For each row, the score (or margin) is a number in the range (-inf, inf). Use a pre-built function in your tool to calculate the score of each data point in sample_test_data. In scikit-learn, you can call the decision_function() function.

In [27]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
sample_test_data['score'] = sentiment_model.decision_function(sample_test_matrix)
sample_test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,name,review,rating,review_clean,sentiment,score
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1,5.601538
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1,-3.170456
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1,-10.42328


### 11. These scores can be used to make class predictions as follows:

> y^i={+1 if w⊺h(xi)>0 and -1 if w⊺h(xi)≤0 }

Using scores, write code to calculate predicted labels for sample_test_data.

Checkpoint: Make sure your class predictions match with the ones obtained from sentiment_model. The logistic regression classifier in scikit-learn comes with the predict function for this purpose.

In [28]:
sample_test_data['predicted_label'] = sample_test_data['score'].apply(lambda score: +1 if score > 0 else -1 )
sample_test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,name,review,rating,review_clean,sentiment,score,predicted_label
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1,5.601538,1
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1,-3.170456,-1
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1,-10.42328,-1


### 12. Recall from the lectures that we can also calculate the probability predictions from the scores using:

> P(yi=+1|xi,w)=1/(1+exp(−w⊺h(xi))

Using the scores calculated previously, write code to calculate the probability that a sentiment is positive using the above formula. For each row, the probabilities should be a number in the range [0, 1].

Checkpoint: Make sure your probability predictions match the ones obtained from sentiment_model.

In [29]:
import math
sample_test_data['prob'] = sample_test_data['score'].apply(lambda score: 1/(1 + math.exp(-score)))
sample_test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,name,review,rating,review_clean,sentiment,score,predicted_label,prob
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1,5.601538,1,0.996321
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1,-3.170456,-1,0.040293
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1,-10.42328,-1,3e-05


## Quiz question: Of the three data points in sample_test_data, which one (first, second, or third) has the lowest probability of being classified as a positive review?

Answer: Third

### 13. We now turn to examining the full test dataset, test_data, and use sklearn.linear_model.LogisticRegression to form predictions on all of the test data points.

Using the sentiment_model, find the 20 reviews in the entire test_data with the highest probability of being classified as a positive review. We refer to these as the "most positive reviews."

To calculate these top-20 reviews, use the following steps:

1. Make probability predictions on test_data using the sentiment_model.
2. Sort the data according to those predictions and pick the top 20.

In [30]:
test_data['probs'] = sentiment_model.predict_proba(test_matrix)[:,1]
test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,name,review,rating,review_clean,sentiment,probs
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1,0.784503
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1,0.999999
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1,0.933192
20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1,0.999979
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1,0.980231


In [31]:
test_data.sort_values('probs', ascending=False)

Unnamed: 0,name,review,rating,review_clean,sentiment,probs
180646,Mamas &amp; Papas 2014 Urbo2 Stroller - Black,After much research I purchased an Urbo2. It's...,4,After much research I purchased an Urbo2 Its e...,1,1.000000e+00
80155,"Simple Wishes Hands-Free Breastpump Bra, Pink,...","I just tried this hands free breastpump bra, a...",5,I just tried this hands free breastpump bra an...,1,1.000000e+00
100166,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1,1.000000e+00
66059,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,Its always fun to write a review on those prod...,1,1.000000e+00
119182,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram RoccoI bought this pram from Europe...,1,1.000000e+00
168081,Buttons Cloth Diaper Cover - One Size - 8 Colo...,"We are big Best Bottoms fans here, but I wante...",4,We are big Best Bottoms fans here but I wanted...,1,1.000000e+00
87017,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1,1.000000e+00
140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1,1.000000e+00
114796,"Fisher-Price Cradle 'N Swing, My Little Snuga...",My husband and I cannot state enough how much ...,5,My husband and I cannot state enough how much ...,1,1.000000e+00
137034,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1,1.000000e+00


## Quiz Question: Which of the following products are represented in the 20 most positive reviews?

Answer: Mamas and Papas , Simple Wishes Hands Free, Infantino etc etc.

### 14. Now, let us repeat this exercise to find the "most negative reviews." Use the prediction probabilities to find the 20 reviews in the test_data with the lowest probability of being classified as a positive review. Repeat the same steps above but make sure you sort in the opposite order.

In [32]:
test_data.sort_values('probs')

Unnamed: 0,name,review,rating,review_clean,sentiment,probs
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1,8.438831e-16
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,1.603194e-15
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1,8.166342e-14
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,9.849656e-14
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1,1.925660e-13
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1,3.327760e-13
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,3.274747e-11
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,3.321734e-11
10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1,9.477431e-11
113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1,9.613956e-11


Answer: FIsher price ocean wodners, levana baby monitor etc.

## Compute accuracy of the classifier

### 16. There were a lot of words in the model we trained above. We will now train a simpler logistic regression model using only a subet of words that occur in the reviews. For this assignment, we selected 20 words to work with. These are:

significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 
  'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']


In [33]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 'work', 'product', 'money', 'would', 'return']

### Compute a new set of word count vectors using only these words. The CountVectorizer class has a parameter that lets you limit the choice of words when building word count vectors:

### Compute word count vectors for the training and test data and obtain the sparse matrices train_matrix_word_subset and test_matrix_word_subset, respectively.


In [34]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

## Train a logistic regression model on a subset of data

### 17. Now build a logistic regression classifier with train_matrix_word_subset as features and sentiment as the target. Call this model simple_model.

In [35]:
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data.sentiment)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### 18. Let us inspect the weights (coefficients) of the simple_model. First, build a table to store (word, coefficient) pairs. If you are using SFrame with scikit-learn, you can combine words with coefficients by running

In [36]:
simple_model.coef_.flatten()

array([ 1.36368976,  0.94399959,  1.19253827,  0.08551278,  0.52018576,
        1.50981248,  1.67307389,  0.50376046,  0.19090857,  0.05885467,
       -1.65157634, -0.20956286, -0.51137963, -2.03369861, -2.34829822,
       -0.62116877, -0.32055624, -0.89803074, -0.36216674, -2.10933109])

In [37]:
simple_model_coef_table = pd.DataFrame({'word':significant_words,
                                         'simple_model_coefficient':simple_model.coef_.flatten()})

In [38]:
simple_model_coef_table.head()

Unnamed: 0,simple_model_coefficient,word
0,1.36369,love
1,0.944,great
2,1.192538,easy
3,0.085513,old
4,0.520186,little


### Sort the data frame by the coefficient value in descending order.

In [39]:
simple_model_coef_table.sort_values('simple_model_coefficient', ascending=False)

Unnamed: 0,simple_model_coefficient,word
6,1.673074,loves
5,1.509812,perfect
0,1.36369,love
2,1.192538,easy
1,0.944,great
4,0.520186,little
7,0.50376,well
8,0.190909,able
3,0.085513,old
9,0.058855,car


### Quiz Question: Consider the coefficients of simple_model. How many of the 20 coefficients (corresponding to the 20 significant_words) are positive for the simple_model?

Answer: 10


### Quiz Question: Are the positive words in the simple_model also positive words in the sentiment_model?

In [40]:
all_words = vectorizer.vocabulary_.keys()

# Python technique to convert a dict_keys to list of keys
all_words = list(all_words)

In [41]:
sentiment_model_coeff_table = pd.DataFrame({ 'word': all_words, 'coefficients': sentiment_model.coef_.flatten()})

In [42]:
common_rows = pd.merge(sentiment_model_coeff_table, simple_model_coef_table, how='inner', on=['word'])
common_rows

Unnamed: 0,coefficients,word,simple_model_coefficient
0,0.002855448,disappointed,-2.348298
1,0.2670332,love,1.36369
2,0.02334827,product,-0.320556
3,5.687974e-07,well,0.50376
4,0.01044212,loves,1.673074
5,-0.3146971,little,0.520186
6,-0.005469662,easy,1.192538
7,9.087025e-06,work,-0.621169
8,0.06482916,great,0.944
9,0.2199914,would,-0.362167


### Quiz Question: Are the positive words in the simple_model also positive words in the sentiment_model?



Answer: No

## Comparing models

### 19. We will now compare the accuracy of the sentiment_model and the simple_model.

First, compute the classification accuracy of the sentiment_model on the train_data.

Now, compute the classification accuracy of the simple_model on the train_data.

In [78]:
def get_classification_accuracy(model, matrix_data, data):
    # First get the predictions
    data['predicted_sentiment_tmp'] = model.predict(matrix_data)
    
    # Compute the number of correctly classified examples
    matched_rows = data[data['sentiment'] == data['predicted_sentiment_tmp']]
    
    # Then compute accuracy by dividing num_correct by total number of examples
    print("matched rows")
    print(matched_rows['sentiment'].count())
    print("Totol rows")
    print(data['sentiment'].count())
    
    return (float)(matched_rows['sentiment'].count())/(data['sentiment'].count())

In [81]:
c = get_classification_accuracy(sentiment_model, train_matrix, train_data)
print('Accuracy of sentiment model on train data')
print(c)

c = get_classification_accuracy(simple_model, train_matrix_word_subset, train_data)
print('Accuracy of simple model on train data')
print(c)

matched rows
129212
Totol rows
133416
Accuracy of sentiment model on train data
0.968489536487
matched rows
115648
Totol rows
133416
Accuracy of simple model on train data
0.866822570007


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [83]:
c = get_classification_accuracy(sentiment_model, test_matrix, test_data)
print('Accuracy of sentiment model on test data')
print(c)

c = get_classification_accuracy(simple_model, test_matrix_word_subset, test_data)
print('Accuracy of simple model on test data')
print(c)

matched rows
31079
Totol rows
33336
Accuracy of sentiment model on test data
0.932295416367
matched rows
28981
Totol rows
33336
Accuracy of simple model on test data
0.869360451164


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Baseline: Majority class prediction

### 21. It is quite common to use the majority class classifier as the a baseline (or reference) model for comparison with your classifier model. The majority classifier model predicts the majority class for all data points. At the very least, you should healthily beat the majority class classifier, otherwise, the model is (usually) pointless.

Quiz Question: Enter the accuracy of the majority class classifier model on the test_data. Round your answer to two decimal places (e.g. 0.76).

Quiz Question: Is the sentiment_model definitely better than the majority class classifier (the baseline)?

In [85]:
num_positive = (train_data['sentiment'] == +1).sum()
num_negative = (train_data['sentiment'] == -1).sum()
print(num_positive)
print(num_negative)

112164
21252


In [86]:
num_positive  = (test_data['sentiment'] == +1).sum()
num_negative = (test_data['sentiment'] == -1).sum()
print(num_positive)
print(num_negative)

28095
5241


In [87]:
### Acuracy of majority classifier in test data
(float)(num_positive)/(test_data['sentiment'].count())

0.84278257739380846