# Logistic Regression Implementation Using pandas and scikit-learn

In [9]:
import pandas as pandas
# Load the Amazon baby-product reviews and, in the same step, replace missing
# values in the 'review' column with an empty string.
products = pandas.read_csv("amazon_baby.csv").fillna({'review': ''})
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [10]:
# a simple method to remove punctuation from review text
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters deleted are exactly those listed in ``string.punctuation``;
    letters, digits and whitespace are left untouched.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` without punctuation characters.
    """
    import re
    import string
    # The two-argument form ``text.translate(None, chars)`` exists only for
    # Python 2 byte strings; it raises TypeError for Python 3 ``str`` and for
    # Python 2 ``unicode``. A regex character class works for all of them.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return re.sub(punctuation_class, '', text)

# Strip punctuation from every review and store the result in a new
# 'review_clean' column. Applying the function to the 'review' column directly
# (Series.apply) avoids building a full row object per review, which is what
# DataFrame.apply(axis=1) does.
products['review_clean'] = products['review'].apply(remove_punctuation)
products.head()

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


In [11]:
# A rating of exactly 3 stars is treated as neutral, so those rows are dropped.
is_not_neutral = products['rating'] != 3
products = products[is_not_neutral]

In [12]:
# a simple function to get positive or negative sentiment based on rating
def get_sentiment(rating):
    """Map a star rating to a sentiment label.

    Returns +1 when ``rating`` is greater than 3 and -1 for anything else.
    """
    if rating > 3:
        return 1
    return -1

# Label every remaining review with its sentiment (+1 / -1) in a new
# 'sentiment' column.
# `products` is the result of a boolean filter, so writing a column into it in
# place triggers pandas' SettingWithCopyWarning; `assign` returns a new frame
# instead. The function is applied to the 'rating' column directly rather than
# row by row.
products = products.assign(sentiment=products['rating'].apply(get_sentiment))
products.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


In [13]:
# The train/test split is read from two JSON files; their contents are used
# as positional (iloc) row selectors into `products`.
import json

with open("train-idx.json") as trainfile:
    train_index = json.load(trainfile)
with open("test-idx.json") as testfile:
    test_index = json.load(testfile)

# select the training and test rows by position
train_data = products.iloc[train_index]
test_data = products.iloc[test_index]

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features. This token pattern is used to keep single-letter words.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

train_reviews = train_data['review_clean']
test_reviews = test_data['review_clean']

# Learn the vocabulary (word -> column mapping) from the training reviews and
# convert them into a sparse count matrix in one step.
train_matrix = vectorizer.fit_transform(train_reviews)
# Convert the test reviews with the same word -> column mapping; no refitting.
test_matrix = vectorizer.transform(test_reviews)

# Using LogisticRegression

In [15]:
from sklearn.linear_model import LogisticRegression
# Train a logistic regression classifier (default settings) on the word-count
# matrix, using the 'sentiment' column as the target.
alg = LogisticRegression()
alg.fit(train_matrix, train_data['sentiment'])
# fit() returns the estimator itself, so both names refer to the trained model.
sentiment_model = alg

In [16]:
# One learned weight per vocabulary word (first and only row of coef_).
coeffs = sentiment_model.coef_[0]
num_coeffs = coeffs.size
# Note: the comparison is >= 0, so weights that are exactly zero are counted too.
num_non_negative = (coeffs >= 0).sum()
print('Coefficients Length', num_coeffs)
print('Positive Coefficients', num_non_negative)

('Coefficients Length', 121712)
('Positive Coefficients', 85911)


In [17]:
# Take three test reviews (row positions 10, 11 and 12) to inspect by hand.
sample_test_data = test_data.iloc[10:13]
sample_reviews = sample_test_data['review']
print('First sample test data review \n', sample_reviews.iloc[0])
print("\n")
print('Second sample test data review', sample_reviews.iloc[1])

('First sample test data review \n', 'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.')


('Second sample test data review', 'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.')


In [18]:
# Score the sample reviews with the trained model, then turn each raw score
# into a label by hand: a score of zero or more means +1, anything else -1.
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print('scores', scores)
manual_labels = []
for score in scores:
    manual_labels.append(1 if score >= 0 else -1)
print(manual_labels)

('scores', array([  5.60798627,  -3.1429946 , -10.44043584]))
[1, -1, -1]


In [19]:
# Predict the class labels for the same sample reviews with the model's own
# predict() method, for comparison with the labels derived from the raw scores.
sentiment_model.predict(sample_test_matrix)

array([ 1, -1, -1])

In [20]:
# Predict class probabilities for the sample reviews using predict_proba.
# classes_ is printed first because it gives the class label that each
# probability column corresponds to.
print(sentiment_model.classes_)
print(sentiment_model.predict_proba(sample_test_matrix))

[-1  1]
[[  3.65504085e-03   9.96344959e-01]
 [  9.58631800e-01   4.13681997e-02]
 [  9.99970774e-01   2.92256134e-05]]


In [21]:
# Compute the probability of a positive review by hand: pass each raw decision
# score through the logistic (sigmoid) function 1 / (1 + exp(-score)).
import numpy as np
positive_probabilities = 1 / (1 + np.exp(-scores))
print(positive_probabilities)
# The last sample has the lowest probability of being classified as positive.

[  9.96344959e-01   4.13681997e-02   2.92256134e-05]


# Calculate probability for all test data

In [22]:
# Vectorize every test review, score it, and convert the scores into
# probabilities of a positive review with the sigmoid function.
test_matrix = vectorizer.transform(test_data['review_clean'])
final_scores = sentiment_model.decision_function(test_matrix)
# Keep the probabilities sorted in ascending order ...
final_probab_scores = np.sort(1 / (1 + np.exp(-final_scores)))
# ... and show them from highest to lowest.
print(final_probab_scores[::-1])

[  1.00000000e+00   1.00000000e+00   1.00000000e+00 ...,   8.27723545e-14
   1.73290201e-15   8.93702816e-16]


In [23]:
# Use scikit-learn's predict_proba to get class probabilities for every test
# review; column 1 is taken as the probability of the positive class.
predictions = sentiment_model.predict_proba(test_matrix)
positive_predictions = predictions[:, 1]

# Keep an independent copy of the probabilities (a plain assignment would only
# create a second name for the same array).
positive_predictions_copy = positive_predictions.copy()
products_name = test_data['name']

# Build the table column by column so that each column keeps its own dtype.
# Stacking the floats and the names into a single numpy array forces one shared
# dtype, which turns the probabilities into generic objects (or text) and makes
# sorting and display unreliable.
# `.values` drops the original row labels so both columns line up by position.
df = pandas.DataFrame(
    {'predictions': positive_predictions, 'name': products_name.values},
    columns=['predictions', 'name'],
)

# sort df to get 20 most positive reviews
df = df.sort_values(by='predictions', ascending=False)
print(df.head(20))

      predictions                                               name
20743           1  Fisher-Price Cradle 'N Swing,  My Little Snuga...
30634           1  Graco FastAction Fold Jogger Click Connect Str...
24899           1         Graco Pack 'n Play Element Playard - Flint
21531           1  Roan Rocco Classic Pram Stroller 2-in-1 with B...
17558           1  Freemie Hands-Free Concealable Breast Pump Col...
25554           1         Diono RadianRXT Convertible Car Seat, Plum
9555            1  Evenflo X Sport Plus Convenience Stroller - Ch...
18112           1  Infantino Wrap and Tie Baby Carrier, Black Blu...
9125            1         P'Kolino Silly Soft Seating in Tias, Green
32782           1      Mamas &amp; Papas 2014 Urbo2 Stroller - Black
26830           1  Baby Jogger City Mini GT Single Stroller, Shad...
24286           1                  Britax 2012 B-Agile Stroller, Red
30535           1  Buttons Cloth Diaper Cover - One Size - 8 Colo...
11923           1       Evenflo 6 

In [24]:
# Show the 20 most negative reviews: df is sorted by descending predicted
# probability of a positive review, so the lowest values are its last rows.
df.tail(20)

Unnamed: 0,predictions,name
5831,1.6146e-09,"Regalo My Cot Portable Bed, Royal Blue"
15062,1.49428e-09,"Thirsties Hemp Inserts 2 Pack, Small 6-18 Lbs"
205,1.10018e-09,Safety 1st Deluxe 4-in-1 Bath Station
28120,8.2667e-10,VTech Communications Safe &amp; Sound Digital ...
27231,7.09893e-10,NUK Cook-n-Blend Baby Food Maker
7310,6.50982e-10,Chicco Cortina KeyFit 30 Travel System in Adve...
31226,6.17932e-10,Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon...
13751,6.05155e-10,"Peg-Perego Tatamia High Chair, White Latte"
10814,4.57446e-10,Ellaroo Mei Tai Baby Carrier - Hershey
1810,4.42926e-10,Cosco Alpha Omega Elite Convertible Car Seat


# Compute accuracy of classifier

In [25]:
# Accuracy = number of test reviews whose predicted label equals the true
# sentiment, divided by the number of predictions.
predictions = sentiment_model.predict(test_matrix)
is_correct = test_data['sentiment'] == predictions
accuracy = int(is_correct.sum()) / float(len(predictions))
print('Sentiment Model accuracy is', accuracy)

('Sentiment Model accuracy is', 0.9322354211663066)


# Another classifier with fewer words

In [26]:
# Fixed vocabulary of 20 hand-picked words for the simpler classifier.
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]
# Count only the words listed above.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)

train_reviews_clean = train_data['review_clean']
test_reviews_clean = test_data['review_clean']
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_reviews_clean)
test_matrix_word_subset = vectorizer_word_subset.transform(test_reviews_clean)

In [27]:
# Train a second logistic regression model, this time on the 20-word count
# matrix, against the same training sentiment labels.
alg2 = LogisticRegression()
alg2.fit(train_matrix_word_subset, train_data['sentiment'])
# fit() returns the estimator itself, so both names refer to the trained model.
simple_model = alg2

In [28]:
# Pair each of the 20 selected words with the weight the simple model learned
# for it.
word_coefficients = simple_model.coef_.flatten()
simple_model_coef_table = pandas.DataFrame(
    {'word': significant_words, 'coefficient': word_coefficients}
)

In [29]:
# Order the words from the largest coefficient to the smallest, so the most
# positive words come first.
simple_model_coef_table.sort_values(by='coefficient', ascending=False)

Unnamed: 0,coefficient,word
6,1.673074,loves
5,1.509812,perfect
0,1.36369,love
2,1.192538,easy
1,0.944,great
4,0.520186,little
7,0.50376,well
8,0.190909,able
3,0.085513,old
9,0.058855,car


In [30]:
# Display the full model's learned coefficients as a flat 1-D array.
sentiment_model.coef_.flatten()

array([ -1.23889324e+00,   1.59863291e-04,   2.63828080e-02, ...,
         1.17685365e-02,   3.10346626e-03,  -6.36644403e-05])

# Comparing Sentiment Model and Simple Model

In [31]:
# Training accuracy of the full sentiment model: share of training reviews
# whose predicted label matches the true sentiment.
predictions = sentiment_model.predict(train_matrix)
is_correct = train_data['sentiment'] == predictions
accuracy = int(is_correct.sum()) / float(len(predictions))
print(accuracy)

0.96800233855


In [32]:
# Training accuracy of the 20-word simple model: share of training reviews
# whose predicted label matches the true sentiment.
predictions = simple_model.predict(train_matrix_word_subset)
is_correct = train_data['sentiment'] == predictions
accuracy = int(is_correct.sum()) / float(len(predictions))
print(accuracy)

0.866822570007


In [33]:
# Test accuracy of the full sentiment model: share of test reviews whose
# predicted label matches the true sentiment.
predictions = sentiment_model.predict(test_matrix)
is_correct = test_data['sentiment'] == predictions
accuracy = int(is_correct.sum()) / float(len(predictions))
print(accuracy)

0.932235421166


In [34]:
# Test accuracy of the 20-word simple model: share of test reviews whose
# predicted label matches the true sentiment.
predictions = simple_model.predict(test_matrix_word_subset)
is_correct = test_data['sentiment'] == predictions
accuracy = int(is_correct.sum()) / float(len(predictions))
print(accuracy)

0.869360451164


In [35]:
# Baseline: accuracy on the test data of a classifier that always predicts the
# majority class.
# The majority class is taken from the training labels instead of being
# hard-coded, and the denominator is the size of the test set itself rather
# than the length of whatever `predictions` array an earlier cell left behind.
majority_class = train_data['sentiment'].value_counts().idxmax()
is_majority = test_data['sentiment'] == majority_class
accuracy = int(is_majority.sum()) / float(len(test_data))
print(accuracy)

0.842782577394
