In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.metrics import confusion_matrix

In [2]:
review = pd.read_csv('data/amazon_baby.csv')

In [6]:
len(review)

183531

In [3]:
review = review[:100000]

In [5]:
review.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


## Considering rating 3 as the decision boundary, hence removing all reviews with rating as 3

In [3]:
review_ = review[review['rating']!=3]

In [8]:
len(review_)

166752

In [8]:
review_.head()

Unnamed: 0,name,review,rating
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5


In [84]:
(review_['review'] == np.nan) == True

1         False
2         False
3         False
4         False
5         False
          ...  
183526    False
183527    False
183528    False
183529    False
183530    False
Name: review, Length: 166752, dtype: bool

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

## creating word count vector for our semantic analyzer model

In [9]:
vect = CountVectorizer(analyzer='word')

In [23]:
dic = {
    'product' : ['p1', 'p2', 'p3'],
    'review' : ['very good', 'poor', 'extremely good']
}

df = pd.DataFrame(dic)
df.head()

Unnamed: 0,product,review
0,p1,very good
1,p2,poor
2,p3,extremely good


In [55]:
pd.set_option('display.max_colwidth', -1)
x = vect.fit_transform(dic['review'])
df['count'] = x

In [56]:
vect.get_feature_names()

['extremely', 'good', 'poor', 'very']

In [57]:
x.toarray()

array([[0, 1, 0, 1],
       [0, 0, 1, 0],
       [1, 1, 0, 0]])

In [64]:
df['count']

0      (0, 3)\t1\n  (0, 1)\t1\n  (1, 2)\t1\n  (2, 1)\t1\n  (2, 0)\t1
1      (0, 3)\t1\n  (0, 1)\t1\n  (1, 2)\t1\n  (2, 1)\t1\n  (2, 0)\t1
2      (0, 3)\t1\n  (0, 1)\t1\n  (1, 2)\t1\n  (2, 1)\t1\n  (2, 0)\t1
Name: count, dtype: object

In [65]:
df['count'].map(lambda x: x.toarray())

0    [[0, 1, 0, 1], [0, 0, 1, 0], [1, 1, 0, 0]]
1    [[0, 1, 0, 1], [0, 0, 1, 0], [1, 1, 0, 0]]
2    [[0, 1, 0, 1], [0, 0, 1, 0], [1, 1, 0, 0]]
Name: count, dtype: object

In [59]:
type(df['count'])

pandas.core.series.Series

In [29]:
wc.transform(dic['review'])
wc

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [10]:
w_count = vect.fit_transform(review_['review'].values.astype('U'))

In [11]:
review_['word_count'] = list(w_count.toarray())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [39]:
review_.head()

Unnamed: 0,name,review,rating,word_count
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
review_['rating'] = review_['rating'] >= 4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
review_['rating'] = review_['rating'].map(lambda x: 1 if x else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
review_.head()

Unnamed: 0,name,review,rating,word_count
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [15]:
x = review_.word_count
y = review_.rating

In [16]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y,test_size=0.2, random_state=1)

In [None]:
model = linear_model.LogisticRegression()
model.fit(xtrain.tolist(), ytrain.tolist())



In [4]:
model.score(xtest.tolist(), ytest.tolist())

NameError: name 'model' is not defined

In [3]:
confusion_matrix(ytest, model.predict(xtest.tolist()))

NameError: name 'ytest' is not defined

In [30]:
1753 / (1753 + 661)

0.7261806130903066

In [31]:
1753 / (1753 + 507)

0.7756637168141592