In [2]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline

import re  
import nltk  
from sklearn.datasets import load_files  
nltk.download('stopwords')  
import pickle  
from nltk.corpus import stopwords 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amyscott/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


For this challenge I set out to use the words in the reviews to determine if the reviews were positive or negative. This data took some cleaning and reorganizing the features to get to where I wanted, and I will walk you through the steps that I took to determine the outcome of the reviews. 

In [3]:
df=pd.read_json('reviews_Digital_Music_5.json.gz', lines=True)
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,5555991584,"[3, 3]",5,"It's hard to believe ""Memory of Trees"" came ou...","09 12, 2006",A3EBHHCZO6V2A4,"Amaranth ""music fan""",Enya's last great album,1158019200
1,5555991584,"[0, 0]",5,"A clasically-styled and introverted album, Mem...","06 3, 2001",AZPWAXJG9OJXV,bethtexas,Enya at her most elegant,991526400
2,5555991584,"[2, 2]",5,I never thought Enya would reach the sublime h...,"07 14, 2003",A38IRL0X2T4DPF,bob turnley,The best so far,1058140800
3,5555991584,"[1, 1]",5,This is the third review of an irish album I w...,"05 3, 2000",A22IK3I6U76GX0,Calle,Ireland produces good music.,957312000
4,5555991584,"[1, 1]",4,"Enya, despite being a successful recording art...","01 17, 2008",A1AISPOIIHTHXX,"Cloud ""...""",4.5; music to dream to,1200528000


In this cell I changed the overall score reviews to either 1 if it was greater than 3 on a 1 to 5 scale and if less then 3 it was changed to 0. By doing this I was able make the data binary and I could use more models to help classify and predict the reviews later on. 

In [4]:
df['feedback'] = df['overall'].apply(lambda x: 1 if x > 3 else 0)
print(df)

             asin     helpful  overall  \
0      5555991584      [3, 3]        5   
1      5555991584      [0, 0]        5   
2      5555991584      [2, 2]        5   
3      5555991584      [1, 1]        5   
4      5555991584      [1, 1]        4   
5      5555991584    [62, 65]        5   
6      5555991584      [1, 5]        3   
7      5555991584      [5, 5]        5   
8      5555991584      [4, 4]        5   
9      5555991584    [12, 12]        5   
10     5555991584      [2, 3]        5   
11     5555991584      [1, 1]        4   
12     5555991584      [1, 1]        5   
13     5555991584    [12, 13]        5   
14     5555991584      [2, 2]        4   
15     5555991584      [3, 4]        5   
16     5555991584      [0, 0]        5   
17     5555991584      [2, 3]        5   
18     5555991584      [2, 2]        4   
19     5555991584      [8, 9]        5   
20     5555991584      [0, 0]        5   
21     5555991584      [3, 4]        5   
22     5555991584      [4, 5]     

In [5]:
X= df.summary
y = df.feedback  

In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amyscott/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):  
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)

    documents.append(document)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer  
vectorizer = CountVectorizer(max_features=15, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
X = vectorizer.fit_transform(documents).toarray()  

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer  
tfidfconverter = TfidfTransformer()  
X = tfidfconverter.fit_transform(X).toarray() 


In [10]:
from sklearn.model_selection import train_test_split 
y= df['feedback']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)  
classifier.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [12]:
importance = classifier.feature_importances_
print (importance)

[0.09696791 0.16737487 0.01544519 0.02749133 0.14770603 0.06166221
 0.13087292 0.02013318 0.05387266 0.03143435 0.05417279 0.01373465
 0.03710737 0.04173997 0.10028456]


In [15]:
indices = np.argsort(importance)[::-1]

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importance[indices[f]]))

1. feature 1 (0.167375)
2. feature 4 (0.147706)
3. feature 6 (0.130873)
4. feature 14 (0.100285)
5. feature 0 (0.096968)
6. feature 5 (0.061662)
7. feature 10 (0.054173)
8. feature 8 (0.053873)
9. feature 13 (0.041740)
10. feature 12 (0.037107)
11. feature 9 (0.031434)
12. feature 3 (0.027491)
13. feature 7 (0.020133)
14. feature 2 (0.015445)
15. feature 11 (0.013735)


In [65]:
y_pred = classifier.predict(X_test) 

In [66]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

[[    7  2460]
 [   10 10465]]
              precision    recall  f1-score   support

           0       0.41      0.00      0.01      2467
           1       0.81      1.00      0.89     10475

   micro avg       0.81      0.81      0.81     12942
   macro avg       0.61      0.50      0.45     12942
weighted avg       0.73      0.81      0.73     12942

0.8091485087312625


In [67]:
with open('text_classifier', 'wb') as picklefile:  
    pickle.dump(classifier,picklefile)

In [68]:
with open('text_classifier', 'rb') as training_model:  
    model = pickle.load(training_model)

In [69]:
y_pred2 = model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))  
print(classification_report(y_test, y_pred2))  
print(accuracy_score(y_test, y_pred2))

[[    7  2460]
 [   10 10465]]
              precision    recall  f1-score   support

           0       0.41      0.00      0.01      2467
           1       0.81      1.00      0.89     10475

   micro avg       0.81      0.81      0.81     12942
   macro avg       0.61      0.50      0.45     12942
weighted avg       0.73      0.81      0.73     12942

0.8091485087312625
