# Natural Language Processing -- Big Data Programming 1 Project
## Machine Learning model to predict the rating based on reviews. 

In [133]:
import nltk
import pandas as pd
import string
# do the import for stopwords
from nltk.corpus import stopwords
#Importing CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Importing some models to test
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
# imports for model evaluations
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

#Multiprocessing
from multiprocessing import Process
import time
from multiprocessing import Pool

In [121]:
# Function to process the text reviews
def process_text(raw_text):
    # Check for the punctuations 
    nopunc = [char for char in raw_text 
              if char not in string.punctuation]

    # Join the characters 
    nopunc = ''.join(nopunc)
    
    # Remove stopwords (if any)
    return [word for word in nopunc.split() 
            if word.lower() not in stopwords.words('english')]

In [122]:
#Rating from stars to good & excellent -- We can leave it with 5 stars as well. 
def define_rating_class(rating):
    stars = [1,2,3]
    if rating in stars:
        #print (rating)
        return 'good'
    else:
        return 'Excellent'

In [123]:
# Evaluating our model -- insted of printing, we can save the report as saperate file as well
def model_evaluation(label, y_test, pred):
    print (label)
    print (confusion_matrix(y_test, pred))
    print (classification_report(y_test, pred))

# Implementation with pipeline feature for model comparison.

In [164]:
#def main(x): # x just to run pool, a test case.
def main():
    #%%time
    df1 = pd.read_csv('data_clean.csv')
    
#    X_train, X_test = train_test_split(df1,
#                                       test_size=0.001,
#                                       random_state=42)
#    df1 = X_test
    df = df1[['rating','reviewText', 'rev_len']]
    """let's deal with the possible NaN 
    Dropping all with NaN
    """
    df=df.dropna()
    """Spliting rating to good (1 to 3 stars) and 
    excellent (4 & 5 Stars)"""
    df['rating']= df['rating'].apply(define_rating_class)
    #Tokenization
    """Let's apply process_text() to the sms column in 
    our dataframe to get the tokens
    """    
    #df['reviewText'].apply(process_text);
    """Let's fit the CountVectorizer() to reviewText column 
    of our dataframe.
    """
    X = df['reviewText']
    y = df['rating']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    
    pipelines = [
        ('MultinomialNB_model', Pipeline([
            ('bow', CountVectorizer(analyzer = process_text)),
            ('tfidf', TfidfTransformer()),
            ('model_nb', MultinomialNB())])
        ),
        ('LogisticRegression_model', Pipeline([
            ('bow', CountVectorizer(analyzer = process_text)),
            ('tfidf', TfidfTransformer()),
            ('model_nb', LogisticRegression())])
        ),
        ('RandomForest_model', Pipeline([
            ('bow', CountVectorizer(analyzer = process_text)),
            ('tfidf', TfidfTransformer()),
            ('model_nb', RandomForestClassifier())])
        )
    ]
    
    for label, pipeline in pipelines:
        pipeline.fit(X_train,y_train)
        pred = pipeline.predict(X_test)
        """Evaluation"""
        model_evaluation(label, y_test, pred)
    #%%time

In [125]:
if __name__ == '__main__':
    start = time.time()
    p = Process(target=main)#, args=('bob',))
    p.start() 
    p.join()
#    main( )
    end = time.time()
    print(end - start)

MultinomialNB_model
[[25404    11]
 [ 7025    66]]
              precision    recall  f1-score   support

   Excellent       0.78      1.00      0.88     25415
        good       0.86      0.01      0.02      7091

   micro avg       0.78      0.78      0.78     32506
   macro avg       0.82      0.50      0.45     32506
weighted avg       0.80      0.78      0.69     32506





LogisticRegression_model
[[24545   870]
 [ 3358  3733]]
              precision    recall  f1-score   support

   Excellent       0.88      0.97      0.92     25415
        good       0.81      0.53      0.64      7091

   micro avg       0.87      0.87      0.87     32506
   macro avg       0.85      0.75      0.78     32506
weighted avg       0.86      0.87      0.86     32506





RandomForest_model
[[25083   332]
 [ 5771  1320]]
              precision    recall  f1-score   support

   Excellent       0.81      0.99      0.89     25415
        good       0.80      0.19      0.30      7091

   micro avg       0.81      0.81      0.81     32506
   macro avg       0.81      0.59      0.60     32506
weighted avg       0.81      0.81      0.76     32506

4049.3036601543427


Link to understand classification report.<br>
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html


## Anything above is final. below are all tests

In [165]:
if __name__ == '__main__':
    start = time.time()
    with Pool(5) as p:
        #p.starmap(main, [1,2])
        p.map(main())#,[1])
#    p = Process(target=main)#, args=('bob',))
    #p.start() 
    #p.join()
#    main( )
    end = time.time()
    print(end - start)

MultinomialNB_model
[[25404    11]
 [ 7025    66]]
              precision    recall  f1-score   support

   Excellent       0.78      1.00      0.88     25415
        good       0.86      0.01      0.02      7091

   micro avg       0.78      0.78      0.78     32506
   macro avg       0.82      0.50      0.45     32506
weighted avg       0.80      0.78      0.69     32506





LogisticRegression_model
[[24545   870]
 [ 3358  3733]]
              precision    recall  f1-score   support

   Excellent       0.88      0.97      0.92     25415
        good       0.81      0.53      0.64      7091

   micro avg       0.87      0.87      0.87     32506
   macro avg       0.85      0.75      0.78     32506
weighted avg       0.86      0.87      0.86     32506





RandomForest_model
[[25062   353]
 [ 5789  1302]]
              precision    recall  f1-score   support

   Excellent       0.81      0.99      0.89     25415
        good       0.79      0.18      0.30      7091

   micro avg       0.81      0.81      0.81     32506
   macro avg       0.80      0.58      0.59     32506
weighted avg       0.81      0.81      0.76     32506



TypeError: map() missing 1 required positional argument: 'iterable'

In [127]:
start = time.time()

end = time.time()
print(end - start)

5.1975250244140625e-05


In [130]:
print(time.time())

1543562593.158515


# Implementation without Pipeline feature - Draft 2 

In [22]:
def main():
    df1 = pd.read_csv('data_clean.csv')
    X_train, X_test = train_test_split(df1,
                                       test_size=0.5,
                                       random_state=42)
    df1 = X_test
    df = df1[['rating','reviewText', 'rev_len']]
    """let's deal with the possible NaN 
    Dropping all with NaN
    """
    df=df.dropna()
    """Spliting rating to good (1 to 3 stars) and 
    excellent (4 & 5 Stars)"""
    df['rating']= df['rating'].apply(define_rating_class)
    #Tokenization
    """Let's apply process_text() to the sms column in 
    our dataframe to get the tokens
    """    
    df['reviewText'].apply(process_text);
    """Let's fit the CountVectorizer() to reviewText column 
    of our dataframe.
    """
    bow_transformer = CountVectorizer(analyzer=process_text
                                     ).fit(df['reviewText'])
    """Now, Let's move on and use .transform with bow_transformer
    and transform the entire DataFrame of sms corpus.
    """
    df_bow = bow_transformer.transform(df['reviewText'])
    #TfidfTransformer
    """#Let's create an instance for TfidfTransformer 
    and fit to the df_bow
    """
    tfidf_trans = TfidfTransformer()
    tfidf_trans.fit(df_bow)
    """lets transfer our entire BoW "sms_bow" to TF-IDF corpus! Instead 
    of passing a single sms as a BoW, we will pass the entire corpus.
    """
    df_tfidf = tfidf_trans.transform(df_bow)
    """
    Naive Bayes is considered as a good choice in text retrieval community.
    With appropriate pre-processing, Naive Bayes is competitive in this domain
    with more advanced methods including support vector machines.
    Let's use the one which is accepted by the experts as a better choice!
    We have imported the multinomial Naive Bayes model above, "MultinomialNB" from sklearn.
    Let's create instance for MultinomialNB now!
    """
    rating_predictions = MultinomialNB()
    """Train test split"""
    X = df_tfidf
    y = df['rating']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.33, 
        random_state=42)
    """training on train data"""
    rating_predictions.fit(X_train, y_train)
    """predictions for the test data"""
    pred = rating_predictions.predict(X_test)
    """Evaluation"""
    model_evaluation(y_test, pred)

In [23]:
if __name__ == '__main__':
    main( )

[[517   0]
 [134   0]]
              precision    recall  f1-score   support

   Excellent       0.79      1.00      0.89       517
        good       0.00      0.00      0.00       134

   micro avg       0.79      0.79      0.79       651
   macro avg       0.40      0.50      0.44       651
weighted avg       0.63      0.79      0.70       651



  'precision', 'predicted', average, warn_for)


# Simple implementation with notebook - individual pipeline for each model - Draft 1

In [6]:
#df = df1[['rating','reviewText', 'rev_len']]
#df.head()

In [335]:
df['rating']= df['rating'].apply(define_rating_class)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [336]:
df['rating'].value_counts()

Excellent    1554
good          417
Name: rating, dtype: int64

In [337]:
df.head()

Unnamed: 0,rating,reviewText,rev_len
79950,good,"I know I have had the mug since Feb, but it ha...",248
83548,good,I bought these ant baits based on the reviews ...,1135
57790,Excellent,"Well, the Kelty guys don't need to be speciall...",874
18965,Excellent,The trampoline arrived quickly and the box was...,2998
33517,Excellent,The Flex belt pads i ordered was the same i re...,101


Some of the message are 'NaN' which generate error, Let's create another dataframe to see what are those messages.  

In [338]:
# how manay are NaN!
sum(pd.isnull(df['reviewText']))

0

In [339]:
# Whic are NaN
df3 = df[pd.isnull(df['reviewText'])]

In [340]:
df3

Unnamed: 0,rating,reviewText,rev_len


In [341]:
#any NaN in rating?
sum(pd.isnull(df['rating']))

0

In [342]:
# Dropping all with NaN
df=df.dropna()

In [343]:
sum(pd.isnull(df['reviewText']))

0

In [344]:
df.head()

Unnamed: 0,rating,reviewText,rev_len
79950,good,"I know I have had the mug since Feb, but it ha...",248
83548,good,I bought these ant baits based on the reviews ...,1135
57790,Excellent,"Well, the Kelty guys don't need to be speciall...",874
18965,Excellent,The trampoline arrived quickly and the box was...,2998
33517,Excellent,The Flex belt pads i ordered was the same i re...,101


### Good to go!

import string
#do the import for stopwords
from nltk.corpus import stopwords

In [345]:
def process_text(raw_text):
    # Check for the punctuations 
    nopunc = [char for char in raw_text 
              if char not in string.punctuation]

    # Join the characters 
    nopunc = ''.join(nopunc)
    
    # Remove stopwords (if any)
    return [word for word in nopunc.split() 
            if word.lower() not in stopwords.words('english')]

Tokenization<br>
Let's apply process_text() to the reviews column in our dataframe to get the tokens

In [346]:
df['reviewText'].apply(process_text);

Vectorization

#Importing CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [347]:
#Let's fit the CountVectorizer() to reviewText column of our dataframe.
bow_transformer = CountVectorizer(
    analyzer=process_text).fit(df['reviewText'])

Now, Let's move on and use .transform with bow_transformer and transform the entire DataFrame of sms corpus.

In [348]:
df_bow = bow_transformer.transform(df['reviewText'])

scikit-learn provides a tool TfidfTransformer, let's do this!

In [349]:
# Doing import 
#from sklearn.feature_extraction.text import TfidfTransformer
#Let's create an instance for TfidfTransformer and fit to the df_bow
tfidf_trans = TfidfTransformer()
tfidf_trans.fit(df_bow)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

lets transfer our entire BoW "sms_bow" to TF-IDF corpus! 
Instead of passing a single sms as a BoW, we will pass the entire corpus.

In [350]:
df_tfidf = tfidf_trans.transform(df_bow)

In [351]:
# Let's check the shape of sms_tfidf
df_tfidf.shape

(1971, 11680)

Now, as we have transformed our data into its vector form, we can train our Machine Learning algorithm.
Its a rating predictions, a classification problem.<br>
Naive Bayes is considered as a good choice in text retrieval. With appropriate pre-processing, Naive Bayes is competitive in this domain with more advanced methods including support vector machines.<br>
Let's work with multinomial Naive Bayes model, and then explore others for comparisons. 

In [352]:
# Let's do the import for Naive Bayes
#from sklearn.naive_bayes import MultinomialNB
# creating instance
rating_predictions = MultinomialNB()

In [353]:
# train test split
#from sklearn.model_selection import train_test_split 
X = df_tfidf
y = df['rating']

In [354]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [355]:
# training on train data
rating_predictions.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [356]:
# predictions for the test data
pred = rating_predictions.predict(X_test)

In [357]:
#from sklearn.metrics import classification_report, confusion_matrix
print (confusion_matrix(y_test, pred))
print (classification_report(y_test, pred))

[[517   0]
 [134   0]]
              precision    recall  f1-score   support

   Excellent       0.79      1.00      0.89       517
        good       0.00      0.00      0.00       134

   micro avg       0.79      0.79      0.79       651
   macro avg       0.40      0.50      0.44       651
weighted avg       0.63      0.79      0.70       651



  'precision', 'predicted', average, warn_for)


Individual Pipeline for each model<br>
The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. We can set up all the transformation, we did during the text processing, in a single unit using pipeline feature of scikit-learn. Rather than doing all steps one-by-one, we can then call that single unit for our data processing. In this way, we save lots of time and there is no need to re-do all the transformation steps manually. A simple call of pipeline object, with stored steps, on the data will do all the processing in future. 
Let's see this will work on our dataset! Let's do the train_test_split() again using our raw data. 

In [358]:
# train_test_split() on the raw data
X = df['reviewText']
y = df['rating']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

Creating pipeline, need to do import first

In [359]:
#from sklearn.pipeline import Pipeline

In [360]:
pipeline = Pipeline([
    # Tokenization using scikit's CountVectorizer 
    ('baw', CountVectorizer(analyzer=process_text)),  
    
    # Computing TF-IDF  -- weighted scores
    ('tfidf', TfidfTransformer()),  
    
     # Naive Bayes classifier to train on TF-IDF vectors
    ('model_nb', MultinomialNB()), 
])

In [361]:
#Traing / Fitting using pipeline
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('baw', CountVectorizer(analyzer=<function process_text at 0x1a1adc92f0>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None..._tf=False, use_idf=True)), ('model_nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [362]:
# predictions for the test data
pred = pipeline.predict(X_test)

In [363]:
print (confusion_matrix(y_test, pred))
print (classification_report(y_test, pred))

[[517   0]
 [134   0]]
              precision    recall  f1-score   support

   Excellent       0.79      1.00      0.89       517
        good       0.00      0.00      0.00       134

   micro avg       0.79      0.79      0.79       651
   macro avg       0.40      0.50      0.44       651
weighted avg       0.63      0.79      0.70       651



  'precision', 'predicted', average, warn_for)


### Logistic Regression Classifier

In [364]:
#from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
    ('baw', CountVectorizer(analyzer=process_text)),  
    ('tfidf', TfidfTransformer()),  
    
     # only change is RandomForestClassifier() 
    ('model', LogisticRegression()), 
])
# training the model
pipeline.fit(X_train,y_train)
# doing predictions
pred = pipeline.predict(X_test)
#Evaluation
print (confusion_matrix(y_test, pred))
print (classification_report(y_test, pred))



[[517   0]
 [133   1]]
              precision    recall  f1-score   support

   Excellent       0.80      1.00      0.89       517
        good       1.00      0.01      0.01       134

   micro avg       0.80      0.80      0.80       651
   macro avg       0.90      0.50      0.45       651
weighted avg       0.84      0.80      0.71       651



### Random Forests Classifier

In [365]:
#from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
    ('baw', CountVectorizer(analyzer=process_text)),  
    ('tfidf', TfidfTransformer()),  
    
     # only change is RandomForestClassifier() 
    ('model', RandomForestClassifier()), 
])

# training the model
pipeline.fit(X_train,y_train)
# doing predictions
pred = pipeline.predict(X_test)
#Evaluation
print (confusion_matrix(y_test, pred))
print (classification_report(y_test, pred))



[[509   8]
 [114  20]]
              precision    recall  f1-score   support

   Excellent       0.82      0.98      0.89       517
        good       0.71      0.15      0.25       134

   micro avg       0.81      0.81      0.81       651
   macro avg       0.77      0.57      0.57       651
weighted avg       0.80      0.81      0.76       651



## Below is a rough work to see the multi-processing implementation in Python

In [85]:
from multiprocessing import Process
import os

In [86]:
def info(title):
    print(title)
    print('module name:', __name__)
    print('parent process:', os.getppid())
    print('process id:', os.getpid())

In [78]:
df1 = pd.read_csv('data_clean.csv')
df = df1[['rating','reviewText', 'rev_len']]
df=df.dropna()

In [None]:
df['reviewText'].apply(process_text)

In [87]:
def f():
    info('function f')
    df1 = pd.read_csv('data_clean.csv')
    df = df1[['rating','reviewText', 'rev_len']]
    df=df.dropna()
    df['reviewText'].apply(process_text)

In [89]:
if __name__ == '__main__':
    info('main line')
    p = Process(target=f)#, args=('bob',))
    p.start() 
    p.join()

main line
module name: __main__
parent process: 2113
process id: 7944
function f
module name: __main__
parent process: 7944
process id: 8190


Process Process-4:
Traceback (most recent call last):
  File "/anaconda/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/anaconda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-87-10f983715e87>", line 6, in f
    df['reviewText'].apply(process_text)
  File "/anaconda/lib/python3.6/site-packages/pandas/core/series.py", line 2355, in apply
    mapped = lib.map_infer(values, f, convert=convert_dtype)
  File "pandas/_libs/src/inference.pyx", line 1569, in pandas._libs.lib.map_infer (pandas/_libs/lib.c:66440)
  File "<ipython-input-70-abd044477b15>", line 11, in process_text
    return [word for word in nopunc.split()
  File "<ipython-input-70-abd044477b15>", line 12, in <listcomp>
    if word.lower() not in stopwords.words('english')]
  File "/anaconda/lib/python3.6/site-packages/nltk/corpus/reader/wordlist.py", line 22, in words
    return [line for line in line_token

KeyboardInterrupt: 

  File "/anaconda/lib/python3.6/site-packages/nltk/corpus/reader/wordlist.py", line 23, in <listcomp>
    if not line.startswith(ignore_lines_startswith)]
KeyboardInterrupt
