### Assignment 1

- Dataset chosen: Reviews of Musical Instruments
- The Musical Instruments review data comes from http://jmcauley.ucsd.edu/data/amazon/

In [1]:
# Data Loading
import pandas as pd
import numpy as np
import gzip
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV

# ASCII punctuation characters as one string; membership tests against it are
# substring checks, so the empty string also counts as "in punctuations".
punctuations = string.punctuation
# Shared spaCy pipeline used for tokenization and lemmatization below.
# NOTE(review): the "en" shortcut looks like spaCy 2.x usage; newer spaCy releases
# expect a full package name such as "en_core_web_sm" - confirm the installed version.
nlp = spacy.load("en")

def parse(path):
    """Lazily yield one record per line of a gzip-compressed file.

    Args:
        path: Path to a gzip file whose lines are each a Python-literal
            (loose JSON) dictionary.

    Yields:
        The object obtained by evaluating each line.

    The file is opened in a ``with`` block so the handle is released once the
    generator is exhausted, closed, or garbage-collected.
    """
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY(review): eval() executes arbitrary expressions found in the
            # file. Only use this on archives from a trusted source; consider
            # ast.literal_eval / json.loads if the file format allows it.
            yield eval(l)

def getDF(path):
    """Read every record produced by ``parse(path)`` into a DataFrame.

    Each record is keyed by its running position (0, 1, 2, ...); with
    ``orient='index'`` those keys become the row labels of the frame.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

# Load the review archive into a DataFrame; the relative path means the .json.gz
# file is looked up in the notebook's working directory.
df = getDF('reviews_Musical_Instruments_5.json.gz')
# print(len(df.overall))
# print(df.reviewText)


#### Classification Method 1: 
- Like binary classification, the approach can be extended to multiclass classification

In [2]:
# Data Preprocessing
# part of the code inspired by : https://github.com/Jcharis/Natural-Language-Processing-Tutorials/blob/master/Text%20Classification%20With%20Machine%20Learning,SpaCy,Sklearn(Sentiment%20Analysis)/Text%20Classification%20&%20Sentiment%20Analysis%20with%20SpaCy,Sklearn.ipynb

def scorePreprocessor(score):
    """Convert numeric ratings into three-way sentiment labels.

    Args:
        score: Iterable of numeric ratings (list, NumPy array, pandas Series, ...).

    Returns:
        A list with one label per rating, in input order:
            1  -> positive (rating > 3.0)
            -1 -> negative (rating < 3.0)
            0  -> neutral  (anything else, i.e. exactly 3.0)
    """
    res = []
    # Iterate over the values themselves instead of score[i]: on a pandas Series,
    # score[i] is a label lookup and fails for slices that do not start at label 0.
    for rating in score:
        if rating > 3.0:
            res.append(1)
        elif rating < 3.0:
            res.append(-1)
        else:
            res.append(0)
    return res
    
def handleNegation(review, terminators=string.punctuation):
    """Prefix tokens inside a negation scope with ``NOT_``.

    A scope opens after a negation token ("not" or "n't") and stays open until
    the next terminator token. Every token inside the scope, including the very
    last token of the review, is rewritten as ``"NOT_" + token``.

    Args:
        review: List of string tokens. It is modified in place.
        terminators: Container tested with ``token in terminators`` to decide
            whether a token closes the scope. Defaults to ``string.punctuation``;
            because that is a string, the empty token also closes a scope.

    Returns:
        The same list object that was passed in, after modification.
    """
    negation_words = ("not", "n't")
    negation_String = 'NOT_'
    in_scope = False
    for idx, token in enumerate(review):
        if token in terminators:
            # Punctuation ends the current negation scope and is left untouched.
            in_scope = False
        elif in_scope:
            review[idx] = negation_String + token
        elif token in negation_words:
            # The negation word itself stays as-is; marking starts with the next token.
            in_scope = True
    return review
            

def textNormalization(review):
    """Tokenize, lemmatize and negation-mark each review text.

    For every text, the shared ``nlp`` pipeline produces tokens; each token is
    replaced by its lowercased, stripped lemma, except that tokens whose lemma
    is the "-PRON-" placeholder keep their lowercased surface form. Negation
    scopes are then marked via ``handleNegation`` and tokens found in
    ``punctuations`` are dropped.

    Args:
        review: Iterable of review strings (list, pandas Series, ...).

    Returns:
        A list with one list of string tokens per input review.
    """
    result = []

    # Iterate over the values instead of review[i]: on a pandas Series, review[i]
    # is a label lookup and fails for slices that do not start at label 0.
    for text in review:
        mytokens = nlp(text)
        mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
        # Mark negation before punctuation is removed, since punctuation closes a scope.
        mytokens = handleNegation(mytokens)
        # Filtering out punctuations
        mytokens = [word for word in mytokens if word not in punctuations]
        result.append(mytokens)

    return result




#### Training the sentiment analyser
- First method: SVM and Naive Bayes classifiers
- Remember: use a Pipeline

In [3]:
# def N_gram(result):
#     # Use the zip function to help us generate n-grams
#     # Concatentate the tokens into ngrams and return
#     ngrams = zip(*[result[i:] for i in range(n)])
#     return [" ".join(ngram) for ngram in ngrams]
    

def stupidBackTosetence(result):
    """Join each token list back into a single space-separated string.

    Args:
        result: Iterable of token lists (each a list of strings).

    Returns:
        A list of strings, one per token list. An empty token list yields an
        empty string instead of raising ``IndexError``.
    """
    return [" ".join(tokens) for tokens in result]


def vectorizeAndNgram(result):
    """Turn review strings into a TF-IDF matrix of word bigrams.

    A new TfidfVectorizer is built on every call (English stop words, word
    analyzer, bigrams only, at most 30000 features), fitted on ``result`` and
    used to transform it. Only the feature matrix is returned; the fitted
    vectorizer is discarded.
    """
    tfidf_options = dict(
        stop_words='english',
        analyzer='word',
        ngram_range=(2, 2),
        max_features=30000,
    )
    bigram_tfidf = TfidfVectorizer(**tfidf_options)
    return bigram_tfidf.fit_transform(result)

def vectorizeNgramBayesPipe():
    """Build the count -> TF-IDF -> multinomial Naive Bayes pipeline.

    Returns:
        A ``(pipeline, param_grid)`` pair. The grid keys use the
        ``<step>__<parameter>`` form so they address the pipeline steps named
        'vect', 'tfidf' and 'clf'.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]

    # Candidate values for each tunable step parameter.
    param_grid = {}
    param_grid['vect__ngram_range'] = [(1, 1), (1, 2), (2, 2)]
    param_grid['tfidf__use_idf'] = (True, False)
    param_grid['tfidf__norm'] = ('l1', 'l2')
    param_grid['clf__alpha'] = [1, 1e-1, 1e-2]

    return Pipeline(steps), param_grid
    

# splitting data to 80% training_set 10% test_set and 10 %dev_set
def dataSplit(X_all, y_all, train_frac=0.8, test_frac=0.1):
    """Split features and labels positionally into train / test / dev parts.

    The first ``train_frac`` of the rows go to training, the next ``test_frac``
    to test, and whatever remains to dev. Rows are taken in their existing
    order; nothing is shuffled.

    Args:
        X_all: Feature container exposing ``.shape[0]`` and supporting slicing.
        y_all: Label sequence aligned with ``X_all`` and supporting slicing.
        train_frac: Fraction of rows used for training (default 0.8).
        test_frac: Fraction of rows used for testing (default 0.1).

    Returns:
        ``(X_train, y_train, X_test, y_test, X_dev, y_dev)``.

    Side effects:
        Prints the number of training rows.
    """
    n_samples = X_all.shape[0]
    # Builtin int() replaces np.int, which newer NumPy releases no longer provide.
    training_size = int(n_samples * train_frac)
    print(training_size)
    test_size = int(n_samples * test_frac)
    test_end = training_size + test_size

    X_train = X_all[0:training_size]
    X_test = X_all[training_size:test_end]
    X_dev = X_all[test_end:]

    y_train = y_all[0:training_size]
    y_test = y_all[training_size:test_end]
    y_dev = y_all[test_end:]

    return X_train, y_train, X_test, y_test, X_dev, y_dev


# Run the whole preprocessing chain on the first 100 reviews only:
# ratings -> sentiment labels, texts -> token lists -> strings -> TF-IDF bigrams.
scoreTarget = scorePreprocessor(df.overall[0:100])
review_norm = textNormalization(df.reviewText[0:100])
review = stupidBackTosetence(review_norm)
# NOTE(review): the TF-IDF vectorizer is fitted on all 100 rows here, before
# dataSplit separates the test/dev rows, so the held-out rows contribute to the
# vocabulary and IDF weights.
review_processed = vectorizeAndNgram(review)
X_train, y_train, X_test, y_test, X_dev, y_dev = dataSplit(review_processed, scoreTarget)



# Shapes of the three feature partitions (rows, features).
print(np.shape(X_train))
print(np.shape(X_test))
print(np.shape(X_dev))
#print(X_train)

80
(80, 2175)
(10, 2175)
(10, 2175)


In [4]:
# NOTE(review): this import sits outside the notebook's first import cell.
from sklearn.svm import SVC


# Fit an SVC on the training partition produced by dataSplit and print its
# score on the test partition.
# NOTE(review): this rebinds the name `clf`, which a later cell reuses for a
# GridSearchCV object.
clf = SVC(gamma='auto')
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.9


In [8]:
# NOTE(review): this import sits outside the notebook's first import cell.
from sklearn.metrics import classification_report

# Split the review strings (not the precomputed TF-IDF matrix) 80/20 with a fixed
# seed; the pipeline below does its own vectorization.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test from the earlier
# positional split, so re-running earlier cells afterwards sees different data.
X_train, X_test, y_train, y_test = train_test_split(review, scoreTarget, test_size=0.2, random_state=42)


# Grid-search the Naive Bayes pipeline with 3-fold cross-validation on the
# training split, selecting by macro-averaged F1.
text_clf, tuned_parameters = vectorizeNgramBayesPipe()
clf = GridSearchCV(text_clf, tuned_parameters, cv=3, scoring='f1_macro')
clf.fit(X_train, y_train)


# NOTE(review): the printed "development set" wording refers to the
# cross-validated training split above, not to the X_dev partition from dataSplit.
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
# One line per parameter combination: mean CV score and twice its standard deviation.
for mean, std, params in zip(clf.cv_results_['mean_test_score'], 
                             clf.cv_results_['std_test_score'], 
                             clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()

# Per-class precision / recall / F1 on the held-out 20% split.
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
print(classification_report(y_test, clf.predict(X_test), digits=4))
print()

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

Grid scores on development set:

0.311 (+/-0.011) for {'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
0.311 (+/-0.011) for {'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
0.311 (+/-0.011) for {'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__ngram_range': (2, 2)}
0.311 (+/-0.011) for {'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
0.311 (+/-0.011) for {'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
0.311 (+/-0.011) for {'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__ngram_range': (2, 2)}
0.311 (+/-0.011) for {'clf__alpha': 1, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
0.311 (+/-0.011) for {'clf__alpha

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
