In [1]:
# Import Required Libraries 
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 
from collections import defaultdict
from nltk import word_tokenize
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Fetch the NLTK data used further down by word_tokenize, pos_tag and
# WordNetLemmatizer.
# Newer NLTK releases (3.9+) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the older pickled 'punkt' and
# 'averaged_perceptron_tagger' packages, and some 3.6.x/3.7 releases need
# 'omw-1.4' next to 'wordnet'. Both generations are requested so that
# Restart & Run All works regardless of the installed NLTK version.
# nltk.download() reports (rather than raises) when a package is unavailable.
for _nltk_resource in (
    'wordnet',
    'omw-1.4',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
# Build Lemmatizer class for Count Vectorizer 

# Map the first letter of a Penn Treebank POS tag to the matching WordNet
# POS constant. Any other first letter falls back to noun through the
# defaultdict factory.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {
        'J': wn.ADJ,
        'V': wn.VERB,
        'R': wn.ADV,
        'N': wn.NOUN,
    },
)

class LemmaTokenizer(object):
    """Callable that turns a raw string into a list of WordNet lemmas.

    Instances are passed as the ``tokenizer`` argument of CountVectorizer
    in the pipeline cell below.
    """

    def __init__(self):
        """Create the WordNet lemmatizer reused for every call."""
        self.wnl = WordNetLemmatizer()

    def __call__(self, text):
        """Tokenize, POS-tag and lemmatize ``text``.

        Args:
            text: the string to split into tokens.

        Returns:
            A list with one lemma per token, in token order. Each token is
            lower-cased before lemmatization, and the WordNet POS is looked
            up in ``tag_map`` by the first character of the token's POS tag.
        """
        tagged_tokens = pos_tag(word_tokenize(text))
        return [
            self.wnl.lemmatize(word.lower(), tag_map[pos[0]])
            for word, pos in tagged_tokens
        ]

In [3]:
# Load Training and Validation Datasets
# Both files are read relative to the notebook's working directory.
# The later cells read the 'Review' and 'Sentiment' columns from each frame.
train = pd.read_csv('training.csv')
valid = pd.read_csv('valid.csv')

# Preview the training data. A notebook only renders the value of a cell's
# last expression, so the preview has to come after both loads to be shown.
train.head()

In [4]:
# Build Pipeline for Pre-processing and training

# scikit-learn 1.1 renamed SGDClassifier's logistic-regression loss from
# 'log' to 'log_loss', and 1.3 removed the old spelling. Pick whichever name
# the installed version accepts so the cell runs on old and new releases.
_sk_major_minor = tuple(int(part) for part in sk.__version__.split('.')[:2])
logistic_loss = 'log_loss' if _sk_major_minor >= (1, 1) else 'log'

# Steps: lemma-based bag of words -> TF-IDF weighting -> linear classifier
# trained with SGD on the logistic loss (L2 penalty, fixed seed).
text_classifier = Pipeline([
    ('vectorizer', CountVectorizer(lowercase=True, tokenizer=LemmaTokenizer())),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss=logistic_loss, penalty='l2',
                          alpha=1e-3, random_state=23,
                          max_iter=250, tol=None, learning_rate='optimal')),
])

# Fit on the training reviews and their sentiment labels.
text_classifier.fit(train['Review'], train['Sentiment'])

# Predicted sentiment for every validation review; used by the evaluation cell.
predicted = text_classifier.predict(valid['Review'])


In [5]:
# Evaluation Metrics
# Compare the validation labels with the pipeline's predictions.
y_valid = valid['Sentiment']

# Accuracy (read from the dictionary form of the classification report)
report = metrics.classification_report(
    y_true=y_valid,
    y_pred=predicted,
    output_dict=True,
)
acc = report['accuracy']
print("Accuracy", round(acc * 100, 3), '%')

# F1-Score (macro average: unweighted mean of the per-class F1 scores)
macro_f1 = metrics.f1_score(y_true=y_valid, y_pred=predicted, average='macro')
f1 = round(macro_f1, 3)
print("F1-Score:", f1)

# Confusion Matrix
# NOTE(review): the display names assume the sorted label values correspond to
# Negative, Neutral, Positive in that order -- confirm against the data.
class_names = ['Negative', 'Neutral', 'Positive']
matrix = pd.DataFrame(
    metrics.confusion_matrix(y_true=y_valid, y_pred=predicted)
)
matrix.columns = class_names
confusion = matrix.rename(index=dict(enumerate(class_names)))
print(" Confusion Matrix:")
print(confusion)


Accuracy 70.339 %
F1-Score: 0.697
 Confusion Matrix:
          Negative  Neutral  Positive
Negative        28       14         6
Neutral          4       21         7
Positive         1        3        34
