In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_similarity_score



In [2]:
# Load the cleaned book data set; later cells read its "tone", "genre" and "content" columns.
# NOTE(review): the file carries a .tsv extension but is parsed with read_csv's default
# separator -- confirm the file really is comma-delimited.
df = pd.read_csv("../DataProcessing/tone_content_genre_cleaned.tsv")

In [3]:
# Missing cells become empty strings so the later `.split(",")` calls always receive a str
df = df.fillna("")

In [4]:
# Binarize the tone annotations into a 0/1 indicator matrix (one column per tone)
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# each "tone" cell is a comma-separated string -> list of tone names per book
labels_list = [tone_cell.split(",") for tone_cell in df["tone"]]

labels = mlb.fit_transform(labels_list)

# check shape
labels.shape

(7005, 59)

In [6]:
# Binarize the genre annotations into a 0/1 indicator matrix (one column per genre)
# each "genre" cell is a comma-separated string -> list of genre names per book
genres_list = [genre_cell.split(",") for genre_cell in df["genre"]]

# NOTE: this refits the same `mlb` object that was fitted on the tones, so from here on
# `mlb.classes_` describes the genre columns, not the tone columns.
genres = mlb.fit_transform(genres_list)

genres.shape

(7005, 332)

In [7]:
# split train and test: the first N_TRAIN rows form the training set, the remainder the test set
N_TRAIN = 5136

# column 0 of df holds the review text; keep it as a one-column DataFrame
df_train_corpus = pd.DataFrame(df.iloc[:N_TRAIN, 0])
df_test_corpus = pd.DataFrame(df.iloc[N_TRAIN:, 0])

# the indicator matrices are cut at the same row so they stay aligned with the corpus
df_train_genre, df_test_genre = genres[:N_TRAIN], genres[N_TRAIN:]
df_train_label, df_test_label = labels[:N_TRAIN], labels[N_TRAIN:]

In [8]:
# check shape of every split, in the order corpus / genre / label (train before test)
for split_part in (
    df_train_corpus,
    df_test_corpus,
    df_train_genre,
    df_test_genre,
    df_train_label,
    df_test_label,
):
    print(split_part.shape)

(5136, 1)
(1869, 1)
(5136, 332)
(1869, 332)
(5136, 59)
(1869, 59)


In [9]:
# clean the text
def CleanText(raw_comment):
    """Lower-case `raw_comment` and delete every character that is neither a
    word character (letter, digit, underscore) nor whitespace.

    Returns the cleaned string; whitespace is left exactly as it was.
    """
    lowered = raw_comment.lower()
    return re.sub(r"[^\w\s]", "", lowered)

# Remove stop words
stop_words = set(stopwords.words('english'))

# One case-insensitive alternation over all stop words. A match needs a word boundary
# in front and one non-word character behind it; that trailing character is consumed
# together with the stop word.
stop_word_pattern = r"\b(" + "|".join(stop_words) + ")\\W"
re_stop_words = re.compile(stop_word_pattern, re.I)

def removeStopWords(sentence):
    """Replace every stop word, together with the non-word character that follows it,
    by a single space.

    A stop word at the very end of `sentence` has no following character and is
    therefore left in place.
    """
    return re_stop_words.sub(" ", sentence)

# Stemming
stemmer = SnowballStemmer("english")

def stemming(sentence):
    """Stem every whitespace-separated token of `sentence` with the Snowball stemmer
    and return the stems joined by single spaces, without surrounding whitespace.
    """
    stems = (stemmer.stem(token) for token in sentence.split())
    return " ".join(stems).strip()

In [11]:
# load the tf-idf vectorizer that was saved to disk
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code -- only load
# files from a trusted source. Presumably the vectorizer was fitted on text preprocessed
# with CleanText / removeStopWords / stemming; confirm against the training notebook.
tfidf_vectorizer = joblib.load("../saved_models/TfidfVectorizer.pkl")

In [12]:
# load the saved tone classifier; later cells call its predict_proba()
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code -- only load
# files from a trusted source. The file name suggests the model was trained on
# tf-idf + genre features; confirm the expected feature layout.
BinaryClassifier = joblib.load("../saved_models/BinaryClassifierWithGenre.pkl")



### Evaluation Functions

In [13]:
def getPrecision(true_label, predicted_prob, K):
    '''
    Get the precision@K

    Ranks the labels by descending `predicted_prob` (ties keep their original order),
    sums the `true_label` values of the K best-ranked labels and divides by K.
    '''
    ranked_pairs = sorted(zip(predicted_prob, true_label), key=lambda pair: pair[0], reverse=True)
    hits = sum([label for _, label in ranked_pairs[:K]])
    return hits / float(K)

In [14]:
def getRecall(true_label, predicted_prob, K):
    '''
    Get the recall@K

    Ranks the labels by descending `predicted_prob` (ties keep their original order),
    sums the `true_label` values of the K best-ranked labels and divides by the number
    of labels equal to 1 in `true_label`.

    Parameters
    ----------
    true_label : 0/1 indicator sequence (numpy array or plain list)
    predicted_prob : scores, same length as `true_label`
    K : number of top-ranked labels to consider

    Returns
    -------
    The recall@K, or 0.0 when `true_label` contains no positive label (recall is
    undefined there; returning 0.0 keeps running averages free of NaN / errors).
    '''
    # asarray so that the `== 1` mask also works for plain Python lists
    truth = np.asarray(true_label)
    n_relevant = int(np.count_nonzero(truth == 1))
    if n_relevant == 0:
        return 0.0

    ranked_pairs = sorted(zip(predicted_prob, truth), key=lambda pair: pair[0], reverse=True)
    hits = sum([label for _, label in ranked_pairs[:K]])
    return hits / float(n_relevant)

In [15]:
def assignValues(predicted_labels_prob, top_k=2):
    '''
    Binarize an array of scores in place: the `top_k` highest entries (2 by default)
    become 1 and every other entry becomes 0.

    The top entries are located by index instead of by a sentinel value, so scores
    that already equal 0 or 1 are not mistaken for selected labels.

    Parameters
    ----------
    predicted_labels_prob : 1-D numpy array of scores; modified in place
    top_k : how many of the highest scores to mark with 1 (default 2)

    Returns
    -------
    None -- the result is written into `predicted_labels_prob`.
    '''
    # rank on a float copy so the caller's array stays intact until the final write
    scores = np.array(predicted_labels_prob, dtype=float)

    top_indices = []
    for _ in range(min(top_k, scores.size)):
        # argmax returns the first maximum, so ties resolve to the lowest index
        best = int(np.argmax(scores))
        top_indices.append(best)
        # exclude the chosen entry from the next round
        scores[best] = -np.inf

    predicted_labels_prob[:] = 0
    for best in top_indices:
        predicted_labels_prob[best] = 1

In [16]:
def NewAccuracy(true_labels, predicted_labels):
    '''
    Accuracy based on Jaccard similarity: the value returned by
    jaccard_similarity_score for the two label sets, rounded to 2 decimals.
    '''
    similarity = jaccard_similarity_score(true_labels, predicted_labels)
    return round(similarity, 2)

In [17]:
def AverageAccuracy(true_labels, predicted_labels):
    '''
    Print the accuracy of *each classifier* (one per label column) followed by the
    mean accuracy over all classifiers.

    Both arguments are 2-D (samples x labels). Returns the string "Wrong Shape"
    when their shapes differ; otherwise prints the report and returns None.
    '''
    same_columns = true_labels.shape[1] == predicted_labels.shape[1]
    same_rows = true_labels.shape[0] == predicted_labels.shape[0]
    if not (same_columns and same_rows):
        return "Wrong Shape"

    n_samples, n_labels = true_labels.shape[0], true_labels.shape[1]

    accuracy_total = 0
    for label_idx in range(n_labels):
        # number of samples on which this label's prediction agrees with the truth
        n_correct = sum(
            1
            for sample_idx in range(n_samples)
            if true_labels[sample_idx][label_idx] == predicted_labels[sample_idx][label_idx]
        )
        label_accuracy = n_correct / n_samples
        print("Accuracy for classifier {} is {}".format(label_idx, label_accuracy))
        accuracy_total += label_accuracy

    print("Average accuracy for {} classifiers is {}".format(n_labels, accuracy_total / n_labels))

In [18]:
# accumulators for the per-book true and predicted tone vectors
true_tones, predicted_tones = [], []

In [120]:
# Predict the tones of every test book: sum the per-review probabilities and keep the
# highest-scoring tones via assignValues.
#
# The result lists are (re)created here so this cell can be re-run safely: appending to
# lists created only in another cell duplicated the results on a second run, and failed
# on `.append` once the lists had been converted to numpy arrays.
true_tones = []
predicted_tones = []

# number of tone columns, taken from the label matrix instead of a hard-coded constant
n_tones = df_test_label.shape[1]

# NOTE(review): the loaded classifier file is named "BinaryClassifierWithGenre" and the
# precision/recall cell feeds it tf-idf + genre features, whereas this loop passes the
# tf-idf vector alone -- confirm which feature layout the model expects.
count = 0
for index, row in df_test_corpus.iterrows():

    # for each book: df_test_label rows are aligned with df_test_corpus by position,
    # so the running counter is used instead of a hard-coded index offset
    true_tone = df_test_label[count]
    # reviews of one book are stored in a single string separated by "-----"
    reviews = row["content"].split("-----")

    # store predicted tones on reviews
    predict_tone = np.zeros(n_tones)

    # predict on each review
    for review in reviews:
        # clean the review text
        review_cleaned = stemming(removeStopWords(CleanText(review)))

        # generate the tfidf_vector
        tfidf_vector = tfidf_vectorizer.transform([review_cleaned])

        # predict in the classifier
        review_predict_tone = BinaryClassifier.predict_proba(tfidf_vector)

        # accumulate the probabilities of this review (row 0 of the densified result)
        predict_tone += review_predict_tone.toarray()[0]

    true_tones.append(true_tone)

    # no division by len(reviews): the ranking of the sums equals the ranking of the means
    # turn the accumulated scores into a 0/1 vector in place
    assignValues(predict_tone)

    predicted_tones.append(predict_tone)

    # progress report every 100 books
    if count % 100 == 0:
        print(count)

    count += 1

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800


In [121]:
# turn the accumulated per-book lists into numpy arrays for the metric functions
true_tones, predicted_tones = np.array(true_tones), np.array(predicted_tones)

In [161]:
# Hamming loss between the true and the predicted tone indicator matrices
hamming_loss(true_tones, predicted_tones) 

0.04678473941471466

### Accuracy of each classifier

In [162]:
# Print the accuracy of each tone column, followed by the mean accuracy over all columns
AverageAccuracy(true_tones,predicted_tones)

Accuracy for classifier 0 is 0.8710540395933655
Accuracy for classifier 1 is 0.9662921348314607
Accuracy for classifier 2 is 0.550561797752809
Accuracy for classifier 3 is 0.9892990904226859
Accuracy for classifier 4 is 0.9695024077046549
Accuracy for classifier 5 is 0.9951845906902087
Accuracy for classifier 6 is 0.9973247726056714
Accuracy for classifier 7 is 0.9785981808453719
Accuracy for classifier 8 is 0.9721776350989834
Accuracy for classifier 9 is 0.9721776350989834
Accuracy for classifier 10 is 0.9646869983948636
Accuracy for classifier 11 is 0.9844836811128946
Accuracy for classifier 12 is 0.9898341359015517
Accuracy for classifier 13 is 0.9646869983948636
Accuracy for classifier 14 is 0.9604066345639379
Accuracy for classifier 15 is 0.9978598180845372
Accuracy for classifier 16 is 0.994649545211343
Accuracy for classifier 17 is 0.9823434991974318
Accuracy for classifier 18 is 0.9818084537185661
Accuracy for classifier 19 is 0.9759229534510433
Accuracy for classifier 20 is 0.

### Precision and Recall

In [63]:
# Average precision@K and recall@K over all test books, using tf-idf + genre features.
#
# NOTE(review): for K = 3, recall@K <= 3 * precision@K whenever a book has at least one
# true label; the saved outputs of this cell did not satisfy that relation, so treat
# them as stale until the notebook is re-run top to bottom.
TOP_K = 3

avg_precision = 0
avg_recall = 0
count = 0

# number of tone columns, taken from the label matrix instead of a hard-coded constant
n_tones = df_test_label.shape[1]

for index, row in df_test_corpus.iterrows():

    # for each book: label and genre rows are aligned with df_test_corpus by position,
    # so the running counter is used instead of a hard-coded index offset
    true_tone = df_test_label[count]
    # reviews of one book are stored in a single string separated by "-----"
    reviews = row["content"].split("-----")
    genre = df_test_genre[count]

    # store predicted tones on reviews
    predict_tone = np.zeros(n_tones)

    # predict on each review
    for review in reviews:

        # clean the review text
        review_cleaned = stemming(removeStopWords(CleanText(review)))

        # generate the tfidf_vector
        tfidf_vector = tfidf_vectorizer.transform([review_cleaned])

        # convert tfidf vector to a dense 1-D numpy array
        tfidf_vector = tfidf_vector.toarray()[0]

        # append the book's genre indicators to the text features
        feature_vector = np.concatenate([tfidf_vector, genre])

        # reshape to a single-row matrix; the width follows from the inputs rather than
        # a hard-coded feature count, so a changed vocabulary or genre set still works
        feature_vector = feature_vector.reshape(1, -1)

        # predict in the classifier
        review_predict_tone = BinaryClassifier.predict_proba(feature_vector)

        # accumulate the probabilities of this review (row 0 of the densified result)
        predict_tone += review_predict_tone.toarray()[0]

    # mean probability per tone over the book's reviews
    predict_tone = predict_tone/len(reviews)

    avg_precision += getPrecision(true_tone, predict_tone, TOP_K)
    avg_recall += getRecall(true_tone, predict_tone, TOP_K)

    # progress report every 200 books
    if count % 200 == 0:
        print(count)
    count += 1

print(avg_precision/df_test_corpus.shape[0])
print(avg_recall/df_test_corpus.shape[0])

0
200
400
600
800
1000
1200
1400
1600
1800
0.130669995838538
0.8015694667380061
