## Import Library

In [1]:
import re
import nltk
import string
import codecs
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, confusion_matrix

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

warnings.simplefilter(action="ignore", category=FutureWarning)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Preparation

Menyiapkan fungsi-fungsi yang dibutuhkan untuk melakukan pemrosesan seperti pembangkit index embedding, matriks embedding, hingga preprocessing teks. 

In [2]:
lemmatizer = WordNetLemmatizer()

def label_encoding(label):
    if(label == 'fact-based'):
        return 0
    elif(label == 'feeling-based'):
        return 1
    else:
        return 2
    
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    
def lemmatize_sentence(sentence):

    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatized_sentence.append(word)
        else:        
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

def sentence_pos_tag(sentence):
    text = word_tokenize(sentence)
    pos_tag = nltk.pos_tag(text)
    pos_tag_res = ''
    for i in range(len(pos_tag)):
        pos_tag_res += pos_tag[i][1]
        pos_tag_res += ' ' if i != len(sentence)-1 else '' 
    return pos_tag_res

def preprocessing(text):
    text = text.lower()
    text = text.strip()
    text = re.sub(r" \d+ ", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"[^a-z ]", "", text)
    text = re.sub(r"  ", " ", text)
    text = lemmatize_sentence(text)
    return text

In [3]:
import pickle

file = open("../input/quote-response/quotexresponseprocessed.pkl", "rb")
df = pickle.load(file)
file.close()

In [4]:
df = df[df['emotion_fact'] != 'unsure']
df = df.loc[:, ['presented_quote', 'presented_response', 'emotion_fact', 'presented_response_tag']]
df

Unnamed: 0,presented_quote,presented_response,emotion_fact,presented_response_tag
0,i get a good idea however they do tend to stay...,by your own admission you havent hang out with...,feeling-based,IN PRP$ JJ NN PRP VBP VB RP IN NN IN DT NN CC ...
2,one of the big argument against gun control be...,not quite to be more correct regard government...,fact-based,RB RB TO VB RBR JJ JJ NN CC VB DT NN TO VB NN ...
4,there be some incedents that be beyond your co...,well yes,feeling-based,RB RB
6,legality do not matter religous implication do...,exact to the point amp beautiful,feeling-based,NN TO DT NN NN NN
8,once again you seem to support the killing of ...,base on the idea that people be dispensible pa...,feeling-based,NN IN DT NN WDT NNS VB JJ RB IN PRP VBP PRP$ V...
...,...,...,...,...
9977,the id movement form of id state that there be...,that of course be the logical fallacy know as ...,fact-based,DT IN NN VB DT JJ NN VBP IN JJ NN DT NN PRP VB...
9978,for me it would therefore have make no differe...,it logically follow from the moral foundation ...,feeling-based,PRP RB VBP IN DT JJ NN VBN RP CC PRP VBP TO JJ...
9979,good thing this argument have never be doneoh ...,and teen sex doesnt by the very nature of its ...,feeling-based,CC JJ NN NN IN DT JJ NN IN PRP$ NN NN IN NN DT...
9980,i know one thing anything that happen politica...,wasnt sinjin crow about his plan to take the f...,feeling-based,NN NN NN IN PRP$ NN TO VB DT NN NN CC NN TO DT...


In [5]:
# df['presented_quote'] = df['presented_quote'].apply(lambda x: preprocessing(x))
# df['presented_response'] = df['presented_response'].apply(lambda x: preprocessing(x))
df['emotion_fact'] = df['emotion_fact'].apply(lambda x: label_encoding(x))

In [6]:
from sklearn.model_selection import train_test_split

X = df.loc[:, ['presented_quote', 'presented_response', 'presented_response_tag']]
y = df.loc[:, ['emotion_fact']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=20)

In [7]:
X_train_quotes = X_train['presented_response'].values
X_train_responses = X_train['presented_response'].values
X_train_responses_tag = X_train['presented_response_tag'].values

X_test_quotes = X_test['presented_response'].values
X_test_responses = X_test['presented_response'].values
X_test_responses_tag = X_test['presented_response_tag'].values

y_train = y_train['emotion_fact'].values
y_test = y_test['emotion_fact'].values

X_train_text = X_train_quotes + X_train_responses + X_train_responses_tag

In [8]:
print(X_train_quotes.shape)
print(X_train_responses.shape)
print(X_test_quotes.shape)
print(X_test_responses.shape)

(4972,)
(4972,)
(878,)
(878,)


In [9]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [10]:
vectorizer = TfidfVectorizer(max_features=3000)

vectorizer.fit(X_train_text)

X_train_quotes_ = vectorizer.transform(X_train_quotes)
X_train_responses_ = vectorizer.transform(X_train_responses)
X_train_responses_tag_ = vectorizer.transform(X_train_responses_tag)

X_test_quotes_ = vectorizer.transform(X_test_quotes)
X_test_responses_ = vectorizer.transform(X_test_responses)
X_test_responses_tag_ = vectorizer.transform(X_test_responses_tag)

In [11]:
print(X_train_responses_tag_[0])

  (0, 2831)	0.10728743572536519
  (0, 2830)	0.3919208894401395
  (0, 2826)	0.19757641622087066
  (0, 2706)	0.06719069394572035
  (0, 2158)	0.11069042500500868
  (0, 2106)	0.21572161635528578
  (0, 1759)	0.2499519930959555
  (0, 1757)	0.5554732613352785
  (0, 1428)	0.2528894682031386
  (0, 1314)	0.24827234782857924
  (0, 913)	0.15056369450292253
  (0, 785)	0.45695817075416467
  (0, 379)	0.06670252130747925


In [12]:
clf = MultinomialNB().fit(X_train_responses_, y_train)
predicted = clf.predict(X_test_responses_)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.73      0.89      0.80       524
           1       0.75      0.52      0.61       354

    accuracy                           0.74       878
   macro avg       0.74      0.70      0.71       878
weighted avg       0.74      0.74      0.73       878



In [13]:
clf = XGBClassifier().fit(X_train_responses_, y_train)
predicted = clf.predict(X_test_responses_)
print(classification_report(y_test, predicted))





              precision    recall  f1-score   support

           0       0.78      0.80      0.79       524
           1       0.69      0.66      0.67       354

    accuracy                           0.74       878
   macro avg       0.73      0.73      0.73       878
weighted avg       0.74      0.74      0.74       878



In [14]:
clf = SVC().fit(X_train_responses_, y_train)
predicted = clf.predict(X_test_responses_)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80       524
           1       0.71      0.63      0.67       354

    accuracy                           0.75       878
   macro avg       0.74      0.73      0.73       878
weighted avg       0.75      0.75      0.75       878



In [15]:
clf = RandomForestClassifier().fit(X_train_responses_, y_train)
predicted = clf.predict(X_test_responses_)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.74      0.85      0.79       524
           1       0.71      0.55      0.62       354

    accuracy                           0.73       878
   macro avg       0.72      0.70      0.71       878
weighted avg       0.73      0.73      0.72       878



In [16]:
clf = AdaBoostClassifier().fit(X_train_responses_, y_train)
predicted = clf.predict(X_test_responses_)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.73      0.75      0.74       524
           1       0.61      0.59      0.60       354

    accuracy                           0.69       878
   macro avg       0.67      0.67      0.67       878
weighted avg       0.68      0.69      0.68       878

