## Experiments with ClassifierChain strategy & MLkNN

### Result summary:
#### 1. Preprocess: punct removal, lowercase, tokenization
#### 2. FE: Tf-idf & Tf-idf + bow -> Tf-idf + BoW
#### 3. Classifier: NB, SVM, MLP, XGB, MLkNN -> XGB
### Best: Tf-idf + BoW & XGB

In [52]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

In [37]:
data = pd.read_csv('data_multilabels_onehot.csv')
data = data.drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,review,ac_P1,air_minum_P1,air_panas_P1,airy_internal,bau_P1,bising,breakfast,fasilitas,fasilitas_inroom,...,rusak,serangga,service,service_front_office,service_house_keeping,service_security,sunrise_meal_P1,tidak_sesuai_pesanan,tv_P1,wifi_P1
0,"Lazy front office girl, no one was there to he...",0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,suka banget tapi sayang kemarin dapat kamar ya...,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,kamar saya tidak dapat sabun dan cemilan airy ☹,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,di tengah pusat keramaian... staf hotel yang r...,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,closet nya rusak kamar mandi tidak bersih lema...,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [38]:
data.columns

Index(['review', 'ac_P1', 'air_minum_P1', 'air_panas_P1', 'airy_internal',
       'bau_P1', 'bising', 'breakfast', 'fasilitas', 'fasilitas_inroom',
       'fasilitas_non_inroom', 'fisik_bangunan', 'handuk_P1', 'kebersihan',
       'kebersihan_inroom_P1', 'kebersihan_non_inroom', 'linen_P1', 'lokasi',
       'marketing_false', 'overall', 'perlengkapan_mandi_P1', 'rusak',
       'serangga', 'service', 'service_front_office', 'service_house_keeping',
       'service_security', 'sunrise_meal_P1', 'tidak_sesuai_pesanan', 'tv_P1',
       'wifi_P1'],
      dtype='object')

In [39]:
train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)

In [40]:
train_text = train.review
test_text = test.review

In [41]:
vectorizer = TfidfVectorizer()
vectorizer.fit(train_text)
vectorizer.fit(test_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [42]:
x_train = vectorizer.transform(train_text)
x_test = vectorizer.transform(test_text)
y_train = train.drop(columns=['review'], axis=1)
y_test = test.drop(columns=['review'], axis=1)

In [43]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [44]:
clf = ClassifierChain(GaussianNB())
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.10898816509541435


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [45]:
clf = ClassifierChain(LinearSVC())
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.13517388087310397


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [46]:
clf = ClassifierChain(MLPClassifier())
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)



0.17416142557651992


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [47]:
from xgboost import XGBClassifier
clf = ClassifierChain(XGBClassifier())
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.2806994974571936


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [48]:
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

classifier_new = MLkNN(k=1)
# Note that this classifier can throw up errors when handling sparse matrices.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()

# train
classifier_new.fit(x_train, y_train)
# predict
y_pred = classifier_new.predict(x_test)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.2626533623057952


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [49]:
def tokenize(msg):
    clean = [char for char in msg if char not in string.punctuation]
    clean = ''.join(clean)
    return clean.lower().split()

In [50]:
x_train, x_test, y_train, y_test = \
train_test_split(data.review, data.drop(columns=['review'], axis=1), test_size=0.2, random_state=42)

print(len(x_train), len(x_test), len(x_train) + len(x_test))

159 40 199


In [64]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [86]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=tokenize)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('xgb', ClassifierChain(XGBClassifier())),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [87]:
pipeline.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function tokenize at 0x000001CF6810CAE8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=No...ht=1, seed=None,
       silent=True, subsample=1),
        order=None, require_dense=[True, True]))])

In [85]:
y_pred = pipeline.predict(x_test)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.3349659863945578


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
