In [2]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import loadtxt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score, roc_auc_score, hamming_loss, accuracy_score 
from skmultilearn.problem_transform import ClassifierChain
from scipy.sparse import csr_matrix, lil_matrix
%matplotlib inline

# read data
X_train = np.load('./data/X_train.npy')
X_test = np.load('./data/X_test.npy')
y_train = np.load('./data/y_train.npy')
y_test = np.load('./data/y_test.npy')

type_indicators = [ "IE: Introversion (I) / Extroversion (E)", "NS: Intuition (N) / Sensing (S)", 
                   "FT: Feeling (F) / Thinking (T)", "JP: Judging (J) / Perceiving (P)"  ]


b_Pers = {'I':0, 'E':1, 'N':0, 'S':1, 'F':0, 'T':1, 'J':0, 'P':1}
b_Pers_list = [{0:'I', 1:'E'}, {0:'N', 1:'S'}, {0:'F', 1:'T'}, {0:'J', 1:'P'}]

def translate_personality(personality):
    # transform mbti to binary vector
    
    return [b_Pers[l] for l in personality]

def translate_back(personality):
    # transform binary vector to mbti personality
    
    s = ""
    for i, l in enumerate(personality):
        s += b_Pers_list[i][l]
    return s



### Vectorize with count and tf-idf

Keep words appearing in 10% to 70 % of the posts.

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Posts to a matrix of token counts
cntizer = CountVectorizer(analyzer="word", 
                             max_features=1500, 
                             tokenizer=None,    
                             preprocessor=None, 
                             stop_words=None,  
                             max_df=0.7,
                             min_df=0.1) 

# Learn the vocabulary dictionary and return term-document matrix
print("CountVectorizer...")
X_train = cntizer.fit_transform(X_train)
X_test = cntizer.transform(X_test)
# Transform the count matrix to a normalized tf or tf-idf representation
tfizer = TfidfTransformer()

print("Tf-idf...")
# Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
X_train =  tfizer.fit_transform(X_train).toarray()
X_test =  tfizer.transform(X_test).toarray()
print("MBTI 1st row: %s" % translate_back(y_train[0,:]))
print("Y: Binarized MBTI 1st row: %s" % y_train[0,:])
feature_names = list(enumerate(cntizer.get_feature_names()))
feature_names[:10]

CountVectorizer...
Tf-idf...
MBTI 1st row: ISFP
Y: Binarized MBTI 1st row: [0 1 0 1]


[(0, 'ability'),
 (1, 'able'),
 (2, 'absolutely'),
 (3, 'accept'),
 (4, 'accurate'),
 (5, 'across'),
 (6, 'act'),
 (7, 'action'),
 (8, 'actual'),
 (9, 'actually')]

## Linear SVM - One vs Rest

In [20]:

from sklearn.svm import LinearSVC

classif = OneVsRestClassifier(LinearSVC())
classif.fit(X_train, y_train)
y = classif.predict(X_test)
print('accuracy_score', accuracy_score(y_test, y), '\n',
      'roc_auc_score', roc_auc_score(y_test, y), '\n',
      'hamming_loss', hamming_loss(y_test, y))

accuracy_score 0.3319884726224784 
 roc_auc_score 0.6007018964536487 
 hamming_loss 0.2404899135446686


## Chained Linear SVM

In [16]:

from sklearn.svm import LinearSVC

classif = ClassifierChain(LinearSVC())
classif.fit(X_train, y_train)
y = classif.predict(X_test)

print('accuracy_score', accuracy_score(y_test, y.toarray()), '\n',
      'roc_auc_score', roc_auc_score(y_test, y.toarray()), '\n',
      'hamming_loss', hamming_loss(y_test, y.toarray()))

accuracy_score 0.3285302593659942 
 roc_auc_score 0.6273825065533728 
 hamming_loss 0.2489913544668588


## SVM, Gaussian kernel - One vs Rest

In [4]:

from sklearn.svm import SVC

classif = OneVsRestClassifier(SVC(gamma='scale'))
classif.fit(X_train, y_train)
y = classif.predict(X_test)

print('accuracy_score', accuracy_score(y_test, y), '\n',
      'roc_auc_score', roc_auc_score(y_test, y), '\n',
      'hamming_loss', hamming_loss(y_test, y))

accuracy_score 0.2904899135446686 
 roc_auc_score 0.5665363295727185 
 hamming_loss 0.24927953890489912


## Chained SVM, Gaussian kernel

In [6]:

from sklearn.svm import SVC

classif = ClassifierChain(SVC(gamma='scale'))
classif.fit(X_train, y_train)
y = classif.predict(X_test)

print('accuracy_score', accuracy_score(y_test, y.toarray()), '\n',
      'roc_auc_score', roc_auc_score(y_test, y.toarray()), '\n',
      'hamming_loss', hamming_loss(y_test, y.toarray()))

accuracy_score 0.2881844380403458 
 roc_auc_score 0.5611140850486074 
 hamming_loss 0.2524495677233429


## MLKNN

In [7]:
from skmultilearn.adapt import MLkNN


# Note that this classifier can throw up errors when handling sparse matrices.
X_train_mlknn = lil_matrix(X_train).toarray()
y_train_mlknn = lil_matrix(y_train).toarray()
X_test_mlknn = lil_matrix(X_test).toarray()

classif = MLkNN()
classif.fit(X_train_mlknn, y_train_mlknn)


MLkNN(ignore_first_neighbours=0, k=10, s=1.0)

In [8]:
y = classif.predict(X_test_mlknn)
yp = classif.predict_proba(X_test_mlknn)
print('accuracy_score', accuracy_score(y_test, y.toarray()), '\n',
      'roc_auc_score', roc_auc_score(y_test, yp.toarray()), '\n',
      'hamming_loss', hamming_loss(y_test, y.toarray()))

accuracy_score 0.2622478386167147 
 roc_auc_score 0.5658352239225045 
 hamming_loss 0.28285302593659944


## Chained GaussianNB

In [9]:

from sklearn.naive_bayes import GaussianNB
# initialize classifier chains multi-label classifier
classif = ClassifierChain(GaussianNB())
classif.fit(X_train, y_train)

ClassifierChain(classifier=GaussianNB(priors=None, var_smoothing=1e-09),
        order=None, require_dense=[True, True])

In [10]:
y = classif.predict(X_test)
yp = classif.predict_proba(X_test)
print('accuracy_score', accuracy_score(y_test, y.toarray()), '\n',
      'roc_auc_score', roc_auc_score(y_test, yp.toarray()), '\n',
      'hamming_loss', hamming_loss(y_test, y.toarray()))

accuracy_score 0.20922190201729107 
 roc_auc_score 0.6912461019993441 
 hamming_loss 0.33645533141210376


## Chained BernoulliNB

In [11]:
from sklearn.naive_bayes import BernoulliNB

classif = ClassifierChain(BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True))
classif.fit(X_train, y_train)

ClassifierChain(classifier=BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True),
        order=None, require_dense=[True, True])

In [12]:
y = classif.predict(X_test)
yp = classif.predict_proba(X_test)
print('accuracy_score', accuracy_score(y_test, y.toarray()), '\n',
      'roc_auc_score', roc_auc_score(y_test, yp.toarray()), '\n',
      'hamming_loss', hamming_loss(y_test, y.toarray()))

accuracy_score 0.2904899135446686 
 roc_auc_score 0.7198294564753879 
 hamming_loss 0.25994236311239194


## Chained BaggedBernoulliNB

In [13]:
from sklearn.ensemble import BaggingClassifier

classif = ClassifierChain(BaggingClassifier(base_estimator=BernoulliNB()))
classif.fit(X_train, y_train)

ClassifierChain(classifier=BaggingClassifier(base_estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
        order=None, require_dense=[True, True])

In [14]:
y = classif.predict(X_test)
yp = classif.predict_proba(X_test)
print('accuracy_score', accuracy_score(y_test, y.toarray()), '\n',
      'roc_auc_score', roc_auc_score(y_test, yp.toarray()), '\n',
      'hamming_loss', hamming_loss(y_test, y.toarray()))

accuracy_score 0.2685878962536023 
 roc_auc_score 0.7004657857359968 
 hamming_loss 0.2889048991354467


## Chained Adaboost with Decision Trees

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
classif = ClassifierChain(AdaBoostClassifier(base_estimator=DecisionTreeClassifier()))
classif.fit(X_train, y_train)
y = classif.predict(X_test)
yp = classif.predict_proba(X_test)
print('accuracy_score', accuracy_score(y_test, y.toarray()), '\n',
      'roc_auc_score', roc_auc_score(y_test, yp.toarray()), '\n',
      'hamming_loss', hamming_loss(y_test, y.toarray()))

accuracy_score 0.1642651296829971 
 roc_auc_score 0.5412128136372868 
 hamming_loss 0.3612391930835735


## Chained LogisticRegression

In [20]:
from sklearn.linear_model import LogisticRegression

classif = ClassifierChain(LogisticRegression(solver='lbfgs'))
classif.fit(X_train, y_train)
y = classif.predict(X_test)
yp = classif.predict_proba(X_test)
print('accuracy_score', accuracy_score(y_test, y.toarray()), '\n',
      'roc_auc_score', roc_auc_score(y_test, yp.toarray()), '\n',
      'hamming_loss', hamming_loss(y_test, y.toarray()))

accuracy_score 0.3504322766570605 
 roc_auc_score 0.7446379886968562 
 hamming_loss 0.23631123919308358


## RandomForest

In [21]:
from sklearn.ensemble import RandomForestClassifier
classif = RandomForestClassifier(n_estimators=100, class_weight=None)
classif.fit(X_train, y_train)
y = classif.predict(X_test)


In [25]:
print('accuracy_score', accuracy_score(y_test, y), '\n',
      'roc_auc_score', roc_auc_score(y_test, y), '\n',
      'hamming_loss', hamming_loss(y_test, y))

accuracy_score 0.30086455331412104 
 roc_auc_score 0.6340715137209887 
 hamming_loss 0.2695965417867435


## FCNN (MLP)

In [23]:
from sklearn.neural_network import MLPClassifier

classif = MLPClassifier()
classif.fit(X_train, y_train)





MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [24]:
y = classif.predict(X_test)
yp = classif.predict_proba(X_test)
print('accuracy_score', accuracy_score(y_test, y), '\n',
      'roc_auc_score', roc_auc_score(y_test, yp), '\n',
      'hamming_loss', hamming_loss(y_test, y))

accuracy_score 0.30086455331412104 
 roc_auc_score 0.7124481967804894 
 hamming_loss 0.2695965417867435
