In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import loadtxt
from sklearn.metrics import roc_auc_score, roc_auc_score, hamming_loss, accuracy_score 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
%matplotlib inline

# Load the pre-split inputs and label arrays from the .npy files in ../data.
# The inputs are later fed to a TfidfVectorizer, so they are presumably raw text
# documents -- TODO confirm against the preprocessing notebook.
train, test = np.load('../data/X_train.npy'), np.load('../data/X_test.npy')
y_train, y_test = np.load('../data/y_train.npy'), np.load('../data/y_test.npy')

In [2]:
# Human-readable names of the four MBTI axes, in the same order as the
# per-position lookup tables in b_Pers_list below.
type_indicators = [
    "IE: Introversion (I) / Extroversion (E)",
    "NS: Intuition (N) / Sensing (S)",
    "FT: Feeling (F) / Thinking (T)",
    "JP: Judging (J) / Perceiving (P)",
]

# Letter -> bit. Within each axis the first letter maps to 0 and the second to 1.
b_Pers = {
    'I': 0, 'E': 1,
    'N': 0, 'S': 1,
    'F': 0, 'T': 1,
    'J': 0, 'P': 1,
}

# Bit -> letter, one table per axis position (inverse of b_Pers).
b_Pers_list = [
    {0: 'I', 1: 'E'},
    {0: 'N', 1: 'S'},
    {0: 'F', 1: 'T'},
    {0: 'J', 1: 'P'},
]

def translate_personality(personality):
    """Encode an MBTI type string as a list of bits.

    Every character of ``personality`` is looked up in the module-level
    ``b_Pers`` table, so ``"INFJ"`` becomes ``[0, 0, 0, 0]``. A character that
    is not a key of ``b_Pers`` raises ``KeyError``.
    """
    bits = []
    for letter in personality:
        bits.append(b_Pers[letter])
    return bits

def translate_back(personality):
    """Decode a binary vector into an MBTI type string.

    The value at position ``i`` is looked up in ``b_Pers_list[i]``, so
    ``[0, 0, 0, 0]`` becomes ``"INFJ"``. A value other than 0/1 raises
    ``KeyError``; a vector longer than ``b_Pers_list`` raises ``IndexError``.
    """
    return "".join(
        b_Pers_list[position][bit] for position, bit in enumerate(personality)
    )



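## NB-SVM: naive-Bayes log-count-ratio features with a linear classifier (logistic regression here), one model per label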

In [8]:
class NBSVM:
    """Multi-label NB-SVM style classifier.

    For every label column, the features are rescaled by that label's
    naive-Bayes log-count ratio and a binary ``LogisticRegression`` is fitted
    on the rescaled features. Each fitted model is stored together with its
    ratio vector as a ``(model, r)`` tuple in ``self.models``.

    Parameters
    ----------
    C : float, default=4.0
        Inverse regularisation strength passed to ``LogisticRegression``.
    random_state : int or None, default=None
        Seed passed to ``LogisticRegression``. The liblinear solver shuffles
        the data, so pass an int to make fits reproducible.

    Notes
    -----
    ``X`` must expose ``.multiply`` (e.g. a scipy sparse matrix such as the
    output of ``TfidfVectorizer``). Labels are assumed to be 0/1.
    """

    def __init__(self, C=4.0, random_state=None):
        self.C = C
        self.random_state = random_state
        self.models = []
        # Training data of the most recent fit(); _pr/_get_mdl read self.X.
        self.X = None
        self.Y = None

    def _pr(self, y_i, y):
        """Return the smoothed per-feature mean over rows where ``y == y_i``.

        Computes ``(column sums of the selected rows + 1) / (number of
        selected rows + 1)``; the +1 terms keep the later ratio and log finite.
        """
        mask = (y == y_i)
        p = self.X[mask].sum(0)
        return (p + 1) / (mask.sum() + 1)

    def _get_mdl(self, y):
        """Fit the model for a single label column.

        Returns
        -------
        (model, r) : the fitted ``LogisticRegression`` and the log-count
            ratio ``r`` that must also be applied to features at predict time.
        """
        r = np.log(self._pr(1, y) / self._pr(0, y))
        m = LogisticRegression(C=self.C, class_weight='balanced', dual=True,
                               solver='liblinear', random_state=self.random_state)
        x_nb = self.X.multiply(r)
        return m.fit(x_nb, y), r

    def _check_is_fitted(self):
        """Raise a descriptive error when predicting before ``fit``.

        ``AttributeError`` is used because that is the exception type the
        unfitted object raised before this check existed.
        """
        if self.Y is None:
            raise AttributeError(
                "This NBSVM instance is not fitted yet; call fit(X, Y) first."
            )

    def fit(self, X, Y):
        """Fit one model per column of ``Y``.

        Parameters
        ----------
        X : feature matrix supporting boolean row indexing, ``.sum(0)`` and
            ``.multiply`` (scipy sparse matrix).
        Y : array-like of 0/1 labels, shape (n_samples, n_labels). A 1-D
            vector is treated as a single label column.

        Returns
        -------
        self
        """
        Y = np.asarray(Y)
        if Y.ndim == 1:
            # Single-label convenience: treat the vector as one column.
            Y = Y.reshape(-1, 1)

        self.models = []
        self.X = X
        self.Y = Y

        for j in range(Y.shape[1]):
            self.models.append(self._get_mdl(Y[:, j]))

        return self

    def predict_probas(self, X):
        """Return an (n_samples, n_labels) array of per-label probabilities.

        Column ``i`` holds column 1 of ``predict_proba`` of the i-th model,
        i.e. the probability of the larger class label (1 for 0/1 labels).
        """
        self._check_is_fitted()
        probas = np.zeros((X.shape[0], len(self.models)))
        for i, (model, r) in enumerate(self.models):
            probas[:, i] = model.predict_proba(X.multiply(r))[:, 1]
        return probas

    def predict(self, X):
        """Return an (n_samples, n_labels) float array of predicted labels."""
        self._check_is_fitted()
        predicts = np.zeros((X.shape[0], len(self.models)))
        for i, (model, r) in enumerate(self.models):
            predicts[:, i] = model.predict(X.multiply(r))
        return predicts

In [30]:
# Uni- and bi-gram TF-IDF features. Terms seen in fewer than 10 documents or in
# more than 90% of documents are dropped. The on/off options are passed as real
# booleans: recent scikit-learn releases validate parameter types and reject
# the integer 1 for these flags.
vec = TfidfVectorizer(ngram_range=(1, 2),
                      min_df=10, max_df=0.9, use_idf=True,
                      smooth_idf=True, sublinear_tf=True)

# Learn the vocabulary and idf weights on the training documents only, then
# apply the same fitted transformation to the test documents.
X_train = vec.fit_transform(train)
X_test = vec.transform(test)


In [31]:
# Train one model per label column of y_train on the TF-IDF features.
classif = NBSVM()
classif.fit(X=X_train, Y=y_train)


In [32]:
# y: hard label predictions, one column per label.
# yp: per-label probabilities taken from each model's predict_proba column 1.
y = classif.predict(X=X_test)
yp = classif.predict_probas(X=X_test)

In [33]:
# Report the evaluation metrics on the test split.
# NOTE(review): assuming y_test is a 2-D 0/1 indicator array like y_train,
# accuracy_score is exact-match (all labels correct) accuracy, which is why it
# is much lower than 1 - hamming_loss -- confirm against the sklearn docs.
print(
    'accuracy_score', accuracy_score(y_true=y_test, y_pred=y), '\n',
    'roc_auc_score', roc_auc_score(y_true=y_test, y_score=yp), '\n',
    'hamming_loss', hamming_loss(y_true=y_test, y_pred=y),
    sep=' ',
)

accuracy_score 0.37060518731988473 
 roc_auc_score 0.7651928012590137 
 hamming_loss 0.22146974063400576
