# MBTI Project

## By The Introverts 
1. Nan Lin
2. Zack Pan
3. Ben Khuong
4. Tomohiko Ishihara
5. Donya Fozoonmayeh

In [17]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

## Load Data

#### TFIDF Feature Set: 

In [34]:
# Read the preprocessed MBTI posts from disk and peek at the last rows.
PROCESSED_POSTS_CSV = 'data/mbti_preprocessed_1.csv'
mbti_processed = pd.read_csv(PROCESSED_POSTS_CSV)
mbti_processed.tail()

Unnamed: 0,type,posts,processed_posts
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,ixfp always think cat fi doms reason esp...
8671,ENFP,'So...if this thread already exists someplace ...,thread already exists someplace else he...
8672,INTP,'So many questions when i do these things. I ...,many question thing would take purple pill ...
8673,INFP,'I am very conflicted right now when it comes ...,conflicted right come wanting child honestl...
8674,INFP,'It has been too long since I have been on per...,long since personalitycafe although seem ch...


In [35]:
# Extract TFIDF scores from mbti_processed
# Keep at most 1000 terms; terms found in more than 80% of the documents or
# in fewer than 25 documents are dropped from the vocabulary.
vectorizer = TfidfVectorizer(max_df=.8, min_df=25, max_features=1000)
# read_csv loads an empty text field as NaN, which the vectorizer cannot
# tokenize, so treat a missing post as an empty document.
documents = mbti_processed['processed_posts'].fillna('')
# NOTE: despite its name, `word_count` holds the sparse TF-IDF matrix,
# not raw term counts.
word_count = vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
mbti_tfidf = pd.DataFrame(data=word_count.toarray(),
                          columns=feature_names)
mbti_tfidf.head()

Unnamed: 0,ability,able,absolutely,accept,according,accurate,across,act,action,activity,...,ya,yeah,year,yep,yes,yesterday,yet,young,younger,youtube
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.101365,0.0,0.038709,0.095095,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.04482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.029307,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.138606,0.050254,0.120944,0.0,0.0,0.072526,0.0,0.0,0.0,0.0,...,0.0,0.0,0.164299,0.0,0.068313,0.0,0.0,0.0,0.0,0.0
3,0.0,0.08156,0.0,0.0,0.0,0.0,0.0,0.0,0.106206,0.0,...,0.0,0.03349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.032436,0.0,0.067431,0.0,0.0,0.0,0.0,0.0


#### Encoding Types as Binary Labels:

In [36]:
# Break the four-letter type code into one column per axis; analysing each
# axis on its own may be easier than working with all 16 types at once.
for position, axis_column in enumerate(['EorI', 'NorS', 'TorF', 'JorP']):
    mbti_processed[axis_column] = mbti_processed['type'].apply(
        lambda code, position=position: code[position])

In [38]:
# A type consists of 4 capitals and each capital corresponds to 2 possible characteristics.
# For later encoding and modeling, let's transfer them into separate binary codes.
type_map = {'I':0, 'E':1, 'N':0, 'S':1, 'F':0, 'T':1, 'J':0, 'P':1}

def type_preprocess(df, columns=None):
    """Replace the letters in the type-axis columns with their 0/1 codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one single-letter column per MBTI axis. It is modified
        in place.
    columns : list of column labels, optional
        Columns to encode. When omitted, the columns at positions 3-6 are
        used, as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the selected columns encoded through
        ``type_map``. Letters missing from ``type_map`` become NaN.

    Raises
    ------
    IndexError
        If ``columns`` is omitted and ``df`` has fewer than 7 columns.
    """
    if columns is None:
        # transfer column 3-6 into binary code.
        columns = [df.columns[i] for i in range(3, 7)]
    encoded_values = set(type_map.values())
    for label in columns:
        column = df[label]
        # Already holds only 0/1 codes (e.g. the cell was run twice): mapping
        # again would turn every label into NaN, so leave it untouched.
        if column.dropna().isin(encoded_values).all():
            continue
        # Assign by label rather than `df.iloc[:, i] = ...`: on pandas >= 2.0
        # the positional form writes into the existing object column, so the
        # labels would keep dtype `object` instead of becoming integers.
        df[label] = column.map(type_map)
    return df

In [39]:
# Replace the letter in each of the four axis columns with its 0/1 code from type_map.
mbti_processed = type_preprocess(mbti_processed)

In [40]:
# Preview the first rows to check the encoded EorI / NorS / TorF / JorP columns.
mbti_processed.head()

Unnamed: 0,type,posts,processed_posts,EorI,NorS,TorF,JorP
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,moment sportscenter top ten play pr...,0,0,0,0
1,ENTP,'I'm finding the lack of me in these posts ver...,finding lack post alarming sex boring positi...,1,0,1,1
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one course say know blessing...,0,0,1,1
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed conversation day esoteric ...,0,0,1,0
4,ENTJ,'You're fired.|||That's another silly misconce...,fired another silly misconception approachi...,1,0,1,0


## Fit scikit-learn model

In [46]:
from numpy import mean
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [60]:
def base_model(model, X, target, nsplits=4):
    """Evaluate ``model`` separately on every label column of ``target``.

    For each column, the model is re-fitted on ``nsplits`` stratified shuffle
    splits of the data and scored on the held-out part of each split. The
    weighted F1-score and the accuracy, averaged over the splits, are printed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance is
        re-fitted on every split.
    X : array
        Feature matrix that can be indexed with an array of row positions
        (e.g. a numpy array).
    target : pandas.DataFrame
        One label column per classification task, with one row per row of ``X``.
    nsplits : int, default 4
        Number of stratified shuffle splits per column.

    Returns
    -------
    None
        Results are only printed.
    """
    splitter = StratifiedShuffleSplit(n_splits=nsplits, random_state=420)

    types = {'EorI':'Extroversion vs. Introversion', 'NorS': 'Intuition vs. Sensing',
                 'TorF': 'Thinking vs. Feeling','JorP': 'Judging vs. Perceiving'}

    for col in target.columns:
        # Fall back to the raw column name for targets outside the four MBTI axes.
        print(f"This is the training for {types.get(col, col)}:")
        # The splitter yields row *positions*. Indexing a Series with them is
        # label-based and breaks when the index is not 0..n-1, so work on the
        # underlying array instead.
        y = target[col].to_numpy()
        all_fscores = []
        all_accuracies = []
        for train, test in splitter.split(X, y):
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            all_fscores.append(f1_score(y_test, preds, average='weighted'))
            # Ground truth first, predictions second, matching f1_score above.
            all_accuracies.append(accuracy_score(y_test, preds))
        print(f'Average F1-score: {mean(all_fscores)}; Average Accuracy: {mean(all_accuracies)}')

In [61]:
# initialize
MNB = MultinomialNB()
# Select the four binary label columns by name instead of by position, so
# the selection keeps working if columns are added or reordered upstream.
target = mbti_processed[['EorI', 'NorS', 'TorF', 'JorP']]
# Plain array of TF-IDF features; base_model indexes it by row position.
X = mbti_tfidf.to_numpy()

## Evaluation Metric

In [62]:
# Score the Multinomial Naive Bayes baseline on each MBTI axis, averaging over 5 stratified shuffle splits.
base_model(MNB, X, target, nsplits=5)

This is the training for Extroversion vs. Introversion:
Average F1-score: 0.6693788402457758; Average Accuracy: 0.7695852534562212
This is the training for Intuition vs. Sensing:
Average F1-score: 0.7977597298900396; Average Accuracy: 0.8617511520737328
This is the training for Thinking vs. Feeling:
Average F1-score: 0.7314441999082126; Average Accuracy: 0.7364055299539171
This is the training for Judging vs. Perceiving:
Average F1-score: 0.4998581845798852; Average Accuracy: 0.6195852534562212
