# Capstone: MBTI

## What can be learned from this?     
    Lightning talk input:
    
    Calum clustering based on something other than type
    Hernan can you categorise the messages into topics and look at it that way.

In [23]:
import pandas as pd

import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, Binarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('display.max_colwidth',1000, 'display.max_columns',1000)

In [2]:
mbti = pd.read_csv('mbti.csv')

In [32]:
print('MBTI Shape:', mbti.shape)
mbti.tail(1)

MBTI Shape: (8675, 2)


Unnamed: 0,type,posts
8674,INFP,"'It has been too long since I have been on personalitycafe - although it doesn't seem to have changed one bit - but I must say it is good to be back somewhere like this. Usually I turn to Doctor Who...|||http://www.youtube.com/watch?v=6EEW-9NDM5k|||Overwhelmed by the world around me.|||In one dream I have had I was being chased by a large shadowy creature, with someone else who I felt I had to save above all else. The dream ended after she reached safety, but as for what happened...|||Well now My Avatar is a Doctor Who Clockwork Creature. I always liked this monster because It is just a worker trying to do it's job, kind of :3|||1st - Thanks for your reply, I appreciate all the help I can get. 2nd - I think everyone has the right to their opinion :) (however too many people abuse that right :P )|||Yea, Not Doing the Iron Man Thing xD Thanks for all the advice everyone :)|||Thanks :) I think I needed some humour. I might show them this (maybe). I know they won't do anything hars..."


In [36]:
print('Baseline:',round(mbti.type.value_counts(normalize=True).max(),2))

Baseline = 0.21


In [99]:
stop_words_list = [item.lower() for item in mbti.type.unique()]+[str(item.lower())+'s' for item in mbti.type.unique()]

tvec = TfidfVectorizer(max_features=500,
                       analyzer='word',
                       stop_words=stop_words_list)
tvec.fit(mbti['posts'])

X = pd.DataFrame(tvec.transform(mbti['posts']).toarray(), columns=tvec.get_feature_names())
y = mbti['type']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=10000, n_jobs=2)
print('Cross Val Score:',cross_val_score(lr,X_train,y_train,cv=5).mean())

Cross Val Score: 0.284873529805002


In [100]:
lr.fit(X_train,y_train)
lr.classes_

array(['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
       'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'],
      dtype=object)

In [101]:
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test,lr.predict(X_test)),
                   columns=lr.classes_, index=lr.classes_))
print()
print('Classification Report:')
print(classification_report(y_test,lr.predict(X_test)))

Confusion Matrix:
      ENFJ  ENFP  ENTJ  ENTP  ESFJ  ESFP  ESTJ  ESTP  INFJ  INFP  INTJ  INTP  \
ENFJ     2     2     3     0     0     0     0     0     5     5     1     1   
ENFP     1    15     0     6     0     0     1     2     6    23     1     7   
ENTJ     0     0     2     2     1     0     0     0     3     3     7     3   
ENTP     2     4     2    14     0     0     0     3     4     9    12    13   
ESFJ     0     0     0     0     0     0     0     0     2     0     1     0   
ESFP     0     0     0     0     0     0     0     1     1     3     0     0   
ESTJ     0     0     1     0     0     0     0     0     2     0     1     0   
ESTP     0     0     1     1     0     0     1     0     0     1     1     3   
INFJ     3     5     2     7     0     0     0     2    47    47     9    16   
INFP     2    11     1     3     1     1     0     1    23    98     9    20   
INTJ     2     1     6     3     1     0     1     1    15    12    30    25   
INTP     0     2     3

In [102]:
lr.coef_.shape

(16, 500)

In [103]:
pd.DataFrame(lr.coef_,columns=tvec.get_feature_names(), index=lr.classes_).T.sort_values('INFP',ascending=False)

Unnamed: 0,ENFJ,ENFP,ENTJ,ENTP,ESFJ,ESFP,ESTJ,ESTP,INFJ,INFP,INTJ,INTP,ISFJ,ISFP,ISTJ,ISTP
fi,-0.574158,0.219031,-0.184508,-0.152559,-0.296325,-0.237296,-0.025024,0.135871,-0.279813,0.520094,0.137440,-0.189850,-0.062488,0.441651,-0.118783,-0.508759
feel,0.075060,0.071807,-0.170197,-0.399187,0.160179,0.312620,-0.421503,-0.676867,0.113979,0.244136,-0.232461,-0.116624,0.452105,0.019634,-0.295487,-0.417228
world,-0.281747,0.049377,-0.107775,-0.130672,-0.371496,-0.223758,-0.081777,-0.580454,0.012679,0.232759,-0.024019,-0.010527,-0.481198,-0.339244,-0.787115,0.035800
youtube,-0.567556,0.523791,-0.164868,0.260487,-0.054228,-0.244924,-0.105374,0.080739,0.132351,0.215808,0.078920,-0.460397,-0.868275,0.066842,0.023113,0.035093
jpg,-0.098586,-0.181882,0.228255,0.124453,-0.030212,0.004050,0.135198,0.729764,0.042483,0.153444,-0.013116,0.057363,-0.121778,-0.212693,0.135094,-0.200011
com,0.073938,0.494957,-0.091866,0.219023,-0.060903,0.468160,0.185383,0.096444,-0.367229,0.153204,0.053644,-0.073043,-0.190895,0.467289,-0.272456,0.195924
think,-0.010007,-0.076762,-0.095864,0.022501,0.193320,0.206814,0.206095,0.118286,0.037342,0.143689,-0.035148,-0.019270,-0.016852,-0.032396,-0.464969,-0.131545
love,0.190135,0.184463,-0.235631,-0.109891,0.390251,-0.230373,0.024564,-0.528243,-0.051942,0.136734,-0.188165,-0.106622,0.193088,0.052025,-0.578522,-0.299020
te,-0.612778,0.131519,0.748337,-0.225428,-0.478599,0.491870,-0.025163,0.082614,-0.300257,0.129003,0.168610,-0.041697,-0.381506,0.031433,0.296733,-0.056291
writing,-0.304497,0.072363,-0.057765,-0.042971,-0.030943,0.147853,-0.027014,-0.340747,0.049940,0.128419,-0.070975,-0.008479,-1.040741,0.025450,0.079615,-0.310492


In [8]:
raw_test = pd.read_csv('raw_test.csv')
raw_test.shape

(49159, 1)