In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.misc import imread
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords 

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
from bs4 import BeautifulSoup

from sklearn import metrics

from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

py.init_notebook_mode(connected=True)

In [4]:
stemmed = pd.read_csv('stemmed.csv')

In [49]:
len(stemmed)

8675

In [77]:
text = stemmed['clean_posts']
code = stemmed['encode']
fixed_text = text[pd.notnull(text)]
fixed_code = code[pd.notnull(text)]
data = pd.DataFrame({'text': fixed_text, 'code': fixed_code})
len(data)

8651

In [78]:
data.to_csv("final.csv", encoding='utf-8', index=False)

## Try Naive Bayes without cross validation 

In [57]:
train, test =  train_test_split(data, test_size=0.2, random_state=14)

In [58]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train.text)
X_train_counts.shape

(6920, 92302)

In [73]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(6920, 92302)

In [74]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, train.code)

In [75]:
train_predicted = clf.predict(X_train_tfidf)
np.mean(train_predicted == train.code) 

0.21343930635838151

In [76]:
X_test_counts = count_vect.transform(test.text)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
test_predicted = clf.predict(X_test_tfidf)
np.mean(test_predicted == test.code)  

0.21143847487001732

## Naive Bayes with Cross Validation

In [5]:
data = pd.read_csv('final.csv')

In [6]:
data.head()

Unnamed: 0,code,text
0,8,'<URL> <URL> enfp and intj moments <URL> spo...
1,3,'I'm finding the lack of me in these posts ver...
2,11,"'Good one _____ <URL> Of course, to which I..."
3,10,"'Dear INTP, I enjoyed our conversation the o..."
4,2,'You're fired. That's another silly misconcept...


In [7]:
# train test split
X_train, X_test, y_train, y_test= train_test_split(data.text, data.code, test_size=0.2, random_state=14)

In [8]:
np.random.seed(1)
scoring = {'acc': 'accuracy',
           'neg_log_loss': 'neg_log_loss',
           'f1_micro': 'f1_micro'}
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
tfidf2 = CountVectorizer(ngram_range=(1, 1), 
                         stop_words='english',
                         lowercase = True, 
                         max_features = 5000)
model_nb = Pipeline([('tfidf1', tfidf2), ('nb', MultinomialNB())])
results_nb = cross_validate(model_nb, X_train, y_train, cv=kfolds, scoring=scoring, n_jobs=-1)

In [9]:
print("CV Accuracy: {:0.4f} (+/- {:0.4f})".format(np.mean(results_nb['test_acc']),
                                                          np.std(results_nb['test_acc'])))
print("CV F1: {:0.4f} (+/- {:0.4f})".format(np.mean(results_nb['test_f1_micro']),
                                                          np.std(results_nb['test_f1_micro'])))
print("CV Logloss: {:0.4f} (+/- {:0.4f})".format(np.mean(-1*results_nb['test_neg_log_loss']),
                                                          np.std(-1*results_nb['test_neg_log_loss'])))

CV Accuracy: 0.5424 (+/- 0.0073)
CV F1: 0.5424 (+/- 0.0073)
CV Logloss: 6.3304 (+/- 0.1587)


In [10]:
model_nb.fit(X_train, y_train)  
test_predicted = model_nb.predict(X_test)

In [101]:
unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
       'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']

In [102]:
print(metrics.classification_report(y_test, test_predicted, target_names=unique_type_list))

             precision    recall  f1-score   support

       INFJ       0.55      0.26      0.35        47
       ENTP       0.42      0.52      0.46       130
       INTP       0.46      0.49      0.47        45
       INTJ       0.49      0.49      0.49       169
       ENTJ       0.33      0.17      0.22         6
       ENFJ       0.00      0.00      0.00        11
       INFP       0.00      0.00      0.00         9
       ENFP       0.29      0.08      0.12        26
       ISFP       0.60      0.51      0.55       276
       ISTP       0.58      0.69      0.63       367
       ISFJ       0.58      0.57      0.57       223
       ISTJ       0.61      0.66      0.63       249
       ESTP       0.61      0.45      0.52        31
       ESFP       0.22      0.21      0.21        43
       ESTJ       0.67      0.38      0.48        42
       ESFJ       0.43      0.51      0.46        57

avg / total       0.54      0.54      0.54      1731



## Logistic Regression with Cross Validation

In [11]:
model_lr = Pipeline([('tfidf1', tfidf2), ('lr', LogisticRegression(class_weight="balanced", C=0.005))])
results_lr = cross_validate(model_lr, X_train, y_train, cv=kfolds, scoring=scoring, n_jobs=-1)

In [12]:
print("CV Accuracy: {:0.4f} (+/- {:0.4f})".format(np.mean(results_lr['test_acc']),
                                                          np.std(results_lr['test_acc'])))
print("CV F1: {:0.4f} (+/- {:0.4f})".format(np.mean(results_lr['test_f1_micro']),
                                                          np.std(results_lr['test_f1_micro'])))
print("CV Logloss: {:0.4f} (+/- {:0.4f})".format(np.mean(-1*results_lr['test_neg_log_loss']),
                                                          np.std(-1*results_lr['test_neg_log_loss'])))

CV Accuracy: 0.6568 (+/- 0.0036)
CV F1: 0.6568 (+/- 0.0036)
CV Logloss: 1.2993 (+/- 0.0109)


In [15]:
model_lr.fit(X_train, y_train)  
test_predicted_lr = model_lr.predict(X_test)

In [109]:
print(metrics.classification_report(y_test, test_predicted_lr, target_names=unique_type_list))

             precision    recall  f1-score   support

       INFJ       0.51      0.40      0.45        47
       ENTP       0.60      0.61      0.61       130
       INTP       0.42      0.56      0.48        45
       INTJ       0.68      0.62      0.65       169
       ENTJ       0.22      0.33      0.27         6
       ENFJ       0.08      0.09      0.09        11
       INFP       0.67      0.44      0.53         9
       ENFP       0.67      0.38      0.49        26
       ISFP       0.69      0.61      0.65       276
       ISTP       0.73      0.75      0.74       367
       ISFJ       0.68      0.67      0.68       223
       ISTJ       0.66      0.70      0.68       249
       ESTP       0.46      0.61      0.53        31
       ESFP       0.42      0.44      0.43        43
       ESTJ       0.63      0.62      0.63        42
       ESFJ       0.49      0.65      0.56        57

avg / total       0.65      0.64      0.64      1731



## Tree based methods
http://scikit-learn.org/stable/modules/generated/sklearn.tree.ExtraTreeClassifier.html

In [16]:
etc = ExtraTreesClassifier(n_estimators = 20, max_depth=4, n_jobs = -1)
tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
tsvd = TruncatedSVD(n_components=10)
model_etc = Pipeline([('tfidf1', tfidf), ('tsvd1', tsvd), ('etc', etc)])

In [None]:
results_etc = cross_validate(model_etc, X_train, y_train, cv=kfolds, scoring=scoring, n_jobs=-1)

In [None]:
print("CV Accuracy: {:0.4f} (+/- {:0.4f})".format(np.mean(results_etc['test_acc']),
                                                          np.std(results_etc['test_acc'])))
print("CV F1: {:0.4f} (+/- {:0.4f})".format(np.mean(results_lr['test_f1_micro']),
                                                          np.std(results_etc['test_f1_micro'])))
print("CV Logloss: {:0.4f} (+/- {:0.4f})".format(np.mean(-1*results_lr['test_neg_log_loss']),
                                                          np.std(-1*results_etc['test_neg_log_loss'])))

In [112]:
model_etc.fit(X_train, y_train)  
test_predicted_etc = model_etc.predict(X_test)

In [114]:
print(metrics.classification_report(y_test, test_predicted_etc, target_names=unique_type_list))

             precision    recall  f1-score   support

       INFJ       0.00      0.00      0.00        47
       ENTP       0.00      0.00      0.00       130
       INTP       0.00      0.00      0.00        45
       INTJ       0.00      0.00      0.00       169
       ENTJ       0.00      0.00      0.00         6
       ENFJ       0.00      0.00      0.00        11
       INFP       0.00      0.00      0.00         9
       ENFP       0.00      0.00      0.00        26
       ISFP       0.48      0.20      0.28       276
       ISTP       0.27      0.92      0.42       367
       ISFJ       1.00      0.02      0.04       223
       ISTJ       0.37      0.55      0.44       249
       ESTP       0.00      0.00      0.00        31
       ESFP       0.00      0.00      0.00        43
       ESTJ       0.00      0.00      0.00        42
       ESFJ       0.00      0.00      0.00        57

avg / total       0.32      0.31      0.20      1731




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.

