In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

import nltk
from nltk.stem import PorterStemmer
ps = nltk.PorterStemmer()
from nltk.stem import WordNetLemmatizer 
lm = WordNetLemmatizer()
stopwords =nltk.corpus.stopwords.words('english')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import re
import string

import joblib


In [2]:
# import csv
full_data = pd.read_csv("mbti_1.csv")

# selecting random percentage of rows because of memory issues
data = full_data.sample(frac = 0.3)
data.columns = ['type', 'posts']

In [3]:
import string
#calculating the average post length
data['avg_post_len'] = data['posts'].apply(lambda x: (len(x) - x.count(" "))/50)
data

Unnamed: 0,type,posts,avg_post_len
2620,ENTP,"'Oh yes, it's so fantastically true it's wonde...",141.54
4045,INFP,'I do too! I think this one will be touching ...,119.88
2616,INFP,'Like a rollercoaster. My best female friend w...,124.70
3877,INTJ,'Indiana Solo - this is a forum that is the be...,140.58
1130,INFJ,'Do you think she was infj?|||When i was like ...,112.78
...,...,...,...
5725,ENTP,'716017 This is what it came up with. I am lea...,125.16
6467,INTP,"'Actually, I was referring to these posts:|||H...",143.76
1993,INFP,'I had a job at a family owned fast paced rest...,111.18
669,INTP,"'Oh I'd love to, but as I don't live there she...",134.22


In [4]:
#calculating the total post length
data['tot_post_len'] = data['posts'].apply(lambda x: len(x) - x.count(" "))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len
2620,ENTP,"'Oh yes, it's so fantastically true it's wonde...",141.54,7077
4045,INFP,'I do too! I think this one will be touching ...,119.88,5994
2616,INFP,'Like a rollercoaster. My best female friend w...,124.70,6235
3877,INTJ,'Indiana Solo - this is a forum that is the be...,140.58,7029
1130,INFJ,'Do you think she was infj?|||When i was like ...,112.78,5639
...,...,...,...,...
5725,ENTP,'716017 This is what it came up with. I am lea...,125.16,6258
6467,INTP,"'Actually, I was referring to these posts:|||H...",143.76,7188
1993,INFP,'I had a job at a family owned fast paced rest...,111.18,5559
669,INTP,"'Oh I'd love to, but as I don't live there she...",134.22,6711


In [5]:
# calculating the punctuation percentage
def punct_count(post):
    count = sum([1 for char in post if char in string.punctuation])
    return round(count/(len(post) - post.count(" ")), 3)*100

data['punct_%'] = data['posts'].apply(lambda x: punct_count(x))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
2620,ENTP,"'Oh yes, it's so fantastically true it's wonde...",141.54,7077,7.1
4045,INFP,'I do too! I think this one will be touching ...,119.88,5994,9.1
2616,INFP,'Like a rollercoaster. My best female friend w...,124.70,6235,8.2
3877,INTJ,'Indiana Solo - this is a forum that is the be...,140.58,7029,8.2
1130,INFJ,'Do you think she was infj?|||When i was like ...,112.78,5639,6.7
...,...,...,...,...,...
5725,ENTP,'716017 This is what it came up with. I am lea...,125.16,6258,7.1
6467,INTP,"'Actually, I was referring to these posts:|||H...",143.76,7188,8.0
1993,INFP,'I had a job at a family owned fast paced rest...,111.18,5559,7.2
669,INTP,"'Oh I'd love to, but as I don't live there she...",134.22,6711,6.6


In [6]:
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
0,ENTP,"'Oh yes, it's so fantastically true it's wonde...",141.54,7077,7.1
1,INFP,'I do too! I think this one will be touching ...,119.88,5994,9.1
2,INFP,'Like a rollercoaster. My best female friend w...,124.70,6235,8.2
3,INTJ,'Indiana Solo - this is a forum that is the be...,140.58,7029,8.2
4,INFJ,'Do you think she was infj?|||When i was like ...,112.78,5639,6.7
...,...,...,...,...,...
2597,ENTP,'716017 This is what it came up with. I am lea...,125.16,6258,7.1
2598,INTP,"'Actually, I was referring to these posts:|||H...",143.76,7188,8.0
2599,INFP,'I had a job at a family owned fast paced rest...,111.18,5559,7.2
2600,INTP,"'Oh I'd love to, but as I don't live there she...",134.22,6711,6.6


In [8]:
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [ps.stem(word) for word in tokens if word not in stopwords]
    return post

In [9]:
from sklearn.model_selection import train_test_split
X = data['posts'].values
y = data['type'].values

X_train, X_test, y_train, y_test = train_test_split(X, y) 


In [20]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
        ('vect',CountVectorizer(analyzer = clean_posts)),
        ('clf', RandomForestClassifier())
        
])

pipe_parms = [{
    'clf__n_estimators' : [800],
    'clf__max_depth' : [None]
}]

pipe.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function clean_posts at 0x0000028F4C1B3AE8>)),
                ('clf', RandomForestClassifier())])

In [21]:
# Predict training data
y_train_pred = pipe.predict(X_train)
print(f"Predictions on training data: {y_train_pred}")


Predictions on training data: ['INFJ' 'ENFJ' 'ENTP' ... 'ENFP' 'INFJ' 'ENTJ']


In [22]:
# Predict test data
y_test_pred = pipe.predict(X_test)
print(f"Predictions on test data: {y_test_pred}")

Predictions on test data: ['INFP' 'INFP' 'INFP' 'INFJ' 'INFP' 'INFP' 'INFP' 'INTP' 'INFP' 'INFJ'
 'INFP' 'INFP' 'INFP' 'INFP' 'INTP' 'INFP' 'INFP' 'INFP' 'INFJ' 'INFP'
 'INFP' 'INFP' 'INTJ' 'INFP' 'INFP' 'INFP' 'INFP' 'INFJ' 'INFP' 'INFJ'
 'INFP' 'INFJ' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INTJ' 'INFP' 'INFP'
 'INFP' 'INTP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INTJ' 'INFP'
 'INFP' 'INFP' 'INFP' 'INFP' 'INFJ' 'INFP' 'INFP' 'INTP' 'INTP' 'INTJ'
 'INFP' 'INTP' 'INTP' 'INFP' 'INFP' 'INFP' 'INFJ' 'INFP' 'INFP' 'INFP'
 'INFP' 'INFJ' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFJ' 'INFP'
 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INTP' 'INFP'
 'INFP' 'INTP' 'INFP' 'INTP' 'INFP' 'INFP' 'INTP' 'INFP' 'INFP' 'ENTP'
 'INFP' 'INFP' 'INTP' 'INFP' 'INFP' 'INFP' 'INFJ' 'INFJ' 'INFJ' 'INFP'
 'INFP' 'INFP' 'INFP' 'INFJ' 'INFP' 'INFP' 'INFP' 'INFP' 'INTP' 'INFJ'
 'INFP' 'INFP' 'INFJ' 'INTP' 'INFP' 'INTJ' 'INFJ' 'INTP' 'INFP' 'INFP'
 'INFJ' 'ENTP' 'INFP' 'INFP' 'INFJ' 'INFJ' 'INFP' '

In [23]:
pipe.score(X_test, y_test)

0.38095238095238093

In [24]:
gs = GridSearchCV(pipe, param_grid= pipe_parms, cv=5)

In [25]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(analyzer=<function clean_posts at 0x0000028F4C1B3AE8>)),
                                       ('clf', RandomForestClassifier())]),
             param_grid=[{'clf__max_depth': [None],
                          'clf__n_estimators': [800]}])

In [27]:
print(gs.best_params_)

{'clf__max_depth': None, 'clf__n_estimators': 800}


In [29]:
print(gs.best_score_)

0.3505895468555315


In [30]:
import pickle
import joblib

In [31]:
# yo uname the file here
with open('mbti_model.pickle', 'wb') as f:
    pickle.dump(pipe, f)

In [None]:
with open('picklefile.pickle', 'rb') as f:
    loaded_vars = pickle.load(f)

In [None]:
loaded_vars

In [None]:
gs.predict(X_count_feature)

In [None]:
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()