In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

import nltk
from nltk.stem import PorterStemmer
ps = nltk.PorterStemmer()
from nltk.stem import WordNetLemmatizer 
lm = WordNetLemmatizer()
stopwords =nltk.corpus.stopwords.words('english')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import re
import string

import joblib


In [2]:
# import csv
full_data = pd.read_csv("mbti_1.csv")

# selecting random percentage of rows because of memory issues
data = full_data.sample(frac = 0.1)
data.columns = ['type', 'posts']

In [3]:
import string
#calculating the average post length
data['avg_post_len'] = data['posts'].apply(lambda x: (len(x) - x.count(" "))/50)
data

Unnamed: 0,type,posts,avg_post_len
6051,ENFP,"'Hehe, yeah I had some of that as I read my po...",125.06
3045,ISTP,mooni wut y r u here m8 Edit: okay I entirely...,77.76
2985,INFJ,'This is so cute! :kitteh: I'd probably th...,133.30
5445,INFP,right now I have to pee but I won't go release...,106.30
905,INFP,"'Okay, you all were right as in I can't contro...",143.00
...,...,...,...
3823,INTP,'*Spoiler alert* I had no problem with Todd be...,118.68
6532,INFP,'My baby is a BMW M3 Convertible. Itx92s aweso...,138.78
4715,INFJ,'Interacting and communicating with other peop...,151.60
3860,INFP,"'I think Joffrey is I not E, SFJ seems right t...",95.52


In [4]:
#calculating the total post length
data['tot_post_len'] = data['posts'].apply(lambda x: len(x) - x.count(" "))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len
6051,ENFP,"'Hehe, yeah I had some of that as I read my po...",125.06,6253
3045,ISTP,mooni wut y r u here m8 Edit: okay I entirely...,77.76,3888
2985,INFJ,'This is so cute! :kitteh: I'd probably th...,133.30,6665
5445,INFP,right now I have to pee but I won't go release...,106.30,5315
905,INFP,"'Okay, you all were right as in I can't contro...",143.00,7150
...,...,...,...,...
3823,INTP,'*Spoiler alert* I had no problem with Todd be...,118.68,5934
6532,INFP,'My baby is a BMW M3 Convertible. Itx92s aweso...,138.78,6939
4715,INFJ,'Interacting and communicating with other peop...,151.60,7580
3860,INFP,"'I think Joffrey is I not E, SFJ seems right t...",95.52,4776


In [5]:
# calculating the punctuation percentage
def punct_count(post):
    count = sum([1 for char in post if char in string.punctuation])
    return round(count/(len(post) - post.count(" ")), 3)*100

data['punct_%'] = data['posts'].apply(lambda x: punct_count(x))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
6051,ENFP,"'Hehe, yeah I had some of that as I read my po...",125.06,6253,10.9
3045,ISTP,mooni wut y r u here m8 Edit: okay I entirely...,77.76,3888,9.4
2985,INFJ,'This is so cute! :kitteh: I'd probably th...,133.30,6665,8.2
5445,INFP,right now I have to pee but I won't go release...,106.30,5315,10.4
905,INFP,"'Okay, you all were right as in I can't contro...",143.00,7150,6.5
...,...,...,...,...,...
3823,INTP,'*Spoiler alert* I had no problem with Todd be...,118.68,5934,7.7
6532,INFP,'My baby is a BMW M3 Convertible. Itx92s aweso...,138.78,6939,6.5
4715,INFJ,'Interacting and communicating with other peop...,151.60,7580,6.4
3860,INFP,"'I think Joffrey is I not E, SFJ seems right t...",95.52,4776,6.8


In [6]:
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
0,ENFP,"'Hehe, yeah I had some of that as I read my po...",125.06,6253,10.9
1,ISTP,mooni wut y r u here m8 Edit: okay I entirely...,77.76,3888,9.4
2,INFJ,'This is so cute! :kitteh: I'd probably th...,133.30,6665,8.2
3,INFP,right now I have to pee but I won't go release...,106.30,5315,10.4
4,INFP,"'Okay, you all were right as in I can't contro...",143.00,7150,6.5
...,...,...,...,...,...
863,INTP,'*Spoiler alert* I had no problem with Todd be...,118.68,5934,7.7
864,INFP,'My baby is a BMW M3 Convertible. Itx92s aweso...,138.78,6939,6.5
865,INFJ,'Interacting and communicating with other peop...,151.60,7580,6.4
866,INFP,"'I think Joffrey is I not E, SFJ seems right t...",95.52,4776,6.8


In [7]:
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [lm.lemmatize(word) for word in tokens if word not in stopwords]
    return post

#data['posts'] = data['posts'].apply(lambda x: clean_posts(x))

In [8]:
data.head()

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
0,ENFP,"'Hehe, yeah I had some of that as I read my po...",125.06,6253,10.9
1,ISTP,mooni wut y r u here m8 Edit: okay I entirely...,77.76,3888,9.4
2,INFJ,'This is so cute! :kitteh: I'd probably th...,133.3,6665,8.2
3,INFP,right now I have to pee but I won't go release...,106.3,5315,10.4
4,INFP,"'Okay, you all were right as in I can't contro...",143.0,7150,6.5


In [9]:
from sklearn.model_selection import train_test_split
X = data['posts'].values
y = data['type'].values

X_train, X_test, y_train, y_test = train_test_split(X, y) 


In [10]:
X

array(["'Hehe, yeah I had some of that as I read my post now. I can't even remember having written half of that. Pretty cool. :laughing:|||I love you! said by the woman I love, of course! Yet to happen xD|||Just to answer real short. YES, my thougthprocess is random. Made it real tough to study, bleh!|||Well sometimes when I'm really tired it can be kinda like that... like right now! I haven't slept for ages (well in fact I have slept, quite long and well, but not in almost 18 hours, somehow today I...|||Surprisingly enough, most of them are common swedish words... (I'm Norwegian.)  Used 29 times: På Used 28 times: Och Used 21 times: Bored Used 19 times: Det Used 19 times: Jag Used 17 times:...|||Without realizing it maybe you just hit the nail right on the head! Thats pretty much what I feel too (except I hope for the afterlife, a better one). Thanks for giving a little more enlightening...|||I was bullied from... the start and all up to my 11th year at school. I haven't been as badly

In [11]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
        ('vect',CountVectorizer(analyzer = clean_posts)),
        ('clf', RandomForestClassifier())
        
])

pipe_parms = [{
    'clf__n_estimators' : [600,800],
    'clf__max_depth' : [None,200]
}]

pipe.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function clean_posts at 0x000001DA26262B70>)),
                ('clf', RandomForestClassifier())])

In [12]:
# Predict training data
y_train_pred = pipe.predict(X_train)
print(f"Predictions on training data: {y_train_pred}")


Predictions on training data: ['INFP' 'ENTP' 'ENTP' 'INFJ' 'INFJ' 'ESFP' 'INFP' 'INTP' 'INTP' 'INFJ'
 'ISTJ' 'ENTP' 'ENFP' 'INFP' 'ISFP' 'ISTP' 'INTP' 'INFP' 'ISFP' 'INFJ'
 'ENFP' 'ISFP' 'ISTJ' 'INTJ' 'INTJ' 'INTP' 'INTP' 'INTJ' 'INTP' 'ENFP'
 'INTP' 'ISTP' 'ENTP' 'ENTP' 'INFJ' 'INTP' 'ENTJ' 'ENFP' 'ENFP' 'INFP'
 'ISFP' 'INFP' 'ENTJ' 'ISTJ' 'INTJ' 'INFJ' 'ENFP' 'ENTP' 'ISTP' 'INFJ'
 'INFJ' 'INFJ' 'INFJ' 'INFJ' 'INTJ' 'INFJ' 'INFP' 'INFJ' 'ENTP' 'ENFP'
 'INTJ' 'INFP' 'ISFP' 'ENTP' 'INTP' 'ISFP' 'INFJ' 'ENFP' 'ISFJ' 'INTP'
 'ENFP' 'ESTP' 'ISTP' 'INFP' 'ENTP' 'INTJ' 'ENFP' 'INFP' 'INFJ' 'INTP'
 'INFJ' 'INTP' 'ISTP' 'ISFP' 'INTP' 'INFJ' 'ISTP' 'ISFP' 'ENTJ' 'ENTP'
 'INTJ' 'INTP' 'INFP' 'ISTP' 'ISFP' 'INFJ' 'ENFP' 'ENFP' 'ISFP' 'INFJ'
 'ISFJ' 'INFP' 'INTJ' 'INTP' 'INTJ' 'ESFP' 'INFP' 'INFP' 'ENTJ' 'INFP'
 'INTP' 'ENTP' 'INFJ' 'INFJ' 'INTJ' 'INFJ' 'INTJ' 'INTP' 'ISTP' 'ISTJ'
 'INTP' 'INFP' 'INTP' 'ENTJ' 'INTJ' 'INFP' 'ENTJ' 'INFP' 'INFP' 'INFP'
 'INFJ' 'INTP' 'ESTJ' 'INTJ' 'ENTP' 'INTP' 'INT

In [13]:
# Predict test data
y_test_pred = pipe.predict(X_test)
print(f"Predictions on test data: {y_test_pred}")

Predictions on test data: ['INFJ' 'INFJ' 'INFP' 'INFJ' 'INFP' 'INFP' 'INFP' 'ENTP' 'INFP' 'INTJ'
 'INTP' 'INFJ' 'INFP' 'INTP' 'INFP' 'INFJ' 'INTP' 'INFP' 'INFP' 'INFJ'
 'INFJ' 'INFP' 'INFP' 'INTJ' 'INFP' 'INTJ' 'INFJ' 'INFP' 'INTP' 'INFP'
 'INTJ' 'INTP' 'INFP' 'INFP' 'INFP' 'INTJ' 'INTP' 'INTJ' 'INFJ' 'INFJ'
 'INTP' 'INFP' 'INFJ' 'INFP' 'INFJ' 'INFJ' 'INFJ' 'INFP' 'INFP' 'INFP'
 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFJ' 'INFP' 'INFP' 'INTJ' 'INTP'
 'INTP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFJ' 'INFP'
 'INFJ' 'INFJ' 'INFJ' 'INFJ' 'INFP' 'INFP' 'INFJ' 'INFP' 'INFJ' 'INTP'
 'INFP' 'INFJ' 'INFJ' 'INTP' 'INFP' 'INFP' 'INFJ' 'INFP' 'INFP' 'INFP'
 'INFP' 'ENTP' 'INFP' 'INFP' 'INTP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFJ'
 'INFP' 'INFJ' 'INFP' 'INFP' 'ENTP' 'INFP' 'INFP' 'INFP' 'INTP' 'INFP'
 'INFP' 'INFP' 'INTP' 'ENFP' 'INTP' 'INTP' 'INFP' 'INFP' 'INFJ' 'INFP'
 'INFJ' 'INFP' 'INFJ' 'INFJ' 'INFJ' 'INFP' 'INFP' 'ISTP' 'INFP' 'INFP'
 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' '

In [14]:
pipe.score(X_test, y_test)

0.35944700460829493

In [None]:
gs = GridSearchCV(pipe, param_grid= pipe_parms, cv=5)

In [None]:
gs.fit(X_train, y_train)

In [None]:
print(gs.best_params_)

In [None]:
print(gs.best_score_)

In [15]:
import pickle


In [17]:
pickle.dumps(clean_posts)

b'\x80\x03c__main__\nclean_posts\nq\x00.'

In [18]:
# yo uname the file here
with open('mbti_model.pickle', 'wb') as f:
     pickle.dump(pipe, f)

In [None]:
# # yo uname the file here
# with open('mbti_model.pickle', 'wb') as f:
#     pickle.dump(pipe, f)

In [None]:
# with open('picklefile.pickle', 'rb') as f:
#     loaded_vars = pickle.load(f)

In [None]:
# loaded_vars

In [None]:
# gs.predict(X_count_feature)

In [None]:
# pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()