In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# import csv
full_data = pd.read_csv("mbti_1.csv")

# selecting random percentage of rows because of memory issues.
# random_state pins the draw so that a fresh Restart & Run All works on the
# same 40% of rows (without it every run samples a different subset and the
# downstream features and scores are not reproducible).
data = full_data.sample(frac=0.4, random_state=42)
data.columns = ['type', 'posts']

In [3]:
import string
# Average post length: number of non-space characters in the row's text, divided by 50.
# NOTE(review): the divisor presumably stands for 50 posts per row - confirm against the dataset.
data['avg_post_len'] = data['posts'].map(lambda text: len(text.replace(" ", "")) / 50)
data

Unnamed: 0,type,posts,avg_post_len
7605,ENTP,'I'm an ENTP I guess if you really buy that th...,130.00
8515,INFJ,'How do you deal with Dr doorslam that are ove...,128.08
5301,INTP,'The Robot is a Reaper; a sentient machine bei...,146.36
3801,ENFP,'Yeah. Usually I think of it as pretty close b...,162.72
5012,ENFP,"'Yes, indeed. Bi with hella gay leaning now. O...",135.46
...,...,...,...
8354,INTP,'If you're worried about coming across as a so...,144.74
8054,ENFP,'That requires getting to a point that you can...,112.66
1627,ENFP,'I'm super excited to finish this sociology pa...,138.10
6664,INFP,'https://www.youtube.com/watch?v=nKHL9Sr-OS8||...,116.64


In [4]:
# Total post length: count of characters in the row's text, spaces excluded.
data['tot_post_len'] = data['posts'].map(lambda text: len(text.replace(" ", "")))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len
7605,ENTP,'I'm an ENTP I guess if you really buy that th...,130.00,6500
8515,INFJ,'How do you deal with Dr doorslam that are ove...,128.08,6404
5301,INTP,'The Robot is a Reaper; a sentient machine bei...,146.36,7318
3801,ENFP,'Yeah. Usually I think of it as pretty close b...,162.72,8136
5012,ENFP,"'Yes, indeed. Bi with hella gay leaning now. O...",135.46,6773
...,...,...,...,...
8354,INTP,'If you're worried about coming across as a so...,144.74,7237
8054,ENFP,'That requires getting to a point that you can...,112.66,5633
1627,ENFP,'I'm super excited to finish this sociology pa...,138.10,6905
6664,INFP,'https://www.youtube.com/watch?v=nKHL9Sr-OS8||...,116.64,5832


In [5]:
# calculating the punctuation percentage
def punct_count(post):
    """Return the percentage of non-space characters in ``post`` that are punctuation.

    Args:
        post: Text to analyse.

    Returns:
        A float between 0 and 100 rounded to one decimal place. Returns 0.0
        when ``post`` has no non-space characters (empty or all spaces).
    """
    non_space = len(post) - post.count(" ")
    if non_space == 0:
        # Nothing to measure; also avoids a ZeroDivisionError.
        return 0.0
    count = sum(1 for char in post if char in string.punctuation)
    # Scale to a percentage *before* rounding: rounding the fraction and then
    # multiplying by 100 re-introduces float noise (e.g. 8.799999999999999).
    return round(100 * count / non_space, 1)

# Punctuation percentage per row, computed by punct_count on the raw text.
data['punct_%'] = data['posts'].apply(punct_count)
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
7605,ENTP,'I'm an ENTP I guess if you really buy that th...,130.00,6500,8.8
8515,INFJ,'How do you deal with Dr doorslam that are ove...,128.08,6404,8.9
5301,INTP,'The Robot is a Reaper; a sentient machine bei...,146.36,7318,6.5
3801,ENFP,'Yeah. Usually I think of it as pretty close b...,162.72,8136,9.1
5012,ENFP,"'Yes, indeed. Bi with hella gay leaning now. O...",135.46,6773,9.2
...,...,...,...,...,...
8354,INTP,'If you're worried about coming across as a so...,144.74,7237,7.4
8054,ENFP,'That requires getting to a point that you can...,112.66,5633,8.5
1627,ENFP,'I'm super excited to finish this sociology pa...,138.10,6905,7.8
6664,INFP,'https://www.youtube.com/watch?v=nKHL9Sr-OS8||...,116.64,5832,8.4


In [6]:
# Renumber the sampled rows 0..n-1 and discard the original row labels.
# Rebinding the name (rather than inplace=True) follows the usual pandas idiom.
data = data.reset_index(drop=True)
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
0,ENTP,'I'm an ENTP I guess if you really buy that th...,130.00,6500,8.8
1,INFJ,'How do you deal with Dr doorslam that are ove...,128.08,6404,8.9
2,INTP,'The Robot is a Reaper; a sentient machine bei...,146.36,7318,6.5
3,ENFP,'Yeah. Usually I think of it as pretty close b...,162.72,8136,9.1
4,ENFP,"'Yes, indeed. Bi with hella gay leaning now. O...",135.46,6773,9.2
...,...,...,...,...,...
3465,INTP,'If you're worried about coming across as a so...,144.74,7237,7.4
3466,ENFP,'That requires getting to a point that you can...,112.66,5633,8.5
3467,ENFP,'I'm super excited to finish this sociology pa...,138.10,6905,7.8
3468,INFP,'https://www.youtube.com/watch?v=nKHL9Sr-OS8||...,116.64,5832,8.4


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [8]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.stem import PorterStemmer

# English stop-word list from the NLTK corpus.
# NOTE(review): presumably needs the 'stopwords' corpus downloaded first (see the nltk.download() cell) - confirm.
stopwords = nltk.corpus.stopwords.words('english')

# Single Porter stemmer instance shared by the cleaning function.
ps = PorterStemmer()
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#nltk.download()

In [9]:
import gc

In [10]:
# Run a full garbage-collection pass; the displayed value is the number of
# unreachable objects found. Presumably done to free memory before the
# vectorization cells - the notebook samples rows "because of memory issues".
gc.collect()

0

In [11]:
import warnings
# Suppress DeprecationWarning messages from here on.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [12]:
# create function to clean the posts
def clean_posts(post):
    """Turn one raw text string into a list of stemmed, stop-word-free tokens.

    Steps: remove every character found in ``string.punctuation`` and
    lower-case the rest, split on runs of non-word characters, drop empty
    tokens and stop words, then Porter-stem what remains.

    Args:
        post: Raw text to tokenise.

    Returns:
        List of stemmed tokens; may be empty.
    """
    # NOTE(review): punctuation is deleted, not replaced by a space, so words
    # joined only by punctuation (URLs, the '||' runs visible in the sample
    # rows) fuse into a single token - confirm this is intended.
    post = "".join([char.lower() for char in post if char not in string.punctuation])
    # Raw string: '\W' inside a normal literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    tokens = re.split(r'\W+', post)
    # Set membership instead of scanning the stop-word list for every token.
    stop_words = set(stopwords)
    # re.split yields '' when the text starts or ends with a separator; skip
    # those so the empty string never becomes a vocabulary entry.
    return [ps.stem(word) for word in tokens if word and word not in stop_words]

def _with_meta_columns(term_matrix, frame):
    """Prefix a document-term matrix with the two hand-crafted columns.

    Args:
        term_matrix: Sparse matrix returned by a vectorizer's ``fit_transform``,
            with one row per row of ``frame``.
        frame: DataFrame holding the ``avg_post_len`` and ``punct_%`` columns.

    Returns:
        Dense DataFrame: ``avg_post_len``, ``punct_%``, then one column per term.
        All column labels are strings.
    """
    # toarray() materialises the full dense matrix, which is the
    # memory-hungry step of this notebook.
    # Reusing frame.index keeps the concat row-aligned even if the frame's
    # index was not reset to 0..n-1 beforehand.
    terms = pd.DataFrame(term_matrix.toarray(), index=frame.index)
    features = pd.concat([frame['avg_post_len'], frame['punct_%'], terms], axis=1)
    # Newer scikit-learn releases refuse frames whose column labels mix
    # str and int, so make every label a string.
    features.columns = features.columns.astype(str)
    return features


# TF-IDF
tf_vectorize = TfidfVectorizer(analyzer=clean_posts)
X_tf = tf_vectorize.fit_transform(data['posts'])
X_tf_feature = _with_meta_columns(X_tf, data)

# Count Vectorizer
count_vectorize = CountVectorizer(analyzer=clean_posts)
X_count = count_vectorize.fit_transform(data['posts'])
X_count_feature = _with_meta_columns(X_count, data)

X_count_feature.head()

Unnamed: 0,avg_post_len,punct_%,0,1,2,3,4,5,6,7,...,154154,154155,154156,154157,154158,154159,154160,154161,154162,154163
0,130.0,8.8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128.08,8.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,146.36,6.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,162.72,9.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,135.46,9.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Class labels: the 'type' column of the sampled frame.
target = data['type']

In [14]:
# Display the label Series (index, value, length and dtype).
target

0       ENTP
1       INFJ
2       INTP
3       ENFP
4       ENFP
        ... 
3465    INTP
3466    ENFP
3467    ENFP
3468    INFP
3469    INFP
Name: type, Length: 3470, dtype: object

In [15]:
# Display the count-based feature frame; pandas truncates the rendered view.
X_count_feature

Unnamed: 0,avg_post_len,punct_%,0,1,2,3,4,5,6,7,...,154154,154155,154156,154157,154158,154159,154160,154161,154162,154163
0,130.00,8.8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128.08,8.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,146.36,6.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,162.72,9.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,135.46,9.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3465,144.74,7.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3466,112.66,8.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3467,138.10,7.8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3468,116.64,8.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Seed the forest so the cross-validation scores are repeatable between runs.
rf = RandomForestClassifier(random_state=42)
# 3 x 4 = 12 hyper-parameter combinations; max_depth=None leaves tree depth unlimited.
param = {'n_estimators': [10, 150, 300], 'max_depth': [30, 60, 90, None]}

# 5-fold grid search over the count-vector features, using all available cores.
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feature, data['type'])
# Show the five best-scoring parameter combinations.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,318.609673,66.653728,5.107216,1.424707,,300,"{'max_depth': None, 'n_estimators': 300}",0.368876,0.354467,0.334294,0.34438,0.347262,0.349856,0.011506,1
10,225.725562,32.430865,8.937274,4.588283,,150,"{'max_depth': None, 'n_estimators': 150}",0.357349,0.329971,0.350144,0.347262,0.332853,0.343516,0.010454,2
8,356.410045,38.321753,8.783935,3.004685,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.367435,0.348703,0.329971,0.331412,0.335735,0.342651,0.014041,3
5,316.115182,14.632479,11.463396,3.354687,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.355908,0.357349,0.340058,0.321326,0.334294,0.341787,0.01356,4
4,220.323065,20.245018,8.200329,4.244974,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.338617,0.332853,0.299712,0.347262,0.332853,0.330259,0.016159,5


In [None]:
# Seed the forest so the cross-validation scores are repeatable between runs.
rf = RandomForestClassifier(random_state=42)
# 3 x 4 = 12 hyper-parameter combinations; max_depth=None leaves tree depth unlimited.
param = {'n_estimators': [10, 150, 300], 'max_depth': [30, 60, 90, None]}

# 5-fold grid search over the TF-IDF features, using all available cores.
gs_tf = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_tf_fit = gs_tf.fit(X_tf_feature, data['type'])
# Show the five best-scoring parameter combinations.
pd.DataFrame(gs_tf_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

In [None]:
import joblib

In [None]:
# Top five count-vector grid-search results, best mean test score first.
(
    pd.DataFrame(gs_fit.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(5)
)

In [None]:
# Persist the fitted count-vector grid search object to disk.
# NOTE(review): the fitted count_vectorize is not saved here; presumably it is
# needed to transform new text before this model can predict - confirm.
filename = 'count_vect_model.sav'
joblib.dump(value=gs_fit, filename=filename)

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X_count_feature, target, random_state=63)

In [None]:
#rf = RandomForestClassifier(n_estimators=200)
#rf = rf.fit(X_train, y_train)
#rf.score(X_test, y_test)

In [None]:
#X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(X_tf_feature, target, random_state=42)

In [None]:
#rf_tf = RandomForestClassifier(n_estimators=200)
#rf_tf = rf_tf.fit(X_train_tf, y_train_tf)
#rf_tf.score(X_test_tf, y_test_tf)