In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# import csv
full_data = pd.read_csv("mbti_1.csv")

# selecting random percentage of rows because of memory issues
data = full_data.sample(frac = 0.6)
data.columns = ['type', 'posts']

In [3]:
import string
#calculating the average post length
data['avg_post_len'] = data['posts'].apply(lambda x: (len(x) - x.count(" "))/50)
data

Unnamed: 0,type,posts,avg_post_len
736,ISTP,Could I have my name changed to Barkhouse? Tha...,20.14
5045,ENFJ,yeah its kinda odd but i like polite people|||...,56.66
5737,INFJ,"'I'm a 4w5.|||Many I feel are mistyped, I have...",143.00
3488,INTP,"'15/50, INTP. Definitely not autistic, and ra...",122.14
5927,INFJ,'Thanks for the link! I appreciated that artic...,144.94
...,...,...,...
5677,INFJ,"'That is true, although women often have bette...",147.66
63,INFP,"'That's an opinion, not a fact. That being sa...",145.66
3203,ESTP,'well fuck me|||lol i don't know if i likes or...,83.92
4879,ISTJ,'I was thinking of answering that I'm not very...,135.44


In [4]:
#calculating the total post length
data['tot_post_len'] = data['posts'].apply(lambda x: len(x) - x.count(" "))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len
736,ISTP,Could I have my name changed to Barkhouse? Tha...,20.14,1007
5045,ENFJ,yeah its kinda odd but i like polite people|||...,56.66,2833
5737,INFJ,"'I'm a 4w5.|||Many I feel are mistyped, I have...",143.00,7150
3488,INTP,"'15/50, INTP. Definitely not autistic, and ra...",122.14,6107
5927,INFJ,'Thanks for the link! I appreciated that artic...,144.94,7247
...,...,...,...,...
5677,INFJ,"'That is true, although women often have bette...",147.66,7383
63,INFP,"'That's an opinion, not a fact. That being sa...",145.66,7283
3203,ESTP,'well fuck me|||lol i don't know if i likes or...,83.92,4196
4879,ISTJ,'I was thinking of answering that I'm not very...,135.44,6772


In [5]:
# calculating the punctuation percentage
def punct_count(post):
    count = sum([1 for char in post if char in string.punctuation])
    return round(count/(len(post) - post.count(" ")), 3)*100

data['punct_%'] = data['posts'].apply(lambda x: punct_count(x))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
736,ISTP,Could I have my name changed to Barkhouse? Tha...,20.14,1007,6.7
5045,ENFJ,yeah its kinda odd but i like polite people|||...,56.66,2833,5.8
5737,INFJ,"'I'm a 4w5.|||Many I feel are mistyped, I have...",143.00,7150,8.4
3488,INTP,"'15/50, INTP. Definitely not autistic, and ra...",122.14,6107,8.6
5927,INFJ,'Thanks for the link! I appreciated that artic...,144.94,7247,8.1
...,...,...,...,...,...
5677,INFJ,"'That is true, although women often have bette...",147.66,7383,7.0
63,INFP,"'That's an opinion, not a fact. That being sa...",145.66,7283,7.6
3203,ESTP,'well fuck me|||lol i don't know if i likes or...,83.92,4196,10.9
4879,ISTJ,'I was thinking of answering that I'm not very...,135.44,6772,7.4


In [6]:
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
0,ISTP,Could I have my name changed to Barkhouse? Tha...,20.14,1007,6.7
1,ENFJ,yeah its kinda odd but i like polite people|||...,56.66,2833,5.8
2,INFJ,"'I'm a 4w5.|||Many I feel are mistyped, I have...",143.00,7150,8.4
3,INTP,"'15/50, INTP. Definitely not autistic, and ra...",122.14,6107,8.6
4,INFJ,'Thanks for the link! I appreciated that artic...,144.94,7247,8.1
...,...,...,...,...,...
5200,INFJ,"'That is true, although women often have bette...",147.66,7383,7.0
5201,INFP,"'That's an opinion, not a fact. That being sa...",145.66,7283,7.6
5202,ESTP,'well fuck me|||lol i don't know if i likes or...,83.92,4196,10.9
5203,ISTJ,'I was thinking of answering that I'm not very...,135.44,6772,7.4


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [8]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
stopwords =nltk.corpus.stopwords.words('english')
from nltk.stem import PorterStemmer
ps = nltk.PorterStemmer()
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
#nltk.download()

In [10]:
import gc

In [11]:
gc.collect()

0

In [12]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [13]:
# create function to clean the posts
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [ps.stem(word) for word in tokens if word not in stopwords]
    return post

#TF-IDF
tf_vectorize = TfidfVectorizer(analyzer=clean_posts)
X_tf = tf_vectorize.fit_transform(data['posts'])
X_tf_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_tf.toarray())], axis=1)

# #Count Vectorizer
count_vectorize = CountVectorizer(analyzer = clean_posts)
X_count = count_vectorize.fit_transform(data['posts'])
X_count_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feature.head()

Unnamed: 0,avg_post_len,punct_%,0,1,2,3,4,5,6,7,...,210938,210939,210940,210941,210942,210943,210944,210945,210946,210947
0,20.14,6.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,56.66,5.8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,143.0,8.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,122.14,8.6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,144.94,8.1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
target = data['type']

In [15]:
target

0       ISTP
1       ENFJ
2       INFJ
3       INTP
4       INFJ
        ... 
5200    INFJ
5201    INFP
5202    ESTP
5203    ISTJ
5204    INTP
Name: type, Length: 5205, dtype: object

In [16]:
X_count_feature

Unnamed: 0,avg_post_len,punct_%,0,1,2,3,4,5,6,7,...,210938,210939,210940,210941,210942,210943,210944,210945,210946,210947
0,20.14,6.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,56.66,5.8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,143.00,8.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,122.14,8.6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,144.94,8.1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5200,147.66,7.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5201,145.66,7.6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5202,83.92,10.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5203,135.44,7.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
rf=RandomForestClassifier()
param={'n_estimators':[10,150,300],'max_depth':[30,60,90,None]}

gs=GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit=gs.fit(X_count_feature,data['type'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,544.703802,22.761871,30.343291,6.327303,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.350624,0.351585,0.3439,0.348703,0.349664,0.348895,0.002676,1
8,687.187,56.548949,19.709195,9.027375,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.338136,0.344861,0.337176,0.349664,0.352546,0.344476,0.006094,2
4,520.056694,47.229474,15.797149,8.213231,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.345821,0.344861,0.342939,0.341979,0.345821,0.344284,0.001561,3
5,857.553702,178.639541,13.826517,3.077454,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.335255,0.3439,0.331412,0.344861,0.341018,0.339289,0.00517,4
10,2463.3969,966.287217,20.369751,7.691792,,150,"{'max_depth': None, 'n_estimators': 150}",0.323727,0.319885,0.350624,0.336215,0.362152,0.338521,0.015977,5


In [20]:
# rf=RandomForestClassifier()
# param={'n_estimators':[10,150,300],'max_depth':[30,60,90,None]}

# gs_tf=GridSearchCV(rf,param,cv=5,n_jobs=-1)
# gs_tf_fit=gs_tf.fit(X_tf_feature,data['type'])
# pd.DataFrame(gs_tf_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

KeyboardInterrupt: 

In [21]:
import joblib

In [24]:
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,544.703802,22.761871,30.343291,6.327303,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.350624,0.351585,0.3439,0.348703,0.349664,0.348895,0.002676,1
8,687.187,56.548949,19.709195,9.027375,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.338136,0.344861,0.337176,0.349664,0.352546,0.344476,0.006094,2
4,520.056694,47.229474,15.797149,8.213231,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.345821,0.344861,0.342939,0.341979,0.345821,0.344284,0.001561,3
5,857.553702,178.639541,13.826517,3.077454,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.335255,0.3439,0.331412,0.344861,0.341018,0.339289,0.00517,4
10,2463.3969,966.287217,20.369751,7.691792,,150,"{'max_depth': None, 'n_estimators': 150}",0.323727,0.319885,0.350624,0.336215,0.362152,0.338521,0.015977,5


In [None]:
# filename = 'count_vect_model.sav'
# joblib.dump({'model':gs_fit})

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X_count_feature, target, random_state=63)

In [None]:
#rf = RandomForestClassifier(n_estimators=200)
#rf = rf.fit(X_train, y_train)
#rf.score(X_test, y_test)

In [None]:
#X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(X_tf_feature, target, random_state=42)

In [None]:
#rf_tf = RandomForestClassifier(n_estimators=200)
#rf_tf = rf_tf.fit(X_train, y_train)
#rf_tf.score(X_test, y_test)