In [9]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [10]:
# import csv
full_data = pd.read_csv("mbti_1.csv")

# selecting random percentage of rows because of memory issues
data = full_data.sample(frac = 0.2)
data.columns = ['type', 'posts']

In [3]:
import string
#calculating the average post length
data['avg_post_len'] = data['posts'].apply(lambda x: (len(x) - x.count(" "))/50)
data

Unnamed: 0,type,posts,avg_post_len
5119,ENFP,'...now I have something to say. I happened ...,125.26
7427,ENTJ,'I don't understand why new people who have li...,132.64
5342,ENTP,'I didn't see this thread in the ENTP forum fo...,53.26
4456,ENFP,"'Exactly! It's like, Fi says, Okay, this is wh...",139.80
3826,ENFP,'So I thought I was an ENFP but now I'm not so...,103.10
...,...,...,...
2780,INFP,'This song rocks!|||What happened to all my ga...,108.98
4735,INTP,"'Well, when I got my current car, and hadn't h...",138.72
1994,INTP,"Yo who you callin a bitch ya lil ho ass ho, bo...",76.46
490,ENTJ,INTJs and ENFJs definitely. ENTJs would piss m...,118.98


In [4]:
#calculating the total post length
data['tot_post_len'] = data['posts'].apply(lambda x: len(x) - x.count(" "))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len
5119,ENFP,'...now I have something to say. I happened ...,125.26,6263
7427,ENTJ,'I don't understand why new people who have li...,132.64,6632
5342,ENTP,'I didn't see this thread in the ENTP forum fo...,53.26,2663
4456,ENFP,"'Exactly! It's like, Fi says, Okay, this is wh...",139.80,6990
3826,ENFP,'So I thought I was an ENFP but now I'm not so...,103.10,5155
...,...,...,...,...
2780,INFP,'This song rocks!|||What happened to all my ga...,108.98,5449
4735,INTP,"'Well, when I got my current car, and hadn't h...",138.72,6936
1994,INTP,"Yo who you callin a bitch ya lil ho ass ho, bo...",76.46,3823
490,ENTJ,INTJs and ENFJs definitely. ENTJs would piss m...,118.98,5949


In [5]:
# calculating the punctuation percentage
def punct_count(post):
    count = sum([1 for char in post if char in string.punctuation])
    return round(count/(len(post) - post.count(" ")), 3)*100

data['punct_%'] = data['posts'].apply(lambda x: punct_count(x))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
5119,ENFP,'...now I have something to say. I happened ...,125.26,6263,8.4
7427,ENTJ,'I don't understand why new people who have li...,132.64,6632,8.2
5342,ENTP,'I didn't see this thread in the ENTP forum fo...,53.26,2663,8.1
4456,ENFP,"'Exactly! It's like, Fi says, Okay, this is wh...",139.80,6990,9.3
3826,ENFP,'So I thought I was an ENFP but now I'm not so...,103.10,5155,11.5
...,...,...,...,...,...
2780,INFP,'This song rocks!|||What happened to all my ga...,108.98,5449,10.6
4735,INTP,"'Well, when I got my current car, and hadn't h...",138.72,6936,8.0
1994,INTP,"Yo who you callin a bitch ya lil ho ass ho, bo...",76.46,3823,9.3
490,ENTJ,INTJs and ENFJs definitely. ENTJs would piss m...,118.98,5949,8.2


In [6]:
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
0,ENFP,'...now I have something to say. I happened ...,125.26,6263,8.4
1,ENTJ,'I don't understand why new people who have li...,132.64,6632,8.2
2,ENTP,'I didn't see this thread in the ENTP forum fo...,53.26,2663,8.1
3,ENFP,"'Exactly! It's like, Fi says, Okay, this is wh...",139.80,6990,9.3
4,ENFP,'So I thought I was an ENFP but now I'm not so...,103.10,5155,11.5
...,...,...,...,...,...
1730,INFP,'This song rocks!|||What happened to all my ga...,108.98,5449,10.6
1731,INTP,"'Well, when I got my current car, and hadn't h...",138.72,6936,8.0
1732,INTP,"Yo who you callin a bitch ya lil ho ass ho, bo...",76.46,3823,9.3
1733,ENTJ,INTJs and ENFJs definitely. ENTJs would piss m...,118.98,5949,8.2


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [8]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
stopwords =nltk.corpus.stopwords.words('english')
from nltk.stem import PorterStemmer
ps = nltk.PorterStemmer()
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#nltk.download()

In [9]:
import gc

In [10]:
gc.collect()

0

In [11]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [12]:
# create function to clean the posts
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [ps.stem(word) for word in tokens if word not in stopwords]
    return post

#TF-IDF
tf_vectorize = TfidfVectorizer(analyzer=clean_posts)
X_tf = tf_vectorize.fit_transform(data['posts'])
X_tf_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_tf.toarray())], axis=1)

# #Count Vectorizer
count_vectorize = CountVectorizer(analyzer = clean_posts)
X_count = count_vectorize.fit_transform(data['posts'])
X_count_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feature.head()

Unnamed: 0,avg_post_len,punct_%,0,1,2,3,4,5,6,7,...,91007,91008,91009,91010,91011,91012,91013,91014,91015,91016
0,125.26,8.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,132.64,8.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,53.26,8.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,139.8,9.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,103.1,11.5,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
target = data['type']

In [14]:
target

0       ENFP
1       ENTJ
2       ENTP
3       ENFP
4       ENFP
        ... 
1730    INFP
1731    INTP
1732    INTP
1733    ENTJ
1734    ISFJ
Name: type, Length: 1735, dtype: object

In [15]:
X_count_feature

Unnamed: 0,avg_post_len,punct_%,0,1,2,3,4,5,6,7,...,91007,91008,91009,91010,91011,91012,91013,91014,91015,91016
0,125.26,8.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,132.64,8.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,53.26,8.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,139.80,9.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,103.10,11.5,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730,108.98,10.6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1731,138.72,8.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1732,76.46,9.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1733,118.98,8.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
rf=RandomForestClassifier()
param={'n_estimators':[200,400,600],'max_depth':[50,100,150,None]}

gs=GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit=gs.fit(X_count_feature,data['type'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,74.214191,0.332034,1.676519,0.050568,,400,"{'max_depth': None, 'n_estimators': 400}",0.314121,0.371758,0.37464,0.368876,0.371758,0.360231,0.023127,1
3,39.099471,0.412625,1.600921,0.045434,100.0,200,"{'max_depth': 100, 'n_estimators': 200}",0.342939,0.365994,0.371758,0.314121,0.365994,0.352161,0.021442,2
4,73.161807,0.093747,1.626451,0.016429,100.0,400,"{'max_depth': 100, 'n_estimators': 400}",0.334294,0.345821,0.368876,0.342939,0.348703,0.348127,0.011441,3
0,39.275001,0.373667,1.899522,0.154962,50.0,200,"{'max_depth': 50, 'n_estimators': 200}",0.305476,0.345821,0.354467,0.363112,0.365994,0.346974,0.021917,4
5,108.618016,0.866397,1.76209,0.053935,100.0,600,"{'max_depth': 100, 'n_estimators': 600}",0.322767,0.337176,0.357349,0.345821,0.363112,0.345245,0.014398,5


In [18]:
rf=RandomForestClassifier()
param={'n_estimators':[250,500,750],'max_depth':[100,200,300,None]}

gs=GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit=gs.fit(X_count_feature,data['type'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,121.398407,0.578197,2.708161,0.063433,300.0,500,"{'max_depth': 300, 'n_estimators': 500}",0.337176,0.357349,0.386167,0.360231,0.377522,0.363689,0.017039,1
10,121.418199,0.610861,2.63256,0.038247,,500,"{'max_depth': None, 'n_estimators': 500}",0.331412,0.357349,0.389049,0.354467,0.365994,0.359654,0.018623,2
4,143.578436,14.302722,1.786225,0.137553,200.0,500,"{'max_depth': 200, 'n_estimators': 500}",0.331412,0.348703,0.37464,0.365994,0.365994,0.357349,0.015466,3
8,178.539068,0.989717,2.85058,0.052967,300.0,750,"{'max_depth': 300, 'n_estimators': 750}",0.337176,0.363112,0.365994,0.363112,0.334294,0.352738,0.013952,4
11,169.5492,13.847425,2.458237,0.42477,,750,"{'max_depth': None, 'n_estimators': 750}",0.32853,0.360231,0.360231,0.354467,0.360231,0.352738,0.012308,4


In [5]:
gs = joblib.load("../count_vect_model.sav")

In [12]:
test = pd.DataFrame({"posts":["I think I can I think I can I am not sure"]})

test

Unnamed: 0,posts
0,I think I can I think I can I am not sure


In [13]:
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [ps.stem(word) for word in tokens if word not in stopwords]
    return post


# #Count Vectorizer
count_vectorize = CountVectorizer(analyzer = clean_posts)
X_count = count_vectorize.fit_transform(test['posts'])
X_count_feature = pd.DataFrame(X_count.toarray())

X_count_feature.head()

Unnamed: 0,0,1
0,1,2


In [14]:
gs.predict(X_count_feature)

ValueError: Number of features of the model must match the input. Model n_features is 210033 and input n_features is 2 

In [None]:
rf=RandomForestClassifier()
param={'n_estimators':[250,500,750],'max_depth':[400,300,None]}

gs=GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit=gs.fit(X_count_feature,data['type'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

In [None]:
rf=RandomForestClassifier()
param={'n_estimators':[10,150,300],'max_depth':[30,60,90,None]}

gs_tf=GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_tf_fit=gs_tf.fit(X_tf_feature,data['type'])
pd.DataFrame(gs_tf_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

In [3]:
import joblib

In [None]:
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

In [None]:
filename = 'count_vect_model.sav'
joblib.dump(gs_fit, filename)

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X_count_feature, target, random_state=63)

In [None]:
#rf = RandomForestClassifier(n_estimators=200)
#rf = rf.fit(X_train, y_train)
#rf.score(X_test, y_test)

In [None]:
#X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(X_tf_feature, target, random_state=42)

In [None]:
#rf_tf = RandomForestClassifier(n_estimators=200)
#rf_tf = rf_tf.fit(X_train, y_train)
#rf_tf.score(X_test, y_test)

In [22]:
import pickle

a = pd.DataFrame()
b = "this can be your vectorizer thing"

In [23]:
# yo uname the file here
with open('picklefile.pickle', 'wb') as f:
    pickle.dump([a, b], f)

In [24]:
with open('picklefile.pickle', 'rb') as f:
    loaded_vars = pickle.load(f)

In [25]:
loaded_vars

[Empty DataFrame
 Columns: []
 Index: [],
 'this can be your vectorizer thing']