In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

import nltk
from nltk.stem import PorterStemmer
ps = nltk.PorterStemmer()
from nltk.stem import WordNetLemmatizer 
lm = WordNetLemmatizer()
stopwords =nltk.corpus.stopwords.words('english')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import re
import string

import joblib


In [2]:
# import csv
full_data = pd.read_csv("mbti_1.csv")

# selecting random percentage of rows because of memory issues
data = full_data.sample(frac = 0.1)
data.columns = ['type', 'posts']

In [3]:
import string
#calculating the average post length
data['avg_post_len'] = data['posts'].apply(lambda x: (len(x) - x.count(" "))/50)
data

Unnamed: 0,type,posts,avg_post_len
8329,INFJ,'https://i.chzbgr.com/completestore/12/9/12/4J...,99.58
1070,ISTP,"'I mean, unless they are your friends/family? ...",135.72
5018,INTJ,"'I am in a home now, I signing up for CCC(Cons...",101.82
7003,INFJ,'Stop being so emotional. Stop being silly. Do...,117.38
5053,ENFP,"'KOREAN <3 mmmmm|||No no no you guys, I mean a...",136.04
...,...,...,...
4563,INFP,"'Heartsick and conflicted, just like I have be...",118.72
3597,ENFJ,"'Well, as for me. I just want to go away. I ...",129.20
5709,ENTJ,'This is a question for those who have dealt w...,136.20
1124,INTP,"'I have asthma so no, I think I'd die the firs...",133.30


In [4]:
#calculating the total post length
data['tot_post_len'] = data['posts'].apply(lambda x: len(x) - x.count(" "))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len
8329,INFJ,'https://i.chzbgr.com/completestore/12/9/12/4J...,99.58,4979
1070,ISTP,"'I mean, unless they are your friends/family? ...",135.72,6786
5018,INTJ,"'I am in a home now, I signing up for CCC(Cons...",101.82,5091
7003,INFJ,'Stop being so emotional. Stop being silly. Do...,117.38,5869
5053,ENFP,"'KOREAN <3 mmmmm|||No no no you guys, I mean a...",136.04,6802
...,...,...,...,...
4563,INFP,"'Heartsick and conflicted, just like I have be...",118.72,5936
3597,ENFJ,"'Well, as for me. I just want to go away. I ...",129.20,6460
5709,ENTJ,'This is a question for those who have dealt w...,136.20,6810
1124,INTP,"'I have asthma so no, I think I'd die the firs...",133.30,6665


In [5]:
# calculating the punctuation percentage
def punct_count(post):
    count = sum([1 for char in post if char in string.punctuation])
    return round(count/(len(post) - post.count(" ")), 3)*100

data['punct_%'] = data['posts'].apply(lambda x: punct_count(x))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
8329,INFJ,'https://i.chzbgr.com/completestore/12/9/12/4J...,99.58,4979,9.3
1070,ISTP,"'I mean, unless they are your friends/family? ...",135.72,6786,7.4
5018,INTJ,"'I am in a home now, I signing up for CCC(Cons...",101.82,5091,6.6
7003,INFJ,'Stop being so emotional. Stop being silly. Do...,117.38,5869,8.2
5053,ENFP,"'KOREAN <3 mmmmm|||No no no you guys, I mean a...",136.04,6802,8.6
...,...,...,...,...,...
4563,INFP,"'Heartsick and conflicted, just like I have be...",118.72,5936,7.8
3597,ENFJ,"'Well, as for me. I just want to go away. I ...",129.20,6460,11.0
5709,ENTJ,'This is a question for those who have dealt w...,136.20,6810,8.1
1124,INTP,"'I have asthma so no, I think I'd die the firs...",133.30,6665,8.7


In [6]:
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
0,INFJ,'https://i.chzbgr.com/completestore/12/9/12/4J...,99.58,4979,9.3
1,ISTP,"'I mean, unless they are your friends/family? ...",135.72,6786,7.4
2,INTJ,"'I am in a home now, I signing up for CCC(Cons...",101.82,5091,6.6
3,INFJ,'Stop being so emotional. Stop being silly. Do...,117.38,5869,8.2
4,ENFP,"'KOREAN <3 mmmmm|||No no no you guys, I mean a...",136.04,6802,8.6
...,...,...,...,...,...
863,INFP,"'Heartsick and conflicted, just like I have be...",118.72,5936,7.8
864,ENFJ,"'Well, as for me. I just want to go away. I ...",129.20,6460,11.0
865,ENTJ,'This is a question for those who have dealt w...,136.20,6810,8.1
866,INTP,"'I have asthma so no, I think I'd die the firs...",133.30,6665,8.7


In [7]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [8]:
# # create function to clean the posts with STEMMER
# def clean_posts(post):
#     post = "".join([word.lower()for word in post if word not in string.punctuation])
#     tokens = re.split('\W+', post)
#     post = [ps.stem(word) for word in tokens if word not in stopwords]
#     return post

# #TF-IDF
# # tf_vectorize = TfidfVectorizer(analyzer=clean_posts)
# # X_tf = tf_vectorize.fit_transform(data['posts'])
# # X_tf_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_tf.toarray())], axis=1)

# # #Count Vectorizer
# count_vectorize = CountVectorizer(analyzer = clean_posts)
# X_count = count_vectorize.fit_transform(data['posts'])
# X_count_save = np.array(X_count)
# # X_count_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_count.toarray())], axis=1)
# X_count_feature = pd.DataFrame(X_count.toarray())
# X_count_feature.head()
# X_count_save

In [9]:
# create function to clean the posts
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [lm.lemmatize(word) for word in tokens if word not in stopwords]
    return post

#TF-IDF
# tf_vectorize = TfidfVectorizer(analyzer=clean_posts)
# X_tf = tf_vectorize.fit_transform(data['posts'])
# X_tf_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_tf.toarray())], axis=1)

# #Count Vectorizer
count_vectorize = CountVectorizer(analyzer = clean_posts)
X_count = count_vectorize.fit_transform(data['posts'])
X_count_save = np.array(X_count)
# X_count_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_count.toarray())], axis=1)
X_count_feature = pd.DataFrame(X_count.toarray())
X_count_feature.head()
X_count_save

array(<868x61776 sparse matrix of type '<class 'numpy.int64'>'
	with 381253 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [18]:
X_count

<868x61776 sparse matrix of type '<class 'numpy.int64'>'
	with 381253 stored elements in Compressed Sparse Row format>

In [10]:
X_count_feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61766,61767,61768,61769,61770,61771,61772,61773,61774,61775
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
864,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
866,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
X_count

<868x61776 sparse matrix of type '<class 'numpy.int64'>'
	with 381253 stored elements in Compressed Sparse Row format>

In [12]:
target = data['type']

In [13]:
target

0      INFJ
1      ISTP
2      INTJ
3      INFJ
4      ENFP
       ... 
863    INFP
864    ENFJ
865    ENTJ
866    INTP
867    INFP
Name: type, Length: 868, dtype: object

In [14]:
X_count_feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61766,61767,61768,61769,61770,61771,61772,61773,61774,61775
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
864,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
866,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
rf=RandomForestClassifier()
param={'n_estimators':[800],'max_depth':[None]}

gs=GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit=gs.fit(X_count_feature,data['type'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,121.608719,32.871946,1.609174,0.356325,,800,"{'max_depth': None, 'n_estimators': 800}",0.373563,0.367816,0.396552,0.398844,0.404624,0.38828,0.014714,1


In [None]:
# from sklearn.model_selection import train_test_split
# train_set, test_set = train_test_split(data, test_size =0.2)

In [17]:
import pickle
import joblib

In [19]:
filename_vect = 'vectorizer.sav'
joblib.dump(X_count, filename_vect)

filename_class = 'gs_rf_model.sav'
joblib.dump(gs_fit, filename_class)

['gs_rf_model.sav']

In [None]:
with open('picklefile.pickle', 'rb') as f:
    loaded_vars = pickle.load(f)

In [22]:
vectorizer = pickle.load('vectorizer.sav','rb')
model = pickle.load(open('gs_rf_model.sav','rb'))
pred = model.predict(vectorizer.transform(test['posts']))
print ("predicted class:", pred)

TypeError: Function takes at most 1 positional arguments (2 given)

In [20]:
test = pd.DataFrame({"posts":["I think I can I think I can I am not sure"]})

test

Unnamed: 0,posts
0,I think I can I think I can I am not sure


In [None]:
gs = joblib.load("../count_vect_model_w_lemm.sav")

In [None]:
import pickle

a = pd.DataFrame()
b = "this can be your vectorizer thing"

In [None]:
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [ps.stem(word) for word in tokens if word not in stopwords]
    return post


# #Count Vectorizer
count_vectorize = CountVectorizer(analyzer = clean_posts)
X_count = count_vectorize.fit_transform(test['posts'])
X_count_feature = pd.DataFrame(X_count.toarray())

X_count_feature.head()

In [None]:
gs.predict(X_count_feature)

In [None]:
# yo uname the file here
with open('picklefile.pickle', 'wb') as f:
    pickle.dump([a, b], f)

In [None]:
with open('picklefile.pickle', 'rb') as f:
    loaded_vars = pickle.load(f)

In [None]:
loaded_vars

In [None]:
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()