In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

import nltk
from nltk.stem import PorterStemmer
ps = nltk.PorterStemmer()
from nltk.stem import WordNetLemmatizer 
lm = WordNetLemmatizer()
stopwords =nltk.corpus.stopwords.words('english')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import re
import string

import joblib


In [20]:
# import csv
full_data = pd.read_csv("mbti_1.csv")

# selecting random percentage of rows because of memory issues
data = full_data.sample(frac = 0.3)
data.columns = ['type', 'posts']

In [21]:
import string
#calculating the average post length
data['avg_post_len'] = data['posts'].apply(lambda x: (len(x) - x.count(" "))/50)
data

Unnamed: 0,type,posts,avg_post_len
5530,ENFJ,"'YES! I absolutely love this one, I've stumble...",152.48
2705,INFJ,'Insert random mainstream movie|||When I worke...,144.94
7242,INFP,'*breathes in* *breathes out* ...That's nice...,152.30
4,ENTJ,'You're fired.|||That's another silly misconce...,102.50
2945,INFJ,'Disorder Rating Information Paranoid: Mo...,138.10
...,...,...,...
7975,ENFP,'That could create some kind of social movemen...,163.30
6989,ENFP,'Not gonna happen. Genetically inferior is a ...,127.92
625,INFJ,"'Yeah, you can spot them indirectly. Generally...",106.22
2829,ISFP,"'I like to draw (appreciate art in general), l...",100.60


In [22]:
#calculating the total post length
data['tot_post_len'] = data['posts'].apply(lambda x: len(x) - x.count(" "))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len
5530,ENFJ,"'YES! I absolutely love this one, I've stumble...",152.48,7624
2705,INFJ,'Insert random mainstream movie|||When I worke...,144.94,7247
7242,INFP,'*breathes in* *breathes out* ...That's nice...,152.30,7615
4,ENTJ,'You're fired.|||That's another silly misconce...,102.50,5125
2945,INFJ,'Disorder Rating Information Paranoid: Mo...,138.10,6905
...,...,...,...,...
7975,ENFP,'That could create some kind of social movemen...,163.30,8165
6989,ENFP,'Not gonna happen. Genetically inferior is a ...,127.92,6396
625,INFJ,"'Yeah, you can spot them indirectly. Generally...",106.22,5311
2829,ISFP,"'I like to draw (appreciate art in general), l...",100.60,5030


In [23]:
# calculating the punctuation percentage
def punct_count(post):
    count = sum([1 for char in post if char in string.punctuation])
    return round(count/(len(post) - post.count(" ")), 3)*100

data['punct_%'] = data['posts'].apply(lambda x: punct_count(x))
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
5530,ENFJ,"'YES! I absolutely love this one, I've stumble...",152.48,7624,7.4
2705,INFJ,'Insert random mainstream movie|||When I worke...,144.94,7247,6.7
7242,INFP,'*breathes in* *breathes out* ...That's nice...,152.30,7615,8.5
4,ENTJ,'You're fired.|||That's another silly misconce...,102.50,5125,8.6
2945,INFJ,'Disorder Rating Information Paranoid: Mo...,138.10,6905,7.3
...,...,...,...,...,...
7975,ENFP,'That could create some kind of social movemen...,163.30,8165,7.7
6989,ENFP,'Not gonna happen. Genetically inferior is a ...,127.92,6396,7.9
625,INFJ,"'Yeah, you can spot them indirectly. Generally...",106.22,5311,9.8
2829,ISFP,"'I like to draw (appreciate art in general), l...",100.60,5030,13.3


In [24]:
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,type,posts,avg_post_len,tot_post_len,punct_%
0,ENFJ,"'YES! I absolutely love this one, I've stumble...",152.48,7624,7.4
1,INFJ,'Insert random mainstream movie|||When I worke...,144.94,7247,6.7
2,INFP,'*breathes in* *breathes out* ...That's nice...,152.30,7615,8.5
3,ENTJ,'You're fired.|||That's another silly misconce...,102.50,5125,8.6
4,INFJ,'Disorder Rating Information Paranoid: Mo...,138.10,6905,7.3
...,...,...,...,...,...
2597,ENFP,'That could create some kind of social movemen...,163.30,8165,7.7
2598,ENFP,'Not gonna happen. Genetically inferior is a ...,127.92,6396,7.9
2599,INFJ,"'Yeah, you can spot them indirectly. Generally...",106.22,5311,9.8
2600,ISFP,"'I like to draw (appreciate art in general), l...",100.60,5030,13.3


In [25]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [26]:
# # create function to clean the posts with STEMMER
# def clean_posts(post):
#     post = "".join([word.lower()for word in post if word not in string.punctuation])
#     tokens = re.split('\W+', post)
#     post = [ps.stem(word) for word in tokens if word not in stopwords]
#     return post

# #TF-IDF
# # tf_vectorize = TfidfVectorizer(analyzer=clean_posts)
# # X_tf = tf_vectorize.fit_transform(data['posts'])
# # X_tf_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_tf.toarray())], axis=1)

# # #Count Vectorizer
# count_vectorize = CountVectorizer(analyzer = clean_posts)
# X_count = count_vectorize.fit_transform(data['posts'])
# X_count_save = np.array(X_count)
# # X_count_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_count.toarray())], axis=1)
# X_count_feature = pd.DataFrame(X_count.toarray())
# X_count_feature.head()
# X_count_save

In [27]:
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [ps.stem(word) for word in tokens if word not in stopwords]
    return post

In [28]:
from sklearn.model_selection import train_test_split
X = data['posts'].values
y = data['type'].values

X_train, X_test, y_train, y_test = train_test_split(X, y) 


In [29]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
        ('vect',CountVectorizer(analyzer = clean_posts)),
        ('clf', RandomForestClassifier())
        
])

pipe_parms = [{
    'n_estimators' : [800],
    'max_depth' : [None]
}]

pipe.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function clean_posts at 0x000001D51E3AFD08>)),
                ('clf', RandomForestClassifier())])

In [30]:
#gs = GridSearchCV(pipe, param_grid= pipe_parms, cv=5)

In [31]:
# Predict training data
y_train_pred = pipe.predict(X_train)
print(f"Predictions on training data: {y_train_pred}")


Predictions on training data: ['ISTJ' 'INTJ' 'INTP' ... 'INFP' 'ENFJ' 'ISFJ']


In [32]:
# Predict test data
y_test_pred = pipe.predict(X_test)
print(f"Predictions on test data: {y_test_pred}")

Predictions on test data: ['INFJ' 'INFP' 'INFP' 'INTJ' 'INTJ' 'INTP' 'INFJ' 'INFP' 'INFP' 'INFJ'
 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFJ' 'INFP' 'INFP'
 'INFP' 'INTJ' 'INFP' 'INTP' 'INTJ' 'INFJ' 'INFP' 'INFP' 'INTJ' 'INFP'
 'INFJ' 'INFP' 'INFJ' 'INFP' 'INFP' 'INTJ' 'INTP' 'INTJ' 'INTJ' 'INFP'
 'INFP' 'INTJ' 'INTP' 'INFP' 'INFP' 'INFJ' 'INFJ' 'INFP' 'INFP' 'INFP'
 'INFJ' 'INTP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFJ' 'INFJ' 'INTP'
 'INTJ' 'INFP' 'INTP' 'INFP' 'INTP' 'INTP' 'INTP' 'INTP' 'INFP' 'INFP'
 'INFP' 'INTP' 'INFP' 'INTJ' 'INFP' 'INFP' 'INFJ' 'INFJ' 'INTP' 'ENTP'
 'INFJ' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INTP' 'INTP'
 'INFP' 'INFP' 'INTP' 'INFP' 'INFP' 'INFJ' 'INFP' 'INFJ' 'INTP' 'INTP'
 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP' 'INTP'
 'INFJ' 'INFP' 'INTJ' 'INFP' 'INTJ' 'INFP' 'INFP' 'INFP' 'INFP' 'INFP'
 'INTP' 'INFP' 'INTJ' 'INFP' 'INFP' 'INTP' 'INFJ' 'INFP' 'INFP' 'INFP'
 'INTP' 'INFP' 'INFJ' 'INFP' 'INFP' 'INFP' 'INTJ' '

In [33]:
pipe.score(X_test, y_test)

0.41781874039938555

In [None]:
# create function to clean the posts
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [lm.lemmatize(word) for word in tokens if word not in stopwords]
    return post

#TF-IDF
# tf_vectorize = TfidfVectorizer(analyzer=clean_posts)
# X_tf = tf_vectorize.fit_transform(data['posts'])
# X_tf_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_tf.toarray())], axis=1)

# #Count Vectorizer
count_vectorize = CountVectorizer(analyzer = clean_posts)
X_count = count_vectorize.fit_transform(data['posts'])
X_count_save = np.array(X_count)
# X_count_feature = pd.concat([data['avg_post_len'], data['punct_%'], pd.DataFrame(X_count.toarray())], axis=1)
X_count_feature = pd.DataFrame(X_count.toarray())
X_count_feature.head()
X_count_save

In [None]:
X_count

In [None]:
X_count_feature

In [None]:
X_count

In [None]:
target = data['type']

In [None]:
target

In [None]:
X_count_feature

In [None]:
rf=RandomForestClassifier()
param={'n_estimators':[800],'max_depth':[None]}

gs=GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit=gs.fit(X_count_feature,data['type'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

In [None]:
# from sklearn.model_selection import train_test_split
# train_set, test_set = train_test_split(data, test_size =0.2)

In [None]:
from sklearn.pipeline import FeatureUnion, Pipeline


X = ['I am a sentence', 'an example']
Y = [1, 2]
X_dev = ['another sentence']

# classifier
LinearSVC1 = LinearSVC(tol=1e-4,  C = 0.10000000000000001)

pipeline = Pipeline([
    ('features', FeatureUnion([
       ('tfidf', TfidfVectorizer(ngram_range=(1, 3), max_features= 4000)), 
       ('custom_features', CustomFeatures())])),
    ('clf', LinearSVC1),
    ])

pipeline.fit(X, Y)
y_pred = pipeline.predict(X_dev)

# etc.

In [None]:
import pickle
import joblib

In [None]:
filename_vect = 'vectorizer.sav'
joblib.dump(X_count, filename_vect)

filename_class = 'gs_rf_model.sav'
joblib.dump(gs_fit, filename_class)

In [None]:
with open('picklefile.pickle', 'rb') as f:
    loaded_vars = pickle.load(f)

In [None]:
vectorizer = pickle.load('vectorizer.sav','rb')
model = pickle.load(open('gs_rf_model.sav','rb'))
pred = model.predict(vectorizer.transform(test['posts']))
print ("predicted class:", pred)

In [None]:
test = pd.DataFrame({"posts":["I think I can I think I can I am not sure"]})

test

In [None]:
gs = joblib.load("../count_vect_model_w_lemm.sav")

In [None]:
import pickle

a = pd.DataFrame()
b = "this can be your vectorizer thing"

In [None]:
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [ps.stem(word) for word in tokens if word not in stopwords]
    return post


# #Count Vectorizer
count_vectorize = CountVectorizer(analyzer = clean_posts)
X_count = count_vectorize.fit_transform(test['posts'])
X_count_feature = pd.DataFrame(X_count.toarray())

X_count_feature.head()

In [None]:
gs.predict(X_count_feature)

In [None]:
# yo uname the file here
with open('picklefile.pickle', 'wb') as f:
    pickle.dump([a, b], f)

In [None]:
with open('picklefile.pickle', 'rb') as f:
    loaded_vars = pickle.load(f)

In [None]:
loaded_vars

In [None]:
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()