In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

import nltk
from nltk.stem import PorterStemmer
ps = nltk.PorterStemmer()
from nltk.stem import WordNetLemmatizer 
stemmer = WordNetLemmatizer()
stopwords =nltk.corpus.stopwords.words('english')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import re
import string

import joblib


In [2]:
# import csv
full_data = pd.read_csv("mbti_1.csv")

# selecting random percentage of rows because of memory issues
data = full_data.sample(frac = 0.1)
data.columns = ['type', 'posts']

In [3]:
X = data['posts'].values
y = data['type'].values

In [4]:
posts = []

for text in range(0, len(X)):
    # Remove all the special characters
    post = re.sub(r'\W', ' ', str(X[text]))
    
    # remove all single characters
    post = re.sub(r'\s+[a-zA-Z]\s+', ' ', post)
    
    # Remove single characters from the start
    post = re.sub(r'\^[a-zA-Z]\s+', ' ', post) 
    
    # Substituting multiple spaces with single space
    post = re.sub(r'\s+', ' ', post, flags=re.I)
    
    # Removing prefixed 'b'
#     post = re.sub(r'^b\s+', '', post)
    
    # Converting to Lowercase
    post = post.lower()
    
    # Lemmatization
    post = post.split()

    post = [stemmer.lemmatize(word) for word in post]
    post = ' '.join(post)
    
    posts.append(post)

In [5]:
# # removing piping
# data['posts']= data['posts'].str.replace('|',' ')

# # removing '
# data['posts']= data['posts'].str.replace("'",'')

# # removing url's from posts
# data['posts'] = data['posts'].str.replace('http\S+|www.\S+', '', case=False)

# # change case to lower
# data['posts'] = data['posts'].str.lower()


# #remove punctuation from posts

# def remove_punctuation(text):
#     no_punctuation = "".join([char for char in text if char not in string.punctuation])
#     return no_punctuation

# data['body_text_clean'] = data['posts'].apply(lambda x: remove_punctuation(x))

# # pulling types from type column
# mbti_types = data['type'].unique()

# # types to list instead of array
# mbti_list = mbti_types.tolist()

# # lowercasing types
# mbti_new = [x.lower() for x in mbti_list]

# # remove references to mbti type in body_text_clean column

# for item in mbti_new:
#     data['body_text_clean'] = data['body_text_clean'].str.replace(item , "")
    
# # # apply word_tokenize to all records
# # from nltk.tokenize import word_tokenize

# # data['tokenized'] = data['body_text_clean'].apply(word_tokenize)


# # # remove stopwords
# # def stopword_removal(text):
# #     stop_words = [item for item in text if item not in stopwords]
# #     return stop_words

# # data['stopwords'] = data['tokenized'].apply(lambda x: stopword_removal(x))

# def lemma_words(lemma):
#     lemmatize = [lm.lemmatize(word) for word in lemma]
#     return lemmatize

# data['lemmatized'] = data['body_text_clean'].apply(lambda x: lemma_words(x))



In [6]:
count_vect = CountVectorizer(max_features=1500, stop_words=stopwords)
X = count_vect.fit_transform(posts).toarray()

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y) 


In [8]:
rf_class = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_class.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [9]:
y_pred = rf_class.predict(X_test)

In [10]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[ 0  0  0  0  0  0  0  2  1  0  0  0  0  0  0]
 [ 0  2  0  0  0  0  0  3  3  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  1  0  0  0  0]
 [ 0  0  0  5  0  0  0  5  7  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  1  0  0  0  0]
 [ 0  1  0  1  0  0  0 23 18  2  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  1 44  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  6  7 12  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  4  0 26  0  0  0  0]
 [ 0  0  0  0  0  0  0  2  4  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  4  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  5  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  2  5  0  1  0  0  0  0]]
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00         3
        ENFP       0.67      0.20      0.31        10
        ENTJ       0.00      0.00      0.00         2
        ENTP       0.83      0.25      0.38        20
        

  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
import pickle


In [12]:
# yo uname the file here
with open('mbti_model.pickle', 'wb') as f:
     pickle.dump(rf_class, f)
        
with open('vector.pickle', 'wb') as f:
    pickle.dump(count_vect,f)