In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

import nltk
from nltk.stem import PorterStemmer
ps = nltk.PorterStemmer()
from nltk.stem import WordNetLemmatizer 
stemmer = WordNetLemmatizer()
stopwords =nltk.corpus.stopwords.words('english')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import re
import string

import joblib


In [2]:
# import csv
full_data = pd.read_csv("mbti_1.csv")

# selecting random percentage of rows because of memory issues
data = full_data.sample(frac = 1.0)
data.columns = ['type', 'posts']

In [3]:
X = data['posts'].values
y = data['type'].values

In [4]:
posts = []

for text in range(0, len(X)):
    
    # This removes the special characters
    post = re.sub(r'\W', ' ', str(X[text]))
    
    # This removes multiple spaces
    post = re.sub(r'\s+', ' ', post, flags=re.I)
    
    # This converts all text to lower case
    post = post.lower()
    
    # Time to lemmatize
    post = post.split()

    post = [stemmer.lemmatize(word) for word in post]
    post = ' '.join(post)
    
    posts.append(post)

In [6]:
count_vect = CountVectorizer(max_features=1500, stop_words=stopwords)
X = count_vect.fit_transform(posts).toarray()

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y) 


In [8]:
rf_class = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_class.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [9]:
y_pred = rf_class.predict(X_test)

In [10]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[  0   2   0   0   0   0   0   0  11  21   0   6   0   0   0   0]
 [  0  76   0   1   0   0   0   0  22  39  11   5   0   0   0   0]
 [  0   1   4   1   0   0   0   0   7  21  13   9   0   0   0   1]
 [  0   4   0  81   0   0   0   0  12  28   8  25   0   0   0   0]
 [  0   0   0   2   0   0   0   0   2   4   0   2   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1   3   0   2   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1   5   1   3   0   0   0   0]
 [  0   0   0   2   0   0   0   0   9   7   4   6   0   0   0   2]
 [  0   4   0   6   0   0   0   0 289  71   7   6   0   0   0   1]
 [  0   3   0   4   0   0   0   0  13 414   7  11   0   0   0   0]
 [  0   0   0   0   0   0   0   0  20  44 168  29   0   0   0   0]
 [  0   1   0   5   0   0   0   0   9  54  12 275   0   0   0   0]
 [  0   0   0   0   0   0   0   0  13  24   1   8   2   0   0   0]
 [  0   3   0   1   0   0   0   0  11  38   6   7   0   2   0   0]
 [  0   1   0   0   0   0   0   0   5  25   3   8   0   0   1 

  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
import pickle


In [12]:
# yo uname the file here
# with open('mbti_model.pickle', 'wb') as f:
#      pickle.dump(rf_class, f)
        
# with open('vector.pickle', 'wb') as f:
#     pickle.dump(count_vect,f)