# Project 5 MBTI Label words

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk 
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from nltk.corpus import stopwords
from wordcloud import WordCloud

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhengfeichen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhengfeichen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhengfeichen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Import the data and check / clean the data first

In [3]:
df = pd.read_csv('../data/mbti_1.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [8]:
df.info()
## No need to remove na value

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [7]:
df.type.value_counts()

type
INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: count, dtype: int64

An imbalanced dataset needs process then.

### Process our X (df.posts)

In [14]:
def lemma_sentence(post):
    lemmatizer = WordNetLemmatizer()
    token_words = word_tokenize(post)
    lemmatize_sentence = [lemmatizer.lemmatize(word) for word in token_words] 
    return " ".join(lemmatize_sentence)

def remove_stopwords(post):
    tokens = word_tokenize(post)
    stop = stopwords.words('english')
    filtered_tokens = [word for word in tokens if word.lower() not in stop] 
    return ' '.join(filtered_tokens)

In [15]:
X_stem = df.posts.apply(lemma_sentence)

In [17]:
X_stem = X_stem.apply(remove_stopwords)

In [18]:
X_stem

0       'http : //www.youtube.com/watch ? v=qsXHcwe3kr...
1       ' 'm finding lack post alarming.|||Sex boring ...
2       'Good one _____ http : //www.youtube.com/watch...
3       'Dear INTP , enjoyed conversation day . Esoter...
4       'You 're fired.|||That 's another silly miscon...
                              ...                        
8670    'https : //www.youtube.com/watch ? v=t8edHB_h9...
8671    'So ... thread already exists someplace else (...
8672    'So many question thing . would take purple pi...
8673    ' conflicted right come wanting child . honest...
8674    'It ha long since personalitycafe - although d...
Name: posts, Length: 8675, dtype: object

### Label our y (mbti type)

In [19]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['type'])
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
label_mapping

{'ENFJ': 0,
 'ENFP': 1,
 'ENTJ': 2,
 'ENTP': 3,
 'ESFJ': 4,
 'ESFP': 5,
 'ESTJ': 6,
 'ESTP': 7,
 'INFJ': 8,
 'INFP': 9,
 'INTJ': 10,
 'INTP': 11,
 'ISFJ': 12,
 'ISFP': 13,
 'ISTJ': 14,
 'ISTP': 15}

### Train test group split

In [20]:
X_dev, X_test, y_dev, y_test = train_test_split(X_stem, y, test_size=0.2, random_state=42)

In [23]:
pd.Series(y_dev).value_counts()

9     1462
8     1182
11    1011
10     898
1      550
3      550
15     270
13     218
2      187
14     161
0      149
12     121
7       74
5       40
4       35
6       32
Name: count, dtype: int64

### Tf-IDF & Smote

In [24]:
tfidf_vectorizer = TfidfVectorizer()
X_dev = tfidf_vectorizer.fit_transform(X_dev)
X_test = tfidf_vectorizer.transform(X_test)
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_dev, y_dev)

In [27]:
pd.Series(y_smote).value_counts()

8     1462
1     1462
3     1462
9     1462
0     1462
10    1462
2     1462
11    1462
4     1462
13    1462
15    1462
12    1462
6     1462
7     1462
14    1462
5     1462
Name: count, dtype: int64

### Model training

#### Logistic

#### Random Forest

#### XGBoost

#### SVM

#### Neural Network