In [110]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from tqdm.notebook import tqdm
import spacy
from nltk.stem.snowball import SnowballStemmer

In [112]:
train = pd.read_csv('/content/drive/MyDrive/spp_project/mbti_concat.csv')

In [113]:
train

Unnamed: 0.1,Unnamed: 0,posts,type
0,0,know intj tool use interaction people excuse a...,INTJ
1,1,rap music ehh opp yeah know valid well know fa...,INTJ
2,2,preferably p hd low except wew lad video p min...,INTJ
3,3,drink like wish could drink red wine give head...,INTJ
4,4,space program ah bad deal meing freelance max ...,INTJ
...,...,...,...
114737,114737,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP
114738,114738,'So...if this thread already exists someplace ...,ENFP
114739,114739,'So many questions when i do these things. I ...,INTP
114740,114740,'I am very conflicted right now when it comes ...,INFP


In [114]:
train = train.drop(['Unnamed: 0'], axis=1)

In [115]:
train

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ
...,...,...
114737,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP
114738,'So...if this thread already exists someplace ...,ENFP
114739,'So many questions when i do these things. I ...,INTP
114740,'I am very conflicted right now when it comes ...,INFP


In [116]:
test = train.drop(['type'], axis=1)
test.head()

Unnamed: 0,posts
0,know intj tool use interaction people excuse a...
1,rap music ehh opp yeah know valid well know fa...
2,preferably p hd low except wew lad video p min...
3,drink like wish could drink red wine give head...
4,space program ah bad deal meing freelance max ...


## **EDA**
1. 라벨 개수 확인

In [117]:
print(f"{len(train['type'].unique())}개")

16개


2. 라벨별 비율 확인

In [118]:
train['type'].value_counts()

INTP    26265
INTJ    23518
INFJ    16433
INFP    13966
ENTP    12410
ENFP     6842
ISTP     3761
ENTJ     3186
ESTP     2075
ENFJ     1724
ISTJ     1448
ISFP     1146
ISFJ      816
ESTJ      521
ESFP      408
ESFJ      223
Name: type, dtype: int64

In [119]:
# plt.pie(train['type'])

3. 결측치 확인

In [120]:
train.isnull().sum()

posts    0
type     0
dtype: int64

4. 데이터 중복 여부 확인

In [121]:
train['posts'].nunique() == len(train['posts'])

True

5. train, test 행렬 확인

In [122]:
print("train data : ", train.shape)
print("test data : ", test.shape)

train data :  (114742, 2)
test data :  (114742, 1)


6. 각 MBTI 글자별 빈도수 확인

In [123]:
# E, I 빈도수 확인
first = []
for i in range(len(train)):
    first.append(train['type'][i][0])
first = pd.DataFrame(first)
first[0].value_counts()

I    87353
E    27389
Name: 0, dtype: int64

In [124]:
# N, S 빈도수 확인
second = []
for i in range(len(train)):
    second.append(train['type'][i][1])
second = pd.DataFrame(second)
second[0].value_counts()

N    104344
S     10398
Name: 0, dtype: int64

In [125]:
# T, F 빈도수 확인
third = []
for i in range(len(train)):
    third.append(train['type'][i][2])
third = pd.DataFrame(third)
third[0].value_counts()

T    73184
F    41558
Name: 0, dtype: int64

In [126]:
# P, J 빈도수 확인
fourth = []
for i in range(len(train)):
    fourth.append(train['type'][i][3])
fourth = pd.DataFrame(fourth)
fourth[0].value_counts()

P    66873
J    47869
Name: 0, dtype: int64

## **Split data**

In [127]:
X = train[['posts']] # data features
Y = train[['type']] # labels
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.2, random_state=1) # test size = 0.3도 해보기

In [128]:
y_train.value_counts()

type
INTP    21066
INTJ    18732
INFJ    13118
INFP    11232
ENTP     9927
ENFP     5490
ISTP     3026
ENTJ     2526
ESTP     1649
ENFJ     1366
ISTJ     1169
ISFP      905
ISFJ      669
ESTJ      403
ESFP      335
ESFJ      180
dtype: int64

# **데이터 클래스 불균형 해결-Undersampling**

In [129]:
!pip install imbalanced-learn



In [130]:
from imblearn.under_sampling import RandomUnderSampler

X_resampled, y_resampled = RandomUnderSampler(random_state=0).fit_resample(X_train, y_train)

In [131]:
X_resampled = X_resampled.to_numpy()
y_resampled = y_resampled.to_numpy()
X_valid = X_valid.to_numpy()
y_valid = y_valid.to_numpy()

In [132]:
X_resampled_reshape = X_resampled.ravel()
y_resampled_reshape = y_resampled.ravel()
X_valid_reshape = X_valid.ravel()
y_valid_reshape = y_valid.ravel()

In [133]:
X = X.to_numpy()
Y = Y.to_numpy()
X_reshape = X.ravel()
Y_reshape = Y.ravel()

In [134]:
# X_resampled_reshape = pd.DataFrame(X_resampled, columns = ['type'])
# y_resampled_reshape = pd.DataFrame(y_resampled, columns = ['type'])
# X_valid_reshape = pd.DataFrame(X_valid, columns = ['type'])
# y_valid_reshape = pd.DataFrame(y_valid, columns = ['type'])

In [135]:
import seaborn as sns

In [136]:
y_resampled_reshape

array(['ENFJ', 'ENFJ', 'ENFJ', ..., 'ISTP', 'ISTP', 'ISTP'], dtype=object)

In [138]:
X_resampled_reshape

array(['project order compare two could argue like christianity judaism term relate share lot root uimately come much different conclusion guess esfj feel similar personality type quite many thrift store love majority wardrobe market lot unique cheap sometimes bnwt find okay talaga bang sure ka na miss back baka naman naghihintay lang siya ng move mo hehe process type refer process v resu dichotomy interest stuff imo data visualize anecdotally mom stepmom esfjs personality disorder former borderline latter obsessive compulsive personality disorder skeptical read guy sensitive tend overreact lol way phrase thing tell hilarious ask question yet also attack base nice logical read another example might sound like base research would say asshole would admit asshole would accept research say asshole question unlikely yield useful answer even phrase overly sensitive overreact judgement imply friend level sensitivity reaction wrong accord standard leave room mode exist short way always communi

In [139]:
y_resampled_reshape

array(['ENFJ', 'ENFJ', 'ENFJ', ..., 'ISTP', 'ISTP', 'ISTP'], dtype=object)

## **TF-IDF Vectorizer**

*   TF(Term Frequency) : 특정 단어가 등장하는 횟수
*   IDF(Inverse Document Frequency) : 특정 단어가 몇 개의 Document에서 등장하는지의 역수


*   TF-IDF = TF*IDF



In [140]:
# 벡터화
tfidf = TfidfVectorizer()

# 훈련 데이터 벡터화
X_train_tfidf = tfidf.fit_transform(X_resampled_reshape)

Grid Search

In [141]:
clf = LinearSVC()
# 정확도 기준 설정
cv = GridSearchCV(clf, {'C': [0.1, 0.3, 0.5, 1.0]}, scoring = "accuracy")

text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',cv)])
text_clf.fit(X_resampled_reshape, y_resampled_reshape)

C = cv.best_estimator_.C

In [142]:
print("최적의 파라미터 C : ", C)

최적의 파라미터 C :  0.3


2. Validation Accuracy 확인

In [143]:
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC(C=0.3))])
text_clf.fit(X_resampled_reshape, y_resampled_reshape)

In [144]:
# valid 데이터의 mbti 예측
pred = text_clf.predict(X_valid_reshape)

In [145]:
# valid data에서의 정확도
accuracy_score(pred, y_valid_reshape)

0.6820776504422851

In [146]:
from sklearn.metrics import classification_report
print(classification_report(y_valid_reshape, pred))

              precision    recall  f1-score   support

        ENFJ       0.44      0.72      0.55       358
        ENFP       0.63      0.73      0.68      1352
        ENTJ       0.60      0.80      0.69       660
        ENTP       0.80      0.68      0.73      2483
        ESFJ       0.13      0.84      0.22        43
        ESFP       0.23      0.88      0.37        73
        ESTJ       0.66      0.82      0.73       118
        ESTP       0.49      0.90      0.63       426
        INFJ       0.78      0.65      0.71      3315
        INFP       0.78      0.65      0.71      2734
        INTJ       0.80      0.66      0.72      4786
        INTP       0.81      0.67      0.73      5199
        ISFJ       0.21      0.78      0.33       147
        ISFP       0.23      0.71      0.35       241
        ISTJ       0.24      0.75      0.37       279
        ISTP       0.60      0.77      0.67       735

    accuracy                           0.68     22949
   macro avg       0.53   

In [147]:
y_valid_list = list(y_valid_reshape)

In [148]:
num = 0
for i in range(len(y_valid_reshape)):
    if y_valid_list[i][0] == pred[i][0]:
        num+=1
print("첫 번째 글자의 정확도 : ", num/len(y_valid_reshape))


첫 번째 글자의 정확도 :  0.860734672534751


In [149]:
num = 0
for i in range(len(y_valid_reshape)):
    if y_valid_list[i][1] == pred[i][1]:
        num+=1
print("두 번째 글자의 정확도 : ", num/len(y_valid_reshape))

두 번째 글자의 정확도 :  0.875419408253083


In [150]:
num = 0
for i in range(len(y_valid_reshape)):
    if y_valid_list[i][2] == pred[i][2]:
        num+=1
print("세 번째 글자의 정확도 : ", num/len(y_valid_reshape))

세 번째 글자의 정확도 :  0.880343370081485


In [151]:
num = 0
for i in range(len(y_valid_reshape)):
    if y_valid_list[i][3] == pred[i][3]:
        num+=1
print("네 번째 글자의 정확도 : ", num/len(y_valid_reshape))

네 번째 글자의 정확도 :  0.8223452002265894


In [153]:
# 모든 설명변수 데이터 X 자연어처리
X_tfidf = tfidf.fit_transform(X_reshape)

In [154]:
svc_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC(C=0.3))])
svc_clf.fit(X_reshape, Y_reshape)

In [155]:
pred_svc = svc_clf.predict(test['posts'])

In [156]:
test_pred = pd.DataFrame(pred_svc)

In [157]:
test_pred

Unnamed: 0,0
0,INTJ
1,INTJ
2,INTJ
3,INTJ
4,INTJ
...,...
114737,INTP
114738,ENFP
114739,INTP
114740,INFP


# **모델저장**

In [161]:
import pickle

with open('saved_model_upgrade', 'wb') as f:
    pickle.dump(svc_clf, f)

In [162]:
accuracy_score(test_pred, Y_reshape)

0.9289710829513169

In [163]:
from sklearn.metrics import classification_report
print(classification_report(Y_reshape, test_pred))

              precision    recall  f1-score   support

        ENFJ       0.96      0.83      0.89      1724
        ENFP       0.94      0.90      0.92      6842
        ENTJ       0.97      0.89      0.93      3186
        ENTP       0.95      0.92      0.93     12410
        ESFJ       0.96      0.69      0.80       223
        ESFP       0.97      0.77      0.86       408
        ESTJ       0.99      0.92      0.95       521
        ESTP       0.99      0.96      0.97      2075
        INFJ       0.92      0.93      0.92     16433
        INFP       0.90      0.94      0.92     13966
        INTJ       0.92      0.94      0.93     23518
        INTP       0.92      0.95      0.94     26265
        ISFJ       0.97      0.80      0.87       816
        ISFP       0.95      0.79      0.86      1146
        ISTJ       0.96      0.85      0.90      1448
        ISTP       0.96      0.92      0.94      3761

    accuracy                           0.93    114742
   macro avg       0.95   