In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from tqdm.notebook import tqdm
import spacy
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Load the MBTI posts dataset (CSV) from the mounted Drive folder.
data = pd.read_csv('/content/drive/MyDrive/spp_project/mbti_concat.csv')

In [None]:
# Display the loaded frame.
# NOTE(review): in the rendered output the first rows look already cleaned
# (lower-case tokens) while the last rows look like raw posts with quotes/URLs —
# confirm that preprocessing was applied to the whole file.
data

Unnamed: 0.1,Unnamed: 0,posts,type
0,0,know intj tool use interaction people excuse a...,INTJ
1,1,rap music ehh opp yeah know valid well know fa...,INTJ
2,2,preferably p hd low except wew lad video p min...,INTJ
3,3,drink like wish could drink red wine give head...,INTJ
4,4,space program ah bad deal meing freelance max ...,INTJ
...,...,...,...
114737,114737,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP
114738,114738,'So...if this thread already exists someplace ...,ENFP
114739,114739,'So many questions when i do these things. I ...,INTP
114740,114740,'I am very conflicted right now when it comes ...,INFP


In [None]:
# Drop the 'Unnamed: 0' column (presumably an index saved by an earlier export — confirm).
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Re-display the frame after the column drop.
data

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ
...,...,...
114737,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP
114738,'So...if this thread already exists someplace ...,ENFP
114739,'So many questions when i do these things. I ...,INTP
114740,'I am very conflicted right now when it comes ...,INFP


## **EDA**
1. 라벨 개수 확인

In [None]:
# Number of distinct MBTI labels present in the dataset.
print(f"{data['type'].unique().size}개")

16개


2. 라벨별 비율 확인

In [None]:
# Number of posts per MBTI type (class balance).
data['type'].value_counts()

INTP    26265
INTJ    23518
INFJ    16433
INFP    13966
ENTP    12410
ENFP     6842
ISTP     3761
ENTJ     3186
ESTP     2075
ENFJ     1724
ISTJ     1448
ISFP     1146
ISFJ      816
ESTJ      521
ESFP      408
ESFJ      223
Name: type, dtype: int64

3. 결측치 확인

In [None]:
# Missing-value count for every column.
data.isna().sum()

posts    0
type     0
dtype: int64

4. 데이터 중복 여부 확인

In [None]:
# True when every post text is distinct, i.e. there are no duplicated rows.
data['posts'].nunique() == data['posts'].size

True

5. 각 MBTI 글자별 빈도수 확인

In [None]:
# Frequency of the first MBTI letter (E vs. I).
# .str[0] slices every label in one vectorized call instead of a Python loop
# that performs a label-based Series lookup for each row.
first = pd.DataFrame(data['type'].str[0].tolist())
first[0].value_counts()

I    87353
E    27389
Name: 0, dtype: int64

In [None]:
# Frequency of the second MBTI letter (N vs. S).
# .str[1] slices every label in one vectorized call instead of a Python loop
# that performs a label-based Series lookup for each row.
second = pd.DataFrame(data['type'].str[1].tolist())
second[0].value_counts()

N    104344
S     10398
Name: 0, dtype: int64

In [None]:
# Frequency of the third MBTI letter (T vs. F).
# .str[2] slices every label in one vectorized call instead of a Python loop
# that performs a label-based Series lookup for each row.
third = pd.DataFrame(data['type'].str[2].tolist())
third[0].value_counts()

T    73184
F    41558
Name: 0, dtype: int64

In [None]:
# Frequency of the fourth MBTI letter (P vs. J).
# .str[3] slices every label in one vectorized call instead of a Python loop
# that performs a label-based Series lookup for each row.
fourth = pd.DataFrame(data['type'].str[3].tolist())
fourth[0].value_counts()

P    66873
J    47869
Name: 0, dtype: int64

## **Split data**

In [None]:
# Hold out 20% of the rows as the test split; X / y keep the remaining 80%.
# NOTE(review): the original comment here said the data is converted to 2-D for
# downsampling, but no reshape or downsampling happens in this cell — confirm intent.
X_data = data['posts'] # data features
y_data = data['type'] # labels
X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=1)

In [None]:
# Split X / y once more: 20% becomes the validation split, the rest is the training split.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1) # TODO: also try test_size=0.3

train, test 행렬 확인

In [None]:
# Print the shape of the training and test features/labels as a sanity check.
for caption, split in (
    ("X_train data : ", X_train),
    ("y_train data : ", y_train),
    ("X_test data : ", X_test),
    ("y_test data : ", y_test),
):
    print(caption, split.shape)

X_train data :  (73434,)
y_train data :  (73434,)
X_test data :  (22949,)
y_test data :  (22949,)


In [None]:
# Class distribution within the training split.
y_train.value_counts()

INTP    16836
INTJ    15084
INFJ    10449
INFP     8949
ENTP     7909
ENFP     4417
ISTP     2427
ENTJ     2064
ESTP     1302
ENFJ     1061
ISTJ      929
ISFP      718
ISFJ      541
ESTJ      327
ESFP      273
ESFJ      148
Name: type, dtype: int64

## **TF-IDF Vectorizer**

*   TF(Term Frequency) : 특정 단어가 한 문서(Document) 안에서 등장하는 횟수
*   IDF(Inverse Document Frequency) : 특정 단어가 등장하는 문서(Document) 수의 역수 (보통 로그를 취해 사용). 많은 문서에 등장하는 흔한 단어일수록 값이 작아진다.


*   TF-IDF = TF*IDF



In [None]:
# Vectorizer
tfidf = TfidfVectorizer()

# Fit the vectorizer on the training texts and transform them
X_train_tfidf = tfidf.fit_transform(X_train)

1. Grid Search

In [None]:
# Tune LinearSVC's regularisation strength C with a cross-validated grid search.
#
# The search runs over the whole TF-IDF + SVM pipeline, so GridSearchCV clones and
# refits the vectorizer inside every fold. Previously the vectorizer was fitted on
# all of X_train before the search, which let each fold's held-out texts influence
# the vocabulary / IDF weights used to score them. The earlier standalone
# clf.fit(...) is dropped: GridSearchCV works on clones, so that fit was unused.
# Trade-off: the vectorizer is now refitted for every (C, fold) pair, so the
# search takes longer.
clf = LinearSVC()
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', clf)])

# Accuracy is the model-selection criterion; 'clf__C' addresses C of the 'clf' step.
cv = GridSearchCV(text_clf, {'clf__C': [0.1, 0.2, 0.3, 0.4, 0.5, 1.0]}, scoring="accuracy")
cv.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the pipeline refitted on all of
# X_train using the best C.
text_clf = cv.best_estimator_
C = cv.best_params_['clf__C']

In [None]:
# Report the C value selected by the grid search.
print("최적의 파라미터 C : ", C)

최적의 파라미터 C :  0.4


2. Validation Accuracy 확인

In [None]:
# Fit TF-IDF + LinearSVC on the training split using the C chosen by the grid
# search. Referencing C (instead of repeating its value as a literal) keeps this
# cell in sync if the search result changes.
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC(C=C))])
text_clf.fit(X_train, y_train)

In [None]:
# Predict MBTI types for the validation split
pred = text_clf.predict(X_valid)

In [None]:
# Accuracy on the validation split.
# scikit-learn metrics take (y_true, y_pred); the arguments were swapped before.
# Accuracy itself is symmetric, so the value is unchanged, but the call now
# matches the API and the classification_report call on the same data.
accuracy_score(y_valid, pred)

0.8295658804945804

In [None]:
# Per-class precision / recall / F1 on the validation split.
from sklearn.metrics import classification_report
print(classification_report(y_valid, pred))

              precision    recall  f1-score   support

        ENFJ       0.87      0.56      0.68       305
        ENFP       0.84      0.75      0.79      1073
        ENTJ       0.90      0.74      0.81       462
        ENTP       0.85      0.81      0.83      2018
        ESFJ       0.86      0.38      0.52        32
        ESFP       0.91      0.47      0.62        62
        ESTJ       0.98      0.70      0.82        76
        ESTP       0.96      0.87      0.92       347
        INFJ       0.81      0.83      0.82      2669
        INFP       0.78      0.85      0.81      2283
        INTJ       0.83      0.88      0.85      3648
        INTP       0.83      0.89      0.86      4230
        ISFJ       0.79      0.50      0.61       128
        ISFP       0.77      0.50      0.60       187
        ISTJ       0.87      0.59      0.70       240
        ISTP       0.89      0.75      0.82       599

    accuracy                           0.83     18359
   macro avg       0.86   

In [None]:
# Plain Python list of the validation labels, so they can be indexed by position.
y_valid_list = y_valid.tolist()

In [None]:
# Share of validation samples whose first MBTI letter was predicted correctly.
num = sum(1 for actual, guess in zip(y_valid_list, pred) if actual[0] == guess[0])
print("첫 번째 글자의 정확도 : ", num/len(y_valid))


첫 번째 글자의 정확도 :  0.9318590337164333


In [None]:
# Share of validation samples whose second MBTI letter was predicted correctly.
num = sum(1 for actual, guess in zip(y_valid_list, pred) if actual[1] == guess[1])
print("두 번째 글자의 정확도 : ", num/len(y_valid))

두 번째 글자의 정확도 :  0.9672640122011003


In [None]:
# Share of validation samples whose third MBTI letter was predicted correctly.
num = sum(1 for actual, guess in zip(y_valid_list, pred) if actual[2] == guess[2])
print("세 번째 글자의 정확도 : ", num/len(y_valid))

세 번째 글자의 정확도 :  0.9327850100768016


In [None]:
# Share of validation samples whose fourth MBTI letter was predicted correctly.
num = sum(1 for actual, guess in zip(y_valid_list, pred) if actual[3] == guess[3])
print("네 번째 글자의 정확도 : ", num/len(y_valid))

네 번째 글자의 정확도 :  0.9039163353123808


In [None]:
# Vectorize all of X (training + validation texts) and fit a default LinearSVC on it.
# The previous version fitted both the vectorizer and the classifier on
# X_test / y_test, i.e. it trained on the held-out test split. The two statements
# are kept in one cell because the feature matrix and the labels must come from
# the same split.
X_tfidf = tfidf.fit_transform(X)
clf = LinearSVC()
clf.fit(X_tfidf, y)

In [None]:
# Final model: TF-IDF + LinearSVC fitted on X / y (training + validation data)
# with the C chosen by the grid search. Referencing C (instead of repeating its
# value as a literal) keeps this cell in sync if the search result changes.
svc_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC(C=C))])
svc_clf.fit(X, y)

In [None]:
# Predict MBTI types for the held-out test split.
pred_svc = svc_clf.predict(X_test)

In [None]:
# Wrap the test predictions in a single-column DataFrame (column label 0).
test_pred = pd.DataFrame(pred_svc)

In [None]:
# Display the predicted labels for the test split.
test_pred

Unnamed: 0,0
0,INTP
1,INFJ
2,INFP
3,ISTP
4,INFJ
...,...
22944,INFJ
22945,INFP
22946,ESTP
22947,INFJ


In [None]:
# Accuracy on the held-out test split.
# scikit-learn metrics take (y_true, y_pred); the arguments were swapped before.
# The 1-D prediction array is passed directly instead of the one-column DataFrame
# wrapper. Accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, pred_svc)

0.8343282931718158

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
from sklearn.metrics import classification_report
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

        ENFJ       0.86      0.62      0.72       358
        ENFP       0.83      0.76      0.80      1352
        ENTJ       0.93      0.74      0.82       660
        ENTP       0.86      0.82      0.84      2483
        ESFJ       0.85      0.26      0.39        43
        ESFP       0.77      0.41      0.54        73
        ESTJ       0.95      0.79      0.86       118
        ESTP       0.96      0.89      0.93       426
        INFJ       0.82      0.84      0.83      3315
        INFP       0.79      0.85      0.82      2734
        INTJ       0.83      0.87      0.85      4786
        INTP       0.83      0.89      0.86      5199
        ISFJ       0.79      0.53      0.63       147
        ISFP       0.86      0.51      0.64       241
        ISTJ       0.86      0.56      0.68       279
        ISTP       0.89      0.79      0.83       735

    accuracy                           0.83     22949
   macro avg       0.85   

# **모델 저장**

In [None]:
import pickle

# Serialize the fitted TF-IDF + LinearSVC pipeline to a binary pickle file
# in the current working directory.
with open('saved_model_ver2', mode='wb') as model_file:
    pickle.dump(svc_clf, model_file)