## **데이터 로드**

In [2]:
# Mount Google Drive at /content/drive so the project CSV can be read from it
# (this import is only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from tqdm.notebook import tqdm
import spacy
from nltk.stem.snowball import SnowballStemmer

In [4]:
# Load the project CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/spp_project/data_result.csv')

In [5]:
# Preview the frame exactly as loaded (a bare last expression is rendered by the notebook).
data

Unnamed: 0.1,Unnamed: 0,posts,type
0,0,know tool use interaction people excuse antiso...,INTJ
1,1,rap music ehh opp yeah know valid well know fa...,INTJ
2,2,preferably hd low except wew lad video mind go...,INTJ
3,3,drink like wish could drink red wine give head...,INTJ
4,4,space program ah bad deal meing freelance max ...,INTJ
...,...,...,...
98161,98161,refuel ne motivation general nature mind bui n...,INFP
98162,98162,confidence btw loon oh god absolutely also str...,INFP
98163,98163,person see actually help build confidence able...,INFP
98164,98164,eat good come lose weight lonerm excercise sep...,INFP


In [6]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier export — confirm).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a second
# time raises KeyError because the column has already been dropped from `data`.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Drop every row (axis=0) that contains a missing value.
data= data.dropna(axis=0)

In [8]:
# Preview the frame after the column drop and the missing-value filter.
data

Unnamed: 0,posts,type
0,know tool use interaction people excuse antiso...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably hd low except wew lad video mind go...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ
...,...,...
98160,much interest activity outside school gymnasti...,INFP
98161,refuel ne motivation general nature mind bui n...,INFP
98162,confidence btw loon oh god absolutely also str...,INFP
98163,person see actually help build confidence able...,INFP


## **Encoding**

In [10]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encode both the feature column (posts) and the target column (type) so that
# they can be handed to SMOTE.
# NOTE(review): OneHotEncoder is given the whole post text as a single categorical
# value, so presumably every distinct post becomes its own column — confirm this is
# the intended representation for oversampling.
encoder_X = OneHotEncoder()
encoded_X = encoder_X.fit_transform(data['posts'].to_numpy().reshape(-1, 1))

# LabelEncoder expects a 1-D label array; passing an (n, 1) column vector makes
# scikit-learn emit a DataConversionWarning from column_or_1d.
encoder_y = LabelEncoder()
encoded_y = encoder_y.fit_transform(data['type'].to_numpy())

  y = column_or_1d(y, warn=True)


In [11]:
# Show the integer-encoded target labels.
encoded_y

array([10, 10, 10, ...,  9,  9,  9])

In [12]:
# Collect the distinct integer labels produced by the label encoder and print them.
unique_array = np.unique(encoded_y)
print(unique_array)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


## **SMOTE 기법 적용: 클래스 불균형 해소**

In [13]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [51]:
import sklearn
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the encoded features and labels (fixed seed for repeatability).
# NOTE(review): the resampling runs on the full data set, before the
# train/validation/test split performed later in this notebook, so oversampled
# rows can end up on both sides of the split — confirm the reported scores are
# not inflated by this.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(encoded_X, encoded_y)

## **Decoding**

In [15]:
# Helpers that decode the encoded variables back into their original string form.
def text_inverse_transform_X(encoded_data, encoder):
    """Decode encoded feature data with the encoder that produced it.

    Args:
        encoded_data: Encoded values accepted by ``encoder.inverse_transform``.
        encoder: Fitted object exposing ``inverse_transform``.

    Returns:
        Whatever ``encoder.inverse_transform(encoded_data)`` returns.
    """
    return encoder.inverse_transform(encoded_data)

def text_inverse_transform_y(encoded_data, encoder):
    """Decode encoded target labels with the encoder that produced them.

    Args:
        encoded_data: Encoded labels accepted by ``encoder.inverse_transform``.
        encoder: Fitted object exposing ``inverse_transform``.

    Returns:
        Whatever ``encoder.inverse_transform(encoded_data)`` returns.
    """
    return encoder.inverse_transform(encoded_data)

In [16]:
# Decode the resampled features and labels back to their original values
# using the same encoders that were fitted before SMOTE.
X_data = text_inverse_transform_X(X_resampled, encoder_X)
y_data = text_inverse_transform_y(y_resampled, encoder_y)

In [17]:
# Show the decoded target labels.
y_data

array(['INTJ', 'INTJ', 'INTJ', ..., 'ISTP', 'ISTP', 'ISTP'], dtype=object)

In [18]:
# Show the decoded posts (the output is a 2-D array with one post per row).
X_data

array([['know tool use interaction people excuse antisocial truly enlighten mastermind know would count pet peeze something time matter people either whether group people mall never see best friend sit outside conversation jsut listen want interject sit formulate say wait inject argument thought find fascinate sit watch people talk people fascinate sit class watch different people find intrigue dad stand look like line safeway watch people home talk people like think military job people voluntarily go job important show deference endanger live glorify way civilian think pretty ignorant general think military necessary defense mechanism political tactic feel like specifically invest much money could put money education whatnot though personally sound budget aernative really comment one way base two politician eye year ago come name somewhat important kinda role model nowadays pick keep score individual level mean little vary accord number condition day may score high others low sweat re

## **data split**

In [21]:
# The decoded posts come back as a 2-D (n, 1) array, so iterating over it directly
# yields 1-element arrays rather than strings; a later str() on such an element
# produces "['...']" instead of the post text. Flatten first so every element is
# the post itself.
# Any value that is still bytes is decoded as UTF-8; everything else is kept as-is.
X_decoded = [x.decode('utf-8') if isinstance(x, bytes) else x for x in np.asarray(X_data).ravel()]
y_decoded = [y.decode('utf-8') if isinstance(y, bytes) else y for y in y_data]

In [22]:
# First hold out 20% of the samples as the test set ...
X, X_test, y, y_test = train_test_split(
    X_decoded, y_decoded, test_size=0.2, random_state=1
)
# ... then split 20% of the remainder off as the validation set.
# TODO: also try test_size=0.3
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=1
)

## **TF-IDF Vectorizer**

*   TF(Term Frequency) : 한 Document 안에서 특정 단어가 등장하는 횟수
*   IDF(Inverse Document Frequency) : 특정 단어가 등장하는 Document 수의 역수 (보통 로그를 취해 사용)


*   TF-IDF = TF*IDF



In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Stand-alone TF-IDF vectorizer; lowercasing is switched off.
tfidf = TfidfVectorizer(lowercase=False)

# Stringify every training sample first: passing X_train to the vectorizer
# unchanged raised an error.
X_train_str = list(map(str, X_train))

# Fit the vectorizer on the training posts and vectorize them in one step.
X_train_tfidf = tfidf.fit_transform(X_train_str)

## **Grid SearchCV**

In [26]:
# Search for the best LinearSVC regularization strength C with a cross-validated
# grid search scored on accuracy.
# GridSearchCV clones and fits the estimator itself, so the estimator is passed
# unfitted; fitting it beforehand would only spend time on a model that is discarded.
clf = LinearSVC()
cv = GridSearchCV(clf, {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 1.0]}, scoring="accuracy")

# TF-IDF features feed the grid search, which is the final step of the pipeline.
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', cv)])
text_clf.fit(X_train_str, y_train)

# best_params_ holds the winning grid point.
# NOTE(review): if the selected C is the largest value in the grid, consider
# extending the grid upward.
C = cv.best_params_['C']

In [27]:
# Report the C value selected by the grid search.
print("최적의 파라미터 C : ", C)

최적의 파라미터 C :  1.0


## **모델 학습 및 Validation data 예측**

In [29]:
# Train the final validation-stage pipeline with the hyperparameter found by the
# grid search. The tuned value `C` is used instead of a hard-coded literal so this
# cell stays in sync if the search result changes.
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC(C=C))])
text_clf.fit(X_train_str, y_train)

In [31]:
# Stringify every validation sample so the pipeline's vectorizer receives text.
X_valid_str = list(map(str, X_valid))

In [32]:
# Predict the MBTI type for every validation sample.
pred = text_clf.predict(X_valid_str)

In [33]:
# Accuracy on the validation data.
# Arguments follow the (y_true, y_pred) order of the scikit-learn metric API,
# matching the classification_report call on the same data.
accuracy_score(y_valid, pred)

0.938636577044665

In [34]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the validation data.
print(classification_report(y_valid, pred))

              precision    recall  f1-score   support

        ENFJ       0.98      1.00      0.99      4029
        ENFP       0.93      0.93      0.93      4080
        ENTJ       0.97      0.99      0.98      3885
        ENTP       0.89      0.89      0.89      3940
        ESFJ       1.00      1.00      1.00      3927
        ESFP       1.00      1.00      1.00      4078
        ESTJ       1.00      1.00      1.00      4013
        ESTP       0.99      1.00      1.00      3916
        INFJ       0.87      0.84      0.85      4026
        INFP       0.93      0.96      0.94      4038
        INTJ       0.76      0.73      0.74      3954
        INTP       0.75      0.70      0.73      4040
        ISFJ       0.99      1.00      0.99      4052
        ISFP       0.99      1.00      0.99      3942
        ISTJ       0.98      1.00      0.99      3994
        ISTP       0.96      0.98      0.97      3984

    accuracy                           0.94     63898
   macro avg       0.94   

## **Test data 예측**

In [43]:
# Stringify every sample of the combined train+validation portion.
X_str = list(map(str, X))

In [40]:
# Stringify every held-out test sample.
X_test_str = list(map(str, X_test))

In [41]:
# Vectorize the test posts with the vectorizer that was already fitted on the
# training posts. Only transform() is called: fit_transform() here would re-learn
# the vocabulary and IDF weights from the test split and overwrite the fitted state.
X_tfidf = tfidf.transform(X_test_str)

In [42]:
# NOTE(review): this fits a fresh LinearSVC on the *test* features and labels.
# The predictions reported afterwards come from `svc_clf`, not from this `clf`,
# so this model appears to be unused — confirm and consider removing the cell.
clf = LinearSVC()
clf.fit(X_tfidf, y_test)

In [44]:
# Final model: TF-IDF + LinearSVC trained on the combined train+validation data,
# to be evaluated on the test data. The tuned value `C` from the grid search is
# used instead of a hard-coded literal so the two cannot drift apart.
svc_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC(C=C))])
svc_clf.fit(X_str, y)

In [45]:
# Predict the MBTI type for every test sample with the final pipeline.
pred_svc = svc_clf.predict(X_test_str)

In [46]:
# Wrap the predicted labels in a DataFrame for display.
test_pred = pd.DataFrame(pred_svc)

In [47]:
# Preview the predicted labels for the test set.
test_pred

Unnamed: 0,0
0,ESFJ
1,ESTJ
2,ISTP
3,ENTJ
4,INFP
...,...
79867,ISTP
79868,ESFP
79869,ESFJ
79870,ESFJ


In [48]:
# Accuracy on the test data, with arguments in the (y_true, y_pred) order of the
# scikit-learn metric API, matching the classification_report call on the same data.
accuracy_score(y_test, test_pred)

0.94384765625

In [49]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test data.
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

        ENFJ       0.98      1.00      0.99      4902
        ENFP       0.94      0.95      0.95      4977
        ENTJ       0.97      0.99      0.98      5060
        ENTP       0.90      0.90      0.90      5035
        ESFJ       1.00      1.00      1.00      4989
        ESFP       1.00      1.00      1.00      4883
        ESTJ       1.00      1.00      1.00      5109
        ESTP       0.99      1.00      1.00      5011
        INFJ       0.89      0.85      0.87      5010
        INFP       0.94      0.97      0.95      4973
        INTJ       0.78      0.72      0.75      4949
        INTP       0.76      0.72      0.74      4960
        ISFJ       0.99      1.00      1.00      4950
        ISFP       0.99      1.00      0.99      4929
        ISTJ       0.99      1.00      0.99      5103
        ISTP       0.96      0.99      0.98      5032

    accuracy                           0.94     79872
   macro avg       0.94   

## **모델 저장**

In [50]:
import pickle

# Serialize the fitted final pipeline (vectorizer + classifier) to a file in the
# current working directory.
# NOTE(review): pickle files execute code when loaded — only load this file from
# a trusted location.
with open('saved_model_result', 'wb') as f:
    pickle.dump(svc_clf, f)