In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that later cells
# can read files stored on the drive. This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from tqdm.notebook import tqdm
import spacy
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Load the MBTI dataset from the mounted Google Drive into `train`.
mbti_csv_path = '/content/drive/MyDrive/spp_project/MBTI.csv'
train = pd.read_csv(filepath_or_buffer=mbti_csv_path)

In [None]:
# Preview the first five rows of the dataset.
train.iloc[:5]

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ


In [None]:
# Preview the last five rows of the dataset.
train.iloc[-5:]

Unnamed: 0,posts,type
106062,stay frustrate world life want take long nap w...,INFP
106063,fizzle around time mention sure mistake thing ...,INFP
106064,schedule modify hey w intp strong wing underst...,INFP
106065,enfj since january busy schedule able spend li...,INFP
106066,feel like men good problem tell parent want te...,INFP


In [None]:
# Display the whole frame; the notebook renders a truncated view (first and last rows).
train

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ
...,...,...
106062,stay frustrate world life want take long nap w...,INFP
106063,fizzle around time mention sure mistake thing ...,INFP
106064,schedule modify hey w intp strong wing underst...,INFP
106065,enfj since january busy schedule able spend li...,INFP


In [None]:
# Feature-only copy of the data: the same rows as `train` with the 'type' label removed.
# Note that this is derived from `train`; it is not a separate held-out set.
test = train.drop(columns=['type'])
test.head()

Unnamed: 0,posts
0,know intj tool use interaction people excuse a...
1,rap music ehh opp yeah know valid well know fa...
2,preferably p hd low except wew lad video p min...
3,drink like wish could drink red wine give head...
4,space program ah bad deal meing freelance max ...


In [None]:
# X: explanatory variable (post text); Y: target variable (MBTI type label).
X, Y = train['posts'], train['type']

## **EDA**
1. 라벨 개수 확인

In [None]:
# Count the distinct MBTI labels present in the 'type' column.
label_count = len(train['type'].unique())
print(f"{label_count}개")

16개


2. 라벨별 비율 확인

In [None]:
# Number of rows per MBTI type, most frequent first.
train.loc[:, 'type'].value_counts()

INTP    24961
INTJ    22427
INFJ    14963
INFP    12134
ENTP    11725
ENFP     6167
ISTP     3424
ENTJ     2955
ESTP     1986
ENFJ     1534
ISTJ     1243
ISFP      875
ISFJ      650
ESTJ      482
ESFP      360
ESFJ      181
Name: type, dtype: int64

In [None]:
# Disabled: plt.pie expects numeric wedge sizes, but train['type'] holds the string
# labels themselves. If the chart is wanted, pass the per-label counts instead
# (e.g. train['type'].value_counts()).
# plt.pie(train['type'])

3. 결측치 확인

In [None]:
# Number of missing values in each column.
train.isna().sum()

posts    0
type     0
dtype: int64

4. 데이터 중복 여부 확인

In [None]:
# True when the number of distinct post texts equals the number of rows,
# i.e. no post text is repeated.
posts = train['posts']
len(posts) == posts.nunique()

True

5. train, test 행렬 확인

In [None]:
# Report (rows, columns) for the labelled frame and the feature-only frame.
for label, frame in (("train data : ", train), ("test data : ", test)):
    print(label, frame.shape)

train data :  (106067, 2)
test data :  (106067, 1)


6. 각 MBTI 글자별 빈도수 확인

In [None]:
# Frequency of the first MBTI letter (E vs. I).
# `.str[0]` extracts the letter for every row in one vectorized step. Unlike the
# label lookup `train['type'][i]`, it does not require the index to be exactly
# 0..len(train)-1, and it avoids a Python-level loop over every row.
# `first` is kept as a DataFrame with a single column 0, as before.
first = pd.DataFrame(train['type'].str[0].tolist())
first[0].value_counts()

I    80677
E    25390
Name: 0, dtype: int64

In [None]:
# N, S 빈도수 확인
second = []
for i in range(len(train)):
    second.append(train['type'][i][1])
second = pd.DataFrame(second)
second[0].value_counts()

N    96866
S     9201
Name: 0, dtype: int64

In [None]:
# T, F 빈도수 확인
third = []
for i in range(len(train)):
    third.append(train['type'][i][2])
third = pd.DataFrame(third)
third[0].value_counts()

T    69203
F    36864
Name: 0, dtype: int64

In [None]:
# P, J 빈도수 확인
fourth = []
for i in range(len(train)):
    fourth.append(train['type'][i][3])
fourth = pd.DataFrame(fourth)
fourth[0].value_counts()

P    61632
J    44435
Name: 0, dtype: int64

## **Split data**

In [None]:
X = train['posts']  # data features (post text)
Y = train['type']   # labels (MBTI type)

# The 16 classes are heavily imbalanced (the rarest has only a few hundred rows),
# so stratify on the label to keep each class's share the same in the training
# and validation splits. random_state is fixed for reproducibility.
# TODO: also try test_size=0.3
X_train, X_valid, y_train, y_valid = train_test_split(
    X, Y, test_size=0.2, random_state=1, stratify=Y
)

## **TF-IDF Vectorizer**

*   TF(Term Frequency) : 특정 단어가 한 문서(Document) 안에서 등장하는 횟수
*   IDF(Inverse Document Frequency) : 특정 단어가 등장하는 문서 수(DF)의 역수에 로그를 취한 값. 여러 문서에 공통으로 등장하는 단어일수록 값이 작아진다.


*   TF-IDF = TF*IDF



In [None]:
# Build a TF-IDF vectorizer with default settings.
tfidf = TfidfVectorizer()

# Learn the vocabulary/IDF weights from the training posts and convert them
# to a TF-IDF matrix in a single pass.
X_train_tfidf = tfidf.fit_transform(raw_documents=X_train)

1. Grid Search로 최적의 C 탐색

In [None]:
clf = LinearSVC()

# Tune C over the *whole* pipeline rather than placing GridSearchCV inside it.
# With the search nested as the last pipeline step, TF-IDF is fitted once on all of
# X_train before the CV folds are made, so each fold's held-out part has already
# influenced the vocabulary and IDF weights. Searching over the pipeline re-fits
# TF-IDF on the training part of every fold, which gives honest CV scores.
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', clf)])

# Model selection criterion: accuracy.
cv = GridSearchCV(text_clf, {'clf__C': [0.1, 0.3, 0.5, 1.0]}, scoring="accuracy")
cv.fit(X_train, y_train)

# Best regularisation strength found by the search.
C = cv.best_params_['clf__C']

# GridSearchCV refits the best configuration on all of X_train by default;
# keep that fitted pipeline available under the usual name.
text_clf = cv.best_estimator_

In [None]:
# Report the C value selected by the grid search.
best_c_label = "최적의 파라미터 C : "
print(best_c_label, C)

2. Validation Accuracy 확인

In [None]:
# Final model: TF-IDF features fed into a linear SVM.
# Use the C chosen by the grid search (`C`) instead of a hard-coded value, so the
# final model always matches the search result even if the data or grid changes.
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC(C=C))])
text_clf.fit(X_train, y_train)

In [None]:
# Predict the MBTI type for every post in the validation split.
pred = text_clf.predict(X=X_valid)

In [None]:
# Accuracy on the validation split.
# Arguments follow scikit-learn's (y_true, y_pred) order. Accuracy itself is
# symmetric, but keeping the documented order avoids wrong results if this call is
# later swapped for an asymmetric metric (precision, recall, classification_report).
accuracy_score(y_valid, pred)