In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset is
# read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from tqdm.notebook import tqdm
import spacy
from nltk.stem.snowball import SnowballStemmer

In [3]:
# Load the MBTI dataset from the mounted Drive. Later cell outputs show it has
# a `posts` text column and a `type` label column.
train = pd.read_csv('/content/drive/MyDrive/spp_project/MBTI.csv')

In [4]:
# Unlabelled copy of the data: the same rows as `train` with the `type`
# label column removed.
test = train.drop(columns=['type'])
test.head()

Unnamed: 0,posts
0,know intj tool use interaction people excuse a...
1,rap music ehh opp yeah know valid well know fa...
2,preferably p hd low except wew lad video p min...
3,drink like wish could drink red wine give head...
4,space program ah bad deal meing freelance max ...


In [5]:
# Features (explanatory variable): the post text
X = train['posts']
# Target (the variable to predict): the MBTI type label
Y = train['type']

## **EDA**
1. 라벨 개수 확인

In [6]:
# Number of distinct values in the `type` column (the printed suffix "개" is a
# Korean counter word).
print(f"{len(train['type'].unique())}개")

16개


2. 라벨별 비율 확인

In [7]:
# Number of rows per MBTI type.
train['type'].value_counts()

INTP    24961
INTJ    22427
INFJ    14963
INFP    12134
ENTP    11725
ENFP     6167
ISTP     3424
ENTJ     2955
ESTP     1986
ENFJ     1534
ISTJ     1243
ISFP      875
ISFJ      650
ESTJ      482
ESFP      360
ESFJ      181
Name: type, dtype: int64

3. 결측치 확인

In [8]:
# Number of missing values in each column.
train.isnull().sum()

posts    0
type     0
dtype: int64

4. 데이터 중복 여부 확인

In [9]:
# True when the count of distinct post texts equals the number of rows,
# i.e. no post text occurs more than once.
train['posts'].nunique() == len(train['posts'])

True

5. train, test 데이터 크기(shape) 확인

In [10]:
# Shapes (rows, columns) of the labelled frame and of its unlabelled copy.
print("train data : ", train.shape)
print("test data : ", test.shape)

train data :  (106067, 2)
test data :  (106067, 1)


6. 각 MBTI 글자별 빈도수 확인

In [11]:
# Frequency of the 1st MBTI letter (E vs. I).
# The letter is extracted with the vectorized `.str` accessor instead of a
# Python loop over `train['type'][i]`: that lookup is label-based, so it raises
# KeyError whenever the index is not exactly 0..n-1 (e.g. after filtering
# rows), and it is slow on a frame of this size.
# `first` stays a single-column DataFrame (column 0), as before.
first = pd.DataFrame(train['type'].str[0].tolist())
first[0].value_counts()

I    80677
E    25390
Name: 0, dtype: int64

In [12]:
# Frequency of the 2nd MBTI letter (N vs. S).
# The letter is extracted with the vectorized `.str` accessor instead of a
# Python loop over `train['type'][i]`: that lookup is label-based, so it raises
# KeyError whenever the index is not exactly 0..n-1 (e.g. after filtering
# rows), and it is slow on a frame of this size.
# `second` stays a single-column DataFrame (column 0), as before.
second = pd.DataFrame(train['type'].str[1].tolist())
second[0].value_counts()

N    96866
S     9201
Name: 0, dtype: int64

In [13]:
# Frequency of the 3rd MBTI letter (T vs. F).
# The letter is extracted with the vectorized `.str` accessor instead of a
# Python loop over `train['type'][i]`: that lookup is label-based, so it raises
# KeyError whenever the index is not exactly 0..n-1 (e.g. after filtering
# rows), and it is slow on a frame of this size.
# `third` stays a single-column DataFrame (column 0), as before.
third = pd.DataFrame(train['type'].str[2].tolist())
third[0].value_counts()

T    69203
F    36864
Name: 0, dtype: int64

In [14]:
# Frequency of the 4th MBTI letter (P vs. J).
# The letter is extracted with the vectorized `.str` accessor instead of a
# Python loop over `train['type'][i]`: that lookup is label-based, so it raises
# KeyError whenever the index is not exactly 0..n-1 (e.g. after filtering
# rows), and it is slow on a frame of this size.
# `fourth` stays a single-column DataFrame (column 0), as before.
fourth = pd.DataFrame(train['type'].str[3].tolist())
fourth[0].value_counts()

P    61632
J    44435
Name: 0, dtype: int64

## **Split data**

In [15]:
# Commented out (not executed): the split that actually runs is performed
# further down, after the text preprocessing.
# X = train['posts'] # data features
# Y = train['type'] # labels
# X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.2, random_state=1) # also try test_size=0.3

## **불용어 제거, 어간 추출 (Stemming)**

In [21]:
# Load spaCy's small English pipeline and report how many stop words its
# language defaults define.
nlp = spacy.load('en_core_web_sm')
print("stopword : ", len(nlp.Defaults.stop_words))
def removeStopwords(s):
  """Return `s` without the tokens that spaCy flags as stop words.

  `s` is split on whitespace, each token is looked up in the vocabulary of the
  module-level `nlp` pipeline, and the tokens whose `is_stop` flag is not set
  are re-joined with single spaces.
  """
  kept = [token for token in s.split() if not nlp.vocab[token].is_stop]
  return ' '.join(kept)

stopword :  326


In [22]:
# Stemming (not lemmatization): words are reduced to their stems with NLTK's
# English Snowball stemmer.

s_stemmer = SnowballStemmer(language='english')
def replaceStemwords(s):
  """Stem every whitespace-separated token of `s` with the module-level
  `s_stemmer` and re-join the stems with single spaces."""
  return ' '.join(s_stemmer.stem(token) for token in s.split())

In [23]:
# Register tqdm's `progress_apply` on pandas objects; the preprocessing cells
# below use it to show a progress bar.
tqdm.pandas()

In [24]:
# Display the frame before the preprocessing step is applied.
train

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ
...,...,...
106062,stay frustrate world life want take long nap w...,INFP
106063,fizzle around time mention sure mistake thing ...,INFP
106064,schedule modify hey w intp strong wing underst...,INFP
106065,enfj since january busy schedule able spend li...,INFP


In [25]:
# Stem every post, then drop stop words from the stemmed text.
# NOTE(review): stop words are filtered *after* stemming, so a stop word whose
# stem is not itself on spaCy's list would survive — confirm this order is
# intended. Also note that re-running this cell preprocesses the
# already-preprocessed column again.
train['posts'] = train['posts'].progress_apply(
    lambda post: removeStopwords(replaceStemwords(post))
)

  0%|          | 0/106067 [00:00<?, ?it/s]

In [26]:
# Apply the same preprocessing (stem, then drop stop words) to the test data.
# NOTE(review): re-running this cell preprocesses the already-preprocessed
# column again.
test['posts'] = test['posts'].progress_apply(
    lambda post: removeStopwords(replaceStemwords(post))
)

  0%|          | 0/106067 [00:00<?, ?it/s]

In [27]:
# Re-bind features and labels so they refer to the preprocessed `posts` column.
X = train['posts']
Y = train['type']

In [36]:
# Hold out 20% of the rows for validation; random_state=1 fixes the shuffle.
# NOTE(review): the split is not stratified although the label counts shown in
# the EDA section are heavily imbalanced — consider passing stratify=Y.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=1)

In [37]:
# Text classifier: TF-IDF features feeding a linear SVM (C=0.3), fitted on the
# training split.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(C=0.3)),
])
text_clf.fit(X_train, Y_train)

In [38]:
# Predict the MBTI type for each post in the validation split
pred = text_clf.predict(X_valid)

In [39]:
# Accuracy on the validation split.
# scikit-learn's signature is accuracy_score(y_true, y_pred), so the ground
# truth goes first. Accuracy is symmetric, so the value is unchanged, but the
# conventional order keeps this call correct if it is later swapped for an
# asymmetric metric (precision, recall, classification_report, ...).
accuracy_score(Y_valid, pred)

0.8377015178655605