In [48]:
import ast
import re

import pandas as pd
import numpy as np
from konlpy.tag import Okt
from konlpy.tag import Kkma  
# from gluonnlp.data import SentencepieceTokenizer
from soynlp.tokenizer import RegexTokenizer, LTokenizer, MaxScoreTokenizer
# from kobert.utils import get_tokenizer
from gensim.models import Word2Vec
from scipy.spatial import distance
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [49]:
# Load the question dataset; the first CSV column becomes the DataFrame index.
# (Presumably the text already has Korean particles removed, going by the file name.)
df = pd.read_csv('data/data_josa_removed.csv', index_col=0)

In [50]:
# Preview the first two rows to check the loaded columns.
df.head(2)

Unnamed: 0,qplay_question_id,text,qtid,name,subtitle,question_difficulty,question_type_difficulty,grade,purpose
0,3504,"['연립부등식', '`', '{', '(', '-', 'x', '-3', '>', ...",H1S1-08-05,해를 갖거나 갖지 않는 연립일차부등식,,3,2.0,10,open
1,3675,"['부등식', '`', '[', 'x', ']', '^', '2', '-2', '[...",H1S1-08-11,절댓값 또는 가우스 기호가 포함된 이차부등식,,3,2.0,10,open


In [51]:
# Convert a token list that was stringified when saved to file back into a list
def text_to_list(x):
    """Parse the string form of a token list back into a list of tokens.

    The input is expected to look like ``"['a', 'b', ...]"``, i.e. the text
    a Python list turns into when it is written to CSV.

    Parameters
    ----------
    x : str or list
        Stringified token list. A value that is already a list is returned
        unchanged, so applying this function twice is harmless.

    Returns
    -------
    list of str
        The tokens in their original order.
    """
    if isinstance(x, list):
        return x

    # Parse as a Python literal first: this keeps tokens that contain an
    # apostrophe (e.g. "f'") or a comma followed by a space intact, and maps
    # "[]" to an empty list instead of [''].
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, list):
        return [str(token) for token in parsed]

    # Fallback for text that is not a valid list literal: strip the outer
    # brackets and all single quotes, then split on the list separator.
    stripped = re.sub(r"^\[|\]$|'", '', x)
    return stripped.split(', ')

In [52]:
# Turn each stringified token list in 'text' back into a Python list.
# This overwrites df['text'] in place, so from here on the column holds lists, not strings.
df['text'] = df['text'].apply(text_to_list)
df['text']

0       [연립부등식, `, {, (, -, x, -3, >, =, -2, x, +, 1, ...
1       [부등식, `, [, x, ], ^, 2, -2, [, x, ], -3, <, 0,...
2       [점, `, (, 1, ,, 2, ), `, 지나, 중심, 직선, `, x, -, ...
3       [`, x, `,, `, y, `, 정수일, 때, ,, 방정식, `, xy, +, ...
4       [평행이동, `, (, x, ,, y, ), ->, (, x, +, 2, ,, y,...
                              ...                        
5359    [두, 사건, `, A, `,, `, B, `, 대하, `, P, (, A, uu,...
5360    [집합, `, X, =, {, 1, ,, 2, ,, 3, ,, 4, ,, 5, },...
5361    [실수, 전체, 집합, 정의된, 함수, `, f, (, x, ), `, 다음, 조건...
5362    [실수, `, x, `, 대, 두, 조건, `, p, `,, `, q, `, 다음,...
5363    [같, 종류, 비어, 있, 상자, `, 3, `, 개, 있, ., 같, 종류, 장난...
Name: text, Length: 5364, dtype: object

In [53]:
# Make chapter with qtid
# Drop the trailing "-NN" suffix of the question-type id to get the chapter code,
# e.g. 'H1S1-08-05' -> 'H1S1-08'. The pattern is anchored at the end of the
# string, so only the last two-digit group is removed.
df['chapter'] = df['qtid'].str.replace(r'-[0-9]{2}$', '', regex=True)
df['chapter']

0       H1S1-08
1       H1S1-08
2       H1S1-11
3       H1S1-07
4       H1S1-12
         ...   
5359    HSTA-03
5360    H1S2-07
5361    H1S2-04
5362    H1S2-03
5363    H1S2-07
Name: chapter, Length: 5364, dtype: object

### Word2vec 적용

In [75]:
# Training corpus: one token list per question.
tokens = df['text']

# Hyperparameters; v_dimension is also read by the embedding helpers further down.
v_dimension = 500  # length of each word vector
v_window = 50      # context window size passed to Word2Vec

# Train the word vectors. sg=0 selects CBOW; words seen fewer than
# min_count times are left out of the vocabulary.
# NOTE: `size` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`).
model = Word2Vec(
    sentences=tokens.tolist(),
    size=v_dimension,
    window=v_window,
    min_count=2,
    workers=4,
    sg=0,
)

In [76]:
# Get question vectors without normalization

def all_question_embedding(df_tokens, chapters=None):
    """Embed every question as the unnormalized sum of its word vectors.

    Parameters
    ----------
    df_tokens : iterable of token sequences
        One sequence of tokens per question.
    chapters : sequence, optional
        Chapter label for each question, in the same order as ``df_tokens``.
        Defaults to the global ``df['chapter']``.

    Returns
    -------
    pandas.DataFrame
        Columns 'chapter' (label) and 'vector' (numpy array), one row per
        question.
    """
    if chapters is None:
        chapters = df['chapter']

    # Reuse the single-question helper so both code paths embed identically.
    vectors = [single_question_embedding(tokens) for tokens in df_tokens]

    return pd.DataFrame({'chapter': list(chapters), 'vector': vectors})

In [77]:
# 

def single_question_embedding(tokens):
    """Return the unnormalized sum of the Word2Vec vectors of ``tokens``.

    Tokens that are not in the trained model's vocabulary are skipped, so a
    question without any known token yields an all-zero vector of length
    ``v_dimension``.
    """
    keyed_vectors = model.wv
    question_vector = np.zeros(v_dimension)
    for word in tokens:
        if word not in keyed_vectors.vocab:
            continue  # out-of-vocabulary token contributes nothing
        question_vector = question_vector + keyed_vectors[word]
    return question_vector

In [78]:
# Make mid-chapter vectors adding sentence vectors

def chapter_embedding(df):
    """Sum the question vectors of each chapter into one chapter vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'chapter' column (label) and a 'vector' column holding
        one numeric array per row. All vectors of a chapter must share the
        same shape.

    Returns
    -------
    pandas.DataFrame
        Columns 'chapter' and 'vector', one row per distinct chapter in order
        of first appearance. Rows whose chapter is missing (NaN) are ignored.
    """
    chapter_list = []
    vector_list = []

    # A single grouped pass replaces re-filtering the whole frame per chapter;
    # sort=False keeps first-appearance order.
    for chapter, vectors in df.groupby('chapter', sort=False)['vector']:
        add_v = None
        for vec in vectors:
            if add_v is None:
                # Size the accumulator from the data rather than a global constant.
                add_v = np.zeros(np.shape(vec))
            add_v = add_v + vec
        chapter_list.append(chapter)
        vector_list.append(add_v)

    return pd.DataFrame({'chapter': chapter_list, 'vector': vector_list})

In [79]:
# Embed every question in the corpus (one summed word vector per question) and display the result.
all_question_embedded = all_question_embedding(tokens)
all_question_embedded

Unnamed: 0,chapter,vector
0,H1S1-08,"[-40.01940703764558, -7.339770053979009, -15.3..."
1,H1S1-08,"[-21.170333474874496, -4.7475149761885405, -9...."
2,H1S1-11,"[-26.97206992097199, -13.806408300995827, -7.4..."
3,H1S1-07,"[-26.23356848023832, -9.24483785033226, -9.552..."
4,H1S1-12,"[-54.815728230401874, -11.069866370409727, -18..."
...,...,...
5359,HSTA-03,"[-44.88021703064442, -6.145475876517594, -12.5..."
5360,H1S2-07,"[-53.60716703440994, -17.968863031826913, -29...."
5361,H1S2-04,"[-232.08759104227647, -50.72354426421225, -96...."
5362,H1S2-03,"[-56.04652937268838, -11.016352257225662, -22...."


In [80]:
# Collapse the question vectors into one summed vector per chapter and preview two rows.
chapter_embedded = chapter_embedding(all_question_embedded)
chapter_embedded.head(2)

Unnamed: 0,chapter,vector
0,H1S1-08,"[-7484.95755327685, -1482.4981939341233, -3017..."
1,H1S1-11,"[-5304.818932564667, -1741.5109325184021, -145..."


In [81]:
def calculate_cosinesim(question):
    """Rank all chapters by cosine similarity to one question.

    ``question`` is a sequence of tokens. It is embedded with
    ``single_question_embedding`` and compared against every vector in the
    global ``chapter_embedded`` frame.

    Returns a DataFrame with columns 'chapter' and 'result' (the cosine
    similarity), sorted from most to least similar. The index holds each
    chapter's row position in ``chapter_embedded``.
    """
    question_vector = single_question_embedding(question)

    # scipy returns a cosine *distance*; similarity is its complement.
    similarities = [
        1 - distance.cosine(chapter_vector, question_vector)
        for chapter_vector in chapter_embedded['vector']
    ]

    ranking = pd.DataFrame({
        'chapter': chapter_embedded['chapter'],
        'result': similarities,
    })
    return ranking.sort_values('result', ascending=False)

In [82]:
# Spot check: the three chapters most similar to the question with index label 7.
calculate_cosinesim(df.loc[7, 'text']).head(3)

Unnamed: 0,chapter,result
2,H1S1-07,0.891501
9,H1S1-05,0.887569
0,H1S1-08,0.874019


### Model apply

In [83]:
# Apply function to whole dataset

chapter_list = []
cosine_list = []
# Iterate over the token lists directly instead of looking rows up by
# df.loc[i], which only works when the index labels are exactly 0..n-1.
for question_tokens in tqdm(df['text']):
    # calculate_cosinesim returns chapters sorted by similarity, best first,
    # so the first row is the prediction; no second sort is needed.
    best = calculate_cosinesim(question_tokens).iloc[0]
    chapter_list.append(best['chapter'])  # most similar chapter
    cosine_list.append(best['result'])    # its cosine similarity

# Collect the predictions, sharing df's index so that a later
# index-aligned concat with df lines up row for row.
pred = pd.DataFrame(
    {'pred_chapter': chapter_list, 'pred_similarity': cosine_list},
    index=df.index,
)
pred.head(10)

100%|██████████| 5364/5364 [00:15<00:00, 356.21it/s]


Unnamed: 0,pred_chapter,pred_similarity
0,H1S1-08,0.974793
1,H1S1-08,0.892236
2,H1S1-12,0.964719
3,H1S1-06,0.958946
4,H1S1-06,0.952798
5,H1S1-10,0.971247
6,H1S1-09,0.949506
7,H1S1-07,0.891501
8,H1S1-08,0.956248
9,H1S1-09,0.946204


### Evaluation

In [84]:
# Evaluate predict score with accuracy

# Fraction of questions whose top-ranked chapter equals the true chapter.
# accuracy_score compares the two columns position by position.
score = accuracy_score(df['chapter'], pred['pred_chapter'])
print(score)

0.39858314690529456


In [85]:
# Put the true chapter next to the predicted chapter and its similarity for the first ten rows.
pd.concat([df['chapter'], pred], axis=1).head(10)

Unnamed: 0,chapter,pred_chapter,pred_similarity
0,H1S1-08,H1S1-08,0.974793
1,H1S1-08,H1S1-08,0.892236
2,H1S1-11,H1S1-12,0.964719
3,H1S1-07,H1S1-06,0.958946
4,H1S1-12,H1S1-06,0.952798
5,H1S1-10,H1S1-10,0.971247
6,H1S1-09,H1S1-09,0.949506
7,H1S1-03,H1S1-07,0.891501
8,H1S1-08,H1S1-08,0.956248
9,H1S1-11,H1S1-09,0.946204


정확도는 약 0.40으로 낮은 편이지만, 중단원보다 상위 단계인 과정(예: `H1S1`)은 거의 대부분 맞추고 있으며,  
중단원을 틀린 경우에도 한두 단원 전후의 비슷한 중단원을 예측하고 있음.  
한두 단원 차이를 더 정확하게 구분할 수 있도록 하는 작업이 좀 더 필요해 보임.