In [2]:
import ast
import re

import pandas as pd
import numpy as np
from konlpy.tag import Okt
from konlpy.tag import Kkma  
from gluonnlp.data import SentencepieceTokenizer
from soynlp.tokenizer import RegexTokenizer, LTokenizer, MaxScoreTokenizer
from kobert.utils import get_tokenizer
from gensim.models import Word2Vec
from scipy.spatial import distance
from tqdm import tqdm

In [3]:
# Directory (relative to the notebook) that holds every CSV read below.
DATA_PATH = "./data"

In [29]:
# Full josa-removed dataset; the first CSV column is used as the index.
DATA = pd.read_csv(
    filepath_or_buffer=DATA_PATH + '/data_josa_removed.csv',
    index_col=0,
)

In [4]:
# Working copy of the same CSV that DATA is read from; the first column is the index.
df = pd.read_csv(
    filepath_or_buffer=DATA_PATH + '/data_josa_removed.csv',
    index_col=0,
)

In [5]:
# Preview the first two rows.
df.head(n=2)

Unnamed: 0,qplay_question_id,text,qtid,name,subtitle,question_difficulty,question_type_difficulty,grade,purpose
0,3504,"['연립부등식', '`', '{', '(', '-', 'x', '-3', '>', ...",H1S1-08-05,해를 갖거나 갖지 않는 연립일차부등식,,3,2.0,10,open
1,3675,"['부등식', '`', '[', 'x', ']', '^', '2', '-2', '[...",H1S1-08-11,절댓값 또는 가우스 기호가 포함된 이차부등식,,3,2.0,10,open


### Preprocess

In [6]:
# Convert a token list that was stringified when saved to CSV back into a real list
def text_to_list(x):
    """Parse the string form of a token list (e.g. "['a', 'b']") into a list of str.

    The string is first parsed as a Python literal, so tokens that contain
    quotes, commas or escapes survive the round trip, and "[]" yields an
    empty list. If the string is not a valid list/tuple literal, the previous
    behaviour is used: strip the outer brackets and every apostrophe, then
    split on ", ".

    Args:
        x: the stringified list. A list/tuple is accepted too and returned as
            a new list, which makes re-running the parsing cell harmless.

    Returns:
        list of tokens.
    """
    if isinstance(x, (list, tuple)):
        return list(x)

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        # Tokens are expected to be strings; coerce any bare literal (e.g. 1) to str.
        return [tok if isinstance(tok, str) else str(tok) for tok in parsed]

    # Legacy fallback for text that is not a well-formed list literal.
    stripped = re.sub(r"^\[|\]$|'", '', x)
    return stripped.split(', ')

In [33]:
# Turn the stringified token lists back into Python lists.
# `df` is parsed as well as `DATA`: df['text'] is what Word2Vec is trained on
# below, and an unparsed string would be iterated character by character.
# Only str values are parsed, so re-running this cell leaves parsed rows untouched.
DATA['text'] = DATA['text'].apply(lambda t: text_to_list(t) if isinstance(t, str) else t)
df['text'] = df['text'].apply(lambda t: text_to_list(t) if isinstance(t, str) else t)
DATA['text']

0       [연립부등식, `, {, (, -, x, -3, >, =, -2, x, +, 1, ...
1       [부등식, `, [, x, ], ^, 2, -2, [, x, ], -3, <, 0,...
2       [점, `, (, 1, ,, 2, ), `, 지나, 중심, 직선, `, x, -, ...
3       [`, x, `,, `, y, `, 정수일, 때, ,, 방정식, `, xy, +, ...
4       [평행이동, `, (, x, ,, y, ), ->, (, x, +, 2, ,, y,...
                              ...                        
5359    [두, 사건, `, A, `,, `, B, `, 대하, `, P, (, A, uu,...
5360    [집합, `, X, =, {, 1, ,, 2, ,, 3, ,, 4, ,, 5, },...
5361    [실수, 전체, 집합, 정의된, 함수, `, f, (, x, ), `, 다음, 조건...
5362    [실수, `, x, `, 대, 두, 조건, `, p, `,, `, q, `, 다음,...
5363    [같, 종류, 비어, 있, 상자, `, 3, `, 개, 있, ., 같, 종류, 장난...
Name: text, Length: 5364, dtype: object

In [34]:
# Make chapter with qtid: drop the trailing "-NN" (two digits) from the id,
# e.g. "H1S1-08-05" -> "H1S1-08".
# The column is added to `df` too, because the evaluation and ranking cells
# read df['chapter'].
DATA['chapter'] = DATA['qtid'].apply(lambda x: re.sub('-[0-9]{2}$', '', x))
df['chapter'] = df['qtid'].apply(lambda x: re.sub('-[0-9]{2}$', '', x))
DATA['chapter']

0       H1S1-08
1       H1S1-08
2       H1S1-11
3       H1S1-07
4       H1S1-12
         ...   
5359    HSTA-03
5360    H1S2-07
5361    H1S2-04
5362    H1S2-03
5363    H1S2-07
Name: chapter, Length: 5364, dtype: object

### Make train, test set

In [9]:
#from sklearn.model_selection import train_test_split

#df, df_test = train_test_split(df, test_size=0.2)

### Word2vec 적용

In [10]:
# Word2Vec hyper-parameters: embedding size and context window.
v_dimension = 500
v_window = 50

# One "sentence" per question; each entry of df['text'] is handed to Word2Vec as-is.
tokens = df['text']

# sg=0 selects CBOW; tokens seen fewer than 2 times are dropped from the vocabulary.
model = Word2Vec(
    sentences=tokens.tolist(),
    size=v_dimension,
    window=v_window,
    min_count=2,
    workers=4,
    sg=0,
)

In [11]:
# Get question vectors without normalization

def all_question_embedding(df_tokens, names=None):
    """Embed every question and pair each vector with its question-type name.

    Each question is embedded with `single_question_embedding`, so the
    per-question logic lives in one place.

    Args:
        df_tokens: iterable of token lists, one per question.
        names: optional iterable of names aligned with `df_tokens`. When
            omitted, the notebook-level `df['name']` is used, as before.

    Returns:
        DataFrame with columns 'name' and 'vector' (one row per question).

    Raises:
        ValueError: if the number of names differs from the number of questions.
    """
    vectors = [single_question_embedding(tokens) for tokens in df_tokens]

    if names is None:
        names = df['name']
    names = list(names)

    if len(names) != len(vectors):
        raise ValueError(
            'names and df_tokens must have the same length '
            '({} != {})'.format(len(names), len(vectors))
        )

    return pd.DataFrame({'name': names, 'vector': vectors})

In [12]:
# 

def single_question_embedding(tokens):
    """Embed one question as the un-normalised sum of its tokens' word2vec vectors.

    Tokens missing from the trained vocabulary are skipped.

    Args:
        tokens: iterable of token strings. NOTE: a plain string is iterated
            character by character, so pass a token list, not raw text.

    Returns:
        1-D float64 numpy array whose length is the model's vector size;
        all zeros when no token is in the vocabulary.
    """
    word_vectors = model.wv  # looked up once instead of once per token
    # Size comes from the trained model, so it cannot drift from a global constant.
    question_vector = np.zeros(word_vectors.vector_size)
    for word in tokens:
        if word in word_vectors:  # vocabulary membership test on the KeyedVectors
            question_vector = question_vector + word_vectors[word]
    return question_vector

In [13]:
# Make mid-chapter vectors adding sentence vectors

def name_embedding(df):
    """Sum the question vectors that share a name into one vector per name.

    The frame is scanned once, and the vector size is taken from the data
    itself instead of a notebook-level constant.

    Args:
        df: DataFrame with a 'name' column and a 'vector' column holding
            equal-length numeric vectors.

    Returns:
        DataFrame with columns 'name' and 'vector', one row per distinct name
        in order of first appearance; each vector is a float64 numpy array.
    """
    # dict keeps insertion order, which matches the order of Series.unique().
    summed = {}
    for name, vec in zip(df['name'], df['vector']):
        vec = np.asarray(vec, dtype=float)
        if name in summed:
            summed[name] = summed[name] + vec
        else:
            # Copy so the result never aliases an array stored in the input frame.
            summed[name] = vec.copy()

    return pd.DataFrame({'name': list(summed.keys()), 'vector': list(summed.values())})

In [14]:
# One summed word2vec vector per question, labelled with its name.
all_question_embedded = all_question_embedding(df_tokens=tokens)
all_question_embedded

Unnamed: 0,name,vector
0,해를 갖거나 갖지 않는 연립일차부등식,"[18.834240006282926, -6.196072686463594, 13.93..."
1,절댓값 또는 가우스 기호가 포함된 이차부등식,"[7.836369529366493, -1.0413185358047485, 3.543..."
2,x축 또는 y축에 접하는 원의 방정식,"[18.35009165853262, -11.0173349827528, 20.1296..."
3,정수 및 자연수 조건을 갖는 부정방정식,"[13.213709262898192, -4.719159467145801, 13.90..."
4,도형의 평행이동 (2),"[39.9215270280838, -15.264107968658209, 38.037..."
...,...,...
5359,확률의 덧셈정리 (1),"[17.424915118142962, -7.336826055310667, 28.67..."
5360,함수의 개수,"[9.911998490802944, -0.42539719690103084, 16.7..."
5361,함수의 그래프와 합성함수,"[94.96395476534963, -27.49848104827106, 77.660..."
5362,명제가 참이 되도록 하는 상수 구하기,"[33.61963940411806, -1.8944173408672214, 35.62..."


In [15]:
# Collapse the question vectors into one vector per name.
name_embedded = name_embedding(df=all_question_embedded)
name_embedded.head(n=2)

Unnamed: 0,name,vector
0,해를 갖거나 갖지 않는 연립일차부등식,"[181.61575108859688, -90.54577681003138, 131.3..."
1,절댓값 또는 가우스 기호가 포함된 이차부등식,"[104.59947786293924, -8.647887026425451, 55.55..."


In [16]:
def calculate_cosinesim(question):
    """Rank every name by cosine similarity to one question.

    Args:
        question: token list of the question; embedded with
            `single_question_embedding`.

    Returns:
        DataFrame with columns 'name' and 'result' (cosine similarity),
        sorted by 'result' in descending order. Uses the notebook-level
        `name_embedded` frame as the set of candidates.
    """
    question_vector = single_question_embedding(question)

    # Cosine similarity between the question and every name vector
    similarities = [
        1 - distance.cosine(name_vector, question_vector)
        for name_vector in name_embedded['vector']
    ]

    # Positional 0..n-1 index; the name Series is aligned onto it.
    ranked = pd.DataFrame(
        {'name': name_embedded['name'], 'result': similarities},
        index=pd.RangeIndex(len(name_embedded)),
    )
    return ranked.sort_values('result', ascending=False)

### Model apply

In [30]:
# Apply function to whole dataset
def apply_w2v_name(df):
    """Predict the most similar name, and its chapter, for every row of `df`.

    Args:
        df: DataFrame with a 'text' column; each value is passed to
            `calculate_cosinesim`.

    Returns:
        DataFrame indexed like `df` with columns:
            'pred_name'       - name with the highest cosine similarity,
            'pred_similarity' - that similarity,
            'pred_chapter'    - chapter looked up for the name in the
                                notebook-level `DATA` frame (NaN if the
                                name does not occur there).
    """
    name_list = []
    cosine_list = []
    for text in tqdm(df['text']):
        # calculate_cosinesim returns rows sorted by similarity, best first.
        best = calculate_cosinesim(text).iloc[0]
        name_list.append(best['name'])
        cosine_list.append(best['result'])

    pred = pd.DataFrame(
        {'pred_name': name_list, 'pred_similarity': cosine_list},
        index=df.index,
    )

    # Build the name -> chapter table once instead of filtering DATA per row.
    # For a name that maps to several chapters, keep the last distinct one in
    # order of first appearance (same rule as `.unique()[-1]`).
    name_to_chapter = DATA.groupby('name')['chapter'].agg(
        lambda chapters: chapters.unique()[-1]
    )
    pred['pred_chapter'] = pred['pred_name'].map(name_to_chapter)

    return pred

In [25]:
# Predict name and chapter for every training question.
pred = apply_w2v_name(df=df)
pred.head(n=2)

100%|██████████| 5364/5364 [01:38<00:00, 54.55it/s]


Unnamed: 0,pred_name,pred_similarity,pred_chapter
0,연립이차부등식,0.992603,H1S1-08
1,이차부등식,0.950958,H1S1-08


### Evaluation

In [22]:
# Evaluate predict score with accuracy:
# share of questions whose predicted chapter equals the true chapter.

from sklearn.metrics import accuracy_score
score = accuracy_score(y_true=df['chapter'], y_pred=pred['pred_chapter'])
print(score)

0.6260253542132737


In [23]:
# True chapter next to the predicted name / similarity / chapter, first 10 rows.
pd.concat([df['chapter'], pred], axis='columns').head(n=10)

Unnamed: 0,chapter,pred_name,pred_similarity,pred_chapter
0,H1S1-08,연립이차부등식,0.992603,H1S1-08
1,H1S1-08,이차부등식,0.950958,H1S1-08
2,H1S1-11,x축 또는 y축에 접하는 원의 방정식,0.978045,H1S1-11
3,H1S1-07,정수 및 자연수 조건을 갖는 부정방정식,0.986447,H1S1-07
4,H1S1-12,도형의 평행이동 (2),0.967667,H1S1-12
5,H1S1-10,선분을 수직이등분하는 방정식,0.991257,H1S1-10
6,H1S1-09,중선정리 (파푸스의 정리),0.974618,H1S1-09
7,H1S1-03,등비수열의 활용,0.950284,HSU1-09
8,H1S1-08,연립이차부등식,0.994495,H1S1-08
9,H1S1-11,접선의 길이,0.974307,H1S1-11


Name을 먼저 예측하고, 이를 바탕으로 중단원을 매칭시켰을 때, 정확도는 약 63%(위 셀 출력 0.626)로 향상되었다.

### Make feature with nth ranking of cosine similarity

In [None]:
# Set the number of candidates
ranking = 3

# name -> chapter lookup, built once. Same rule and source frame (DATA) as
# apply_w2v_name: the last distinct chapter in order of first appearance.
name_to_chapter = DATA.groupby('name')['chapter'].agg(
    lambda chapters: chapters.unique()[-1]
)

# Apply function to whole dataset: keep the top-`ranking` names and their
# similarities for every question.
name_list = []
cosine_list = []
for i in tqdm(df.index):
    # Rows come back sorted best-first; head() also copes with fewer than
    # `ranking` candidates instead of raising.
    top = calculate_cosinesim(df.loc[i, 'text']).head(ranking)
    name_list.append(top['name'].tolist())
    cosine_list.append(top['result'].tolist())

# Concatenate predict result with dataframe
pred = pd.DataFrame(
    {'pred_name': name_list, 'pred_similarity': cosine_list},
    index=df.index,
)
# One chapter per ranked name. The column is assigned in one go, because
# writing a list into a single cell with .loc is not a valid scalar assignment.
pred['pred_chapter'] = pd.Series(
    [[name_to_chapter[name] for name in names] for names in name_list],
    index=df.index,
)
pred.head()

In [None]:
# Prefix the prediction columns with the model name before merging them.
pred.rename(
    columns={
        'pred_chapter': 'word2vec_pred_chapter',
        'pred_name': 'word2vec_pred_name',
    },
    inplace=True,
)

In [None]:
# Append the two word2vec prediction columns to the preprocessed dataset (joined on the index).
data_preprocess = pd.read_csv(
    filepath_or_buffer=DATA_PATH + '/data_preprocessed.csv',
    index_col=0,
)
final_data = pd.concat(
    [data_preprocess, pred[['word2vec_pred_chapter', 'word2vec_pred_name']]],
    axis='columns',
)
final_data.head(n=2)

In [114]:
#final_data.to_csv(DATA_PATH+'/data_preprocessed.csv', index=False)

### Test Model and Submit

In [27]:
# Hidden inference set: question id plus question text.
df_test = pd.read_csv(filepath_or_buffer=DATA_PATH + '/hidden_for_inference.csv')
df_test.head(n=2)

Unnamed: 0,qplay_question_id,text
0,3535,"좌표평면 위의 두 점 `A(1, 2)`, `B(5, -2)`에 대하여 선분 `AB`..."
1,4537,"`a-b=3`일 때, `a^3-3a^2b+(3b^2-5)a-b^3+5b-10`의 값은?"


In [35]:
# Predict name / chapter for the hidden inference set.
# NOTE(review): the preview of df_test shows 'text' as raw question strings,
# while the training 'text' column holds token lists - confirm the same
# tokenisation is applied to df_test before this call.
test_pred = apply_w2v_name(df=df_test)
test_pred.head(n=2)

100%|██████████| 1378/1378 [00:25<00:00, 54.60it/s]


Unnamed: 0,pred_name,pred_similarity,pred_chapter
0,독립사건의 확률,0.960253,HSTA-04
1,곱셈공식의 활용 (2),0.992238,H1S1-01


In [41]:
# One record per test question. The two columns are paired positionally
# (test_pred shares df_test's index), so this does not rely on the index
# being 0..n-1 the way range(len(...)) combined with .loc did.
submission = [
    {'qplay_question_id': int(question_id), 'predict_category': chapter}
    for question_id, chapter in zip(df_test['qplay_question_id'], test_pred['pred_chapter'])
]

submission[0]

{'qplay_question_id': 3535, 'predict_category': 'HSTA-04'}

In [44]:
from pprint import pprint
import getpass
import json
import os

import requests

HOST = 'http://ec2-13-125-227-119.ap-northeast-2.compute.amazonaws.com:8080/'
# The token is a credential, so it must not live in the notebook: read it from
# the SUBMISSION_TOKEN environment variable, or prompt for it when unset.
TOKEN = os.environ.get('SUBMISSION_TOKEN') or getpass.getpass('Submission token: ')
data = {"token" : TOKEN,
        "payload": submission}
# submit your answer
response = requests.post(f'{HOST}submit/', data = json.dumps(data), timeout=30)
response.raise_for_status()  # fail loudly if the server rejected the submission

# Fetch and show the current leaderboard
rank_response = requests.get(f'{HOST}rank/', timeout=30)
rank_response.raise_for_status()
pprint(json.loads(rank_response.text))

{'message': [{'high_score': 0.0, 'team_name': '고려대학교 KUBIG'},
             {'high_score': 0.0, 'team_name': '연세대학교 YAI'},
             {'high_score': 0.7999961959691431, 'team_name': '연세대학교 ybigta'},
             {'high_score': 0.82773999316432,
              'team_name': 'mathpresso_baseline'}]}
