In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import catboost
import random
import os
from catboost import CatBoostRegressor, CatBoostClassifier
import lightgbm as lgb
import torch
import shap

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from transformers.models.bert.modeling_bert import (
    BertConfig,
    BertEncoder,
    BertModel,
)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


# 1. EDA

## 1) 기본 feature nunique

In [None]:
# Load the training data from a CSV file resolved relative to the working directory.
# Later cells both read `train` and add helper columns to it.
train = pd.read_csv("train_data2.csv")


In [None]:
fig, axes = plt.subplots(1, 1, figsize=(10, 6))
# Keep only the feature columns; the timestamp and the answer column are left out of this overview.
df = train.drop(['Timestamp', 'answerCode'], axis=1)

# Compute the distinct-value count of every column once and reuse it for the bars and the
# labels (the original recomputed df.nunique() twice per loop iteration).
nunique_per_col = df.nunique()

sns.barplot(x=df.columns.tolist(), y=nunique_per_col, ax=axes)

# Write each count on top of its bar. Iterating the Series yields the values in column order,
# which avoids integer-key lookups on a label-indexed Series (deprecated positional fallback).
for idx, count in enumerate(nunique_per_col):
    axes.text(x=idx, y=count, s=str(count),
              va='bottom', ha='center',
              fontsize=11, fontweight='semibold'
              )

plt.show()

## 2) groupby feature

### (1) Tag

In [None]:
# Group the rows by knowledge tag once and reuse the grouping for both aggregates.
tag_groups = train.groupby(['KnowledgeTag'])
# Distinct values of each remaining column per tag, without the answer and timestamp columns.
df2 = tag_groups.nunique().drop(columns=['answerCode', 'Timestamp'])
# Number of non-null userID entries per tag.
df2['tag_count'] = tag_groups['userID'].count()

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(150, 20))

# One panel per column of df2, with one bar per index entry (knowledge tag).
for row, column in enumerate(df2.columns):
    panel = axes[row]
    sns.barplot(data=df2, x=df2.index, y=column, ax=panel)
    # Re-apply the generated tick labels rotated and shrunk so the dense axis stays legible.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)


plt.show()

In [None]:
# Bucket the tag ids into equal-width intervals; the number of bins is a tenth of the number of tags.
n_tag_bins = df2.index.size // 10
df2['TagCut'] = pd.cut(df2.index, bins=n_tag_bins)
# Average every metric inside each interval. observed=False is passed explicitly so empty
# intervals stay in the result: the implicit default is deprecated, and the FutureWarning
# filter installed at the top of the notebook would hide the change.
df2_2 = df2.groupby('TagCut', observed=False).mean()

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(50, 20))

# One panel per averaged column, with one bar per tag interval.
for row, column in enumerate(df2_2.columns):
    panel = axes[row]
    sns.barplot(data=df2_2, x=df2_2.index, y=column, ax=panel)
    # Rotate the interval labels so they do not overlap.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

plt.show()

### (2) User

In [None]:
# Group the rows by user once and reuse the grouping for both aggregates.
user_groups = train.groupby(['userID'])
# Distinct values of each remaining column per user, without the answer and timestamp columns.
df3 = user_groups.nunique().drop(columns=['answerCode', 'Timestamp'])
# Number of non-null testId entries per user.
df3['user_count'] = user_groups['testId'].count()

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(200, 30))

# One panel per column of df3, with one bar per index entry (user).
for row, column in enumerate(df3.columns):
    panel = axes[row]
    sns.barplot(data=df3, x=df3.index, y=column, ax=panel)
    # Re-apply the generated tick labels rotated and shrunk so the dense axis stays legible.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)


In [None]:
# Bucket the user ids into equal-width intervals; the number of bins is a tenth of the number of users.
n_user_bins = df3.index.size // 10
df3['UserCut'] = pd.cut(df3.index, bins=n_user_bins)
# Average every metric inside each interval. observed=False is passed explicitly so empty
# intervals stay in the result: the implicit default is deprecated, and the FutureWarning
# filter installed at the top of the notebook would hide the change.
df3_2 = df3.groupby('UserCut', observed=False).mean()

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(50, 20))

# One panel per averaged column, with one bar per user interval.
for row, column in enumerate(df3_2.columns):
    panel = axes[row]
    sns.barplot(data=df3_2, x=df3_2.index, y=column, ax=panel)
    # Rotate the interval labels so they do not overlap.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

plt.show()

### (3) Test

In [None]:
# Map every distinct testId to its rank in sorted order (0, 1, 2, ...).
sorted_test_ids = sorted(train['testId'].unique())
Test_label = dict(zip(sorted_test_ids, range(len(sorted_test_ids))))
# Store the integer label next to the original id; this adds a column to `train`.
train['Test_label'] = train['testId'].map(Test_label)

In [None]:
# Group the rows by the integer test label once and reuse the grouping for both aggregates.
test_groups = train.groupby(['Test_label'])
# Distinct values per test; the answer, timestamp and raw testId columns are dropped.
df4 = test_groups.nunique().drop(columns=['answerCode', 'Timestamp', 'testId'])
# Number of non-null userID entries per test.
df4['test_count'] = test_groups['userID'].count()

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(200, 30))

# One panel per column of df4, with one bar per index entry (test label).
for row, column in enumerate(df4.columns):
    panel = axes[row]
    sns.barplot(data=df4, x=df4.index, y=column, ax=panel)
    # Re-apply the generated tick labels rotated and shrunk so the dense axis stays legible.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

In [None]:
# Bucket the test labels into equal-width intervals; the number of bins is a tenth of the number of tests.
n_test_bins = df4.index.size // 10
df4['TestCut'] = pd.cut(df4.index, bins=n_test_bins)
# Average every metric inside each interval. observed=False is passed explicitly so empty
# intervals stay in the result: the implicit default is deprecated, and the FutureWarning
# filter installed at the top of the notebook would hide the change.
df4_2 = df4.groupby('TestCut', observed=False).mean()

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(50, 20))

# One panel per averaged column, with one bar per test-label interval.
for row, column in enumerate(df4_2.columns):
    panel = axes[row]
    sns.barplot(data=df4_2, x=df4_2.index, y=column, ax=panel)
    # Rotate the interval labels so they do not overlap.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

plt.show()

### (4) Assessment

In [None]:
# Map every distinct assessmentItemID to its rank in sorted order (0, 1, 2, ...).
sorted_item_ids = sorted(train['assessmentItemID'].unique())
Assess_label = dict(zip(sorted_item_ids, range(len(sorted_item_ids))))
# Store the integer label next to the original id; this adds a column to `train`.
train['Assess_label'] = train['assessmentItemID'].map(Assess_label)

In [None]:
# Group the rows by the integer assessment label once and reuse the grouping for both aggregates.
assess_groups = train.groupby(['Assess_label'])
# Distinct values per assessment item. The answer, timestamp and raw item id columns are
# dropped, together with the helper Test_label column that an earlier cell added to `train`.
df5 = assess_groups.nunique().drop(
    columns=['answerCode', 'Timestamp', 'assessmentItemID', 'Test_label']
)
# Number of non-null userID entries per assessment item.
df5['assess_count'] = assess_groups['userID'].count()

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(300, 30))

# One panel per column of df5, with one bar per index entry (assessment label).
for row, column in enumerate(df5.columns):
    panel = axes[row]
    sns.barplot(data=df5, x=df5.index, y=column, ax=panel)
    # Re-apply the generated tick labels rotated and shrunk so the dense axis stays legible.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

In [None]:
# Bucket the assessment labels into equal-width intervals; the number of bins is a
# twentieth of the number of assessment items.
n_assess_bins = df5.index.size // 20
df5['AssessCut'] = pd.cut(df5.index, bins=n_assess_bins)
# Average every metric inside each interval. observed=False is passed explicitly so empty
# intervals stay in the result: the implicit default is deprecated, and the FutureWarning
# filter installed at the top of the notebook would hide the change.
df5_2 = df5.groupby('AssessCut', observed=False).mean()

In [None]:
fig, axes = plt.subplots(4, 1, figsize=(50, 20))

# One panel per averaged column, with one bar per assessment-label interval.
for row, column in enumerate(df5_2.columns):
    panel = axes[row]
    sns.barplot(data=df5_2, x=df5_2.index, y=column, ax=panel)
    # Rotate the interval labels so they do not overlap.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

plt.show()

## 3) 정답수 / 정답률

In [None]:
# Empty frame used as a template: the following sections copy it and attach their own
# per-group answer statistics as columns.
cor = pd.DataFrame()

In [None]:
# Number of rows whose answerCode equals 1.
is_correct = train['answerCode'] == 1
train.loc[is_correct, 'answerCode'].count()

In [None]:
# Share of rows with answerCode equal to 1 among all non-null answerCode entries.
is_correct = train['answerCode'] == 1
train.loc[is_correct, 'answerCode'].count() / train['answerCode'].count()

### (1) Tag별 정답

In [None]:
# answerCode values grouped by knowledge tag; the same selection feeds both statistics.
tag_answers = train.groupby(['KnowledgeTag'])['answerCode']

cor1 = cor.copy()
# Sum of answerCode per tag.
cor1['tag_ans_cnt'] = tag_answers.sum()
# Sum of answerCode divided by the number of non-null answers per tag.
cor1['tag_ans_cor'] = tag_answers.sum() / tag_answers.count()

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(150, 20))

# One panel per column of cor1, with one bar per index entry (knowledge tag).
for row, column in enumerate(cor1.columns):
    panel = axes[row]
    sns.barplot(data=cor1, x=cor1.index, y=column, ax=panel)
    # Re-apply the generated tick labels rotated and shrunk so the dense axis stays legible.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

In [None]:
# Bucket the tag ids into equal-width intervals; the number of bins is a tenth of the number of tags.
n_tag_bins = cor1.index.size // 10
cor1['TagCut'] = pd.cut(cor1.index, bins=n_tag_bins)
# Average both statistics inside each interval. observed=False is passed explicitly so empty
# intervals stay in the result: the implicit default is deprecated, and the FutureWarning
# filter installed at the top of the notebook would hide the change.
cor1_2 = cor1.groupby('TagCut', observed=False).mean()

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(50, 20))

# One panel per averaged statistic, with one bar per tag interval.
for row, column in enumerate(cor1_2.columns):
    panel = axes[row]
    sns.barplot(data=cor1_2, x=cor1_2.index, y=column, ax=panel)
    # Rotate the interval labels so they do not overlap.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

plt.show()

### (2) User별 정답

In [None]:
# answerCode values grouped by user; the same selection feeds both statistics.
user_answers = train.groupby(['userID'])['answerCode']

cor2 = cor.copy()
# Sum of answerCode per user.
cor2['user_ans_cnt'] = user_answers.sum()
# Sum of answerCode divided by the number of non-null answers per user (kept as a float ratio).
cor2['user_ans_cor'] = user_answers.sum() / user_answers.count()

In [None]:
# Show the per-user answer statistics table as the cell output.
cor2

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(200, 30))

# One panel per column of cor2, with one bar per index entry (user).
for row, column in enumerate(cor2.columns):
    panel = axes[row]
    sns.barplot(data=cor2, x=cor2.index, y=column, ax=panel)
    # Re-apply the generated tick labels rotated and shrunk so the dense axis stays legible.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

In [None]:
# Bucket the user ids into equal-width intervals; the number of bins is a tenth of the number of users.
n_user_bins = cor2.index.size // 10
cor2['UserCut'] = pd.cut(cor2.index, bins=n_user_bins)
# Average both statistics inside each interval. observed=False is passed explicitly so empty
# intervals stay in the result: the implicit default is deprecated, and the FutureWarning
# filter installed at the top of the notebook would hide the change.
cor2_2 = cor2.groupby('UserCut', observed=False).mean()

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(50, 20))

# One panel per averaged statistic, with one bar per user interval.
for row, column in enumerate(cor2_2.columns):
    panel = axes[row]
    sns.barplot(data=cor2_2, x=cor2_2.index, y=column, ax=panel)
    # Rotate the interval labels so they do not overlap.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

plt.show()

### (3) Test별 정답

In [None]:
# answerCode values grouped by integer test label; the same selection feeds both statistics.
test_answers = train.groupby(['Test_label'])['answerCode']

cor3 = cor.copy()
# Sum of answerCode per test.
cor3['test_ans_cnt'] = test_answers.sum()
# Sum of answerCode divided by the number of non-null answers per test.
cor3['test_ans_cor'] = test_answers.sum() / test_answers.count()

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(150, 30))

# One panel per column of cor3, with one bar per index entry (test label).
for row, column in enumerate(cor3.columns):
    panel = axes[row]
    sns.barplot(data=cor3, x=cor3.index, y=column, ax=panel)
    # Re-apply the generated tick labels rotated and shrunk so the dense axis stays legible.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

In [None]:
# Bucket the test labels into equal-width intervals; the number of bins is a tenth of the number of tests.
n_test_bins = cor3.index.size // 10
cor3['TestCut'] = pd.cut(cor3.index, bins=n_test_bins)
# Average both statistics inside each interval. observed=False is passed explicitly so empty
# intervals stay in the result: the implicit default is deprecated, and the FutureWarning
# filter installed at the top of the notebook would hide the change.
cor3_2 = cor3.groupby('TestCut', observed=False).mean()

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(50, 20))

# One panel per averaged statistic, with one bar per test-label interval.
for row, column in enumerate(cor3_2.columns):
    panel = axes[row]
    sns.barplot(data=cor3_2, x=cor3_2.index, y=column, ax=panel)
    # Rotate the interval labels so they do not overlap.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

plt.show()

### (4) Assessment별 정답

In [None]:
# answerCode values grouped by integer assessment label; the same selection feeds both statistics.
assess_answers = train.groupby(['Assess_label'])['answerCode']

cor4 = cor.copy()
# The columns are named after the assessment grouping. The original reused the
# 'test_ans_*' names from the per-test section, so the per-assessment plots were
# labelled as test statistics.
# Sum of answerCode per assessment item.
cor4['assess_ans_cnt'] = assess_answers.sum()
# Sum of answerCode divided by the number of non-null answers per assessment item.
cor4['assess_ans_cor'] = assess_answers.sum() / assess_answers.count()

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(300, 30))

# One panel per column of cor4, with one bar per index entry (assessment label).
for row, column in enumerate(cor4.columns):
    panel = axes[row]
    sns.barplot(data=cor4, x=cor4.index, y=column, ax=panel)
    # Re-apply the generated tick labels rotated and shrunk so the dense axis stays legible.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

In [None]:
# Bucket the assessment labels into equal-width intervals; the number of bins is a
# twentieth of the number of assessment items.
n_assess_bins = cor4.index.size // 20
# The bin column is called 'AssessCut', matching the assessment binning of df5. The
# original reused 'TestCut', which mislabelled the x-axis of the binned assessment plot.
cor4['AssessCut'] = pd.cut(cor4.index, bins=n_assess_bins)
# Average both statistics inside each interval. observed=False is passed explicitly so empty
# intervals stay in the result: the implicit default is deprecated, and the FutureWarning
# filter installed at the top of the notebook would hide the change.
cor4_2 = cor4.groupby('AssessCut', observed=False).mean()

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(50, 20))

# One panel per averaged statistic, with one bar per assessment-label interval.
for row, column in enumerate(cor4_2.columns):
    panel = axes[row]
    sns.barplot(data=cor4_2, x=cor4_2.index, y=column, ax=panel)
    # Rotate the interval labels so they do not overlap.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90, fontsize=8)

plt.show()