In [None]:
import os
import pandas as pd
from datetime import datetime, timedelta
import random
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Dataset directory and the CSV file names expected inside it.
data_path = '/opt/ml/input/data/train_dataset'
train_file, test_file = 'train_data.csv', 'test_data.csv'

# Full paths to the train and test CSVs.
train_path, test_path = (os.path.join(data_path, name) for name in (train_file, test_file))

In [None]:
# Load both CSVs, parsing the 'Timestamp' column as datetimes so it can be
# subtracted later.
train_df = pd.read_csv(train_path, parse_dates=['Timestamp'])
test_df = pd.read_csv(test_path, parse_dates=['Timestamp'])

In [None]:
# Copy the row index into a column so it can serve as a merge key below.
train_df['tmp_index'] = train_df.index

In [None]:
# Shift the key columns down by one row: row i of tmp_df now holds the
# userID / testId / Timestamp / tmp_index of row i-1 (row 0 becomes NaN/NaT).
tmp_df = train_df[['userID', 'testId', 'Timestamp', 'tmp_index']].shift(1)

In [None]:
# After the shift, row i carries tmp_index i-1; add 1 so it lines up with the
# tmp_index of the current row in train_df.
# Note: re-running this cell increments the column again.
tmp_df['tmp_index'] += 1

In [None]:
# Attach the previous row's Timestamp to each row, but only when the previous
# row has the same userID and testId (otherwise the left join yields NaT).
# 'Timestamp' is not a join key, so it comes back as Timestamp_x (current row)
# and Timestamp_y (previous row).
train_df = train_df.merge(tmp_df, how='left', on=['userID', 'testId', 'tmp_index'])

In [None]:
# Seconds elapsed since the previous row of the same user and test (NaN where
# no previous row matched in the merge).
# `.dt.seconds` only returns the 0..86399 seconds component of a timedelta and
# silently discards whole days, so any gap of a day or more was under-reported;
# `.dt.total_seconds()` returns the full duration.
train_df['prior_elapsed'] = (train_df.Timestamp_x - train_df.Timestamp_y).dt.total_seconds()

In [None]:
train_df['prior_elapsed'].plot(kind='kde')


In [None]:
train_df['log_prior'] = np.log1p(train_df['prior_elapsed'])

In [None]:
train_df['log_prior'].plot(kind='kde')

In [None]:
# First and third quartiles of prior_elapsed, computed in a single quantile call.
Q1, Q3 = train_df['prior_elapsed'].quantile([0.25, 0.75])

In [None]:
Q1

In [None]:
Q3

In [None]:
len(train_df)

In [None]:
(train_df['prior_elapsed'] > 65).sum()

In [None]:
# Share of all rows whose prior_elapsed exceeds 65 seconds. Computed from the
# frame instead of pasting the count and row total from an earlier run, so the
# value stays correct when the data or the elapsed-time definition changes.
# Rows with NaN prior_elapsed compare False and still count in the denominator.
(train_df['prior_elapsed'] > 65).mean()

In [None]:
IQR = Q3 - Q1

In [None]:
IQR

In [None]:
# Outlier fences at 1.5 * IQR below Q1 and above Q3.
low_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

In [None]:
low_bound

In [None]:
upper_bound

In [None]:
(train_df['prior_elapsed'] > upper_bound).sum()

In [None]:
# Share of all rows whose prior_elapsed lies above the upper IQR fence.
# Computed from the frame instead of pasting the count and row total from an
# earlier run. Rows with NaN prior_elapsed compare False and still count in the
# denominator.
(train_df['prior_elapsed'] > upper_bound).mean()

In [None]:
# Upper-tail quantile of prior_elapsed.
# NOTE(review): the variable is named Q99 but the quantile taken is 0.98 —
# confirm which one was intended.
Q99 = train_df['prior_elapsed'].quantile(0.98)

In [None]:
Q99

In [None]:
train_df['prior_elapsed'].median()

In [None]:
# Median of prior_elapsed restricted to rows at or below 260 seconds.
train_df.loc[train_df['prior_elapsed'] <= 260, 'prior_elapsed'].median()

In [None]:
train_df.groupby('assessmentItemID')['prior_elapsed'].get_group('A010001001')

In [None]:
train_df.loc[train_df['assessmentItemID'] == 'A010001001']

In [None]:
# Mean prior_elapsed per assessment item, as a Series named 'mean_elapsed'.
assess_time = (
    train_df.groupby('assessmentItemID')
    .prior_elapsed.mean()
    .rename('mean_elapsed')
)

In [None]:
assess_time

In [None]:
a = train_df.groupby('testId').prior_elapsed.mean()

In [None]:
a.name = 'test_time'

In [None]:
a

In [None]:
train_df = train_df.merge(a, how='left', on='testId')

In [None]:
# Per-row mean prior_elapsed of the row's test (testId), broadcast back to every
# row. The built-in 'mean' aggregation gives the same values as the Python
# lambda but runs vectorised instead of calling Python once per group.
train_df['test_mean'] = train_df.groupby('testId').prior_elapsed.transform('mean')

In [None]:
train_df[['answerCode', 'test_mean']].corr()

In [None]:
train_df['test_mean'].plot(kind='kde')

In [None]:
train_df['log_test_mean'] = np.log1p(train_df['test_mean'])

In [None]:
train_df['log_test_mean'].plot(kind='kde')

In [None]:
train_df[['answerCode', 'log_test_mean']].corr()

In [None]:
# Build one record per user: (answerCode array, Timestamp array), in row order.
# Grouping on userID only keeps each user's whole history together. Grouping on
# answerCode and Timestamp as well (as before) split every user into groups of
# identical (answer, time) rows, so "the last answer of a user" was meaningless.
# The earlier self-merge leaves the current-row timestamp in 'Timestamp_x', so
# accept either column name.
ts_col = 'Timestamp' if 'Timestamp' in train_df.columns else 'Timestamp_x'
data = (
    train_df.groupby('userID')[['answerCode', ts_col]]
    .apply(
        lambda x: (
            x['answerCode'].values,
            x[ts_col].values,
        )
    )
    .values  # plain object array of tuples: safe to shuffle and slice by position
)

In [None]:
# Shuffle the per-user records reproducibly.
# random.shuffle swaps elements with integer `x[i]` indexing, which pandas does
# not guarantee to be positional on a Series; shuffle a plain list instead.
random.seed(42)
data = list(data)
random.shuffle(data)

In [None]:
# Cut the shuffled records 70 / 30 by position.
size = int(len(data) * 0.7)
data_1, data_2 = data[:size], data[size:]

In [None]:
# Match the correct-answer ratio of the last interaction between train and validation

In [None]:

# Last answer of every record in data_1.
# Each record is (answerCode array, Timestamp array), so the answers live at
# index 0; index 1 (used before) is the timestamps. Building the array in one
# pass also avoids re-copying it on every np.append call.
total = np.array([user[0][-1] for user in data_1], dtype=float)
    

In [None]:
total.mean()

In [None]:
# Last answer of every record in data_2.
# Each record is (answerCode array, Timestamp array), so the answers live at
# index 0; index 1 (used before) is the timestamps. Building the array in one
# pass also avoids re-copying it on every np.append call.
total = np.array([user[0][-1] for user in data_2], dtype=float)


In [None]:
total.mean()

In [None]:
train_df.groupby('userID').answerCode.last().mean()

In [None]:
(x for x in range(10))

In [None]:
# Last assessmentItemID per user, as a two-column frame (userID, assessmentItemID).
last_assess = (
    train_df.groupby('userID')['assessmentItemID']
    .last()
    .reset_index()
)

In [None]:
train_df.loc[ (train_df['userID'] == 7436) & (train_df['assessmentItemID'] == 'A030019001') ]

In [None]:
# For users who had already attempted their final item before, tally how the
# final answer compares with the previous attempt at that same item.
all_right = 0      # previous attempt != 0, final answer == 1
better_right = 0   # previous attempt == 0, final answer == 1
worse_fail = 0     # previous attempt != 0, final answer != 1
all_fail = 0       # previous attempt == 0, final answer != 1

# A single inner merge keeps only each user's attempts at their final item
# (left row order is preserved), replacing a full boolean scan of train_df for
# every user.
last_item_attempts = train_df.merge(
    last_assess[['userID', 'assessmentItemID']],
    on=['userID', 'assessmentItemID'],
    how='inner',
)

for _, answers in last_item_attempts.groupby('userID')['answerCode']:
    if len(answers) < 2:
        # The final item was seen only once: nothing to compare against.
        continue

    prior_answerCode = answers.iloc[-2]
    answerCode = answers.iloc[-1]

    if prior_answerCode == 0:
        if answerCode == 1:
            better_right += 1
        else:
            all_fail += 1
    else:
        if answerCode == 1:
            all_right += 1
        else:
            worse_fail += 1


In [None]:
all_right

In [None]:
better_right

In [None]:
worse_fail

In [None]:
all_fail

In [None]:
36 / (45+36+36+30)

In [None]:
last_assess

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_df['grade'] = train_df.testId.str[2]

In [None]:
train_df['grade'] = train_df['grade'].astype(int)

In [None]:
train_df.groupby('userID').grade.mean()

In [None]:
len(train_df.groupby('userID').grade.mean())

In [None]:
# Random two-column stratification labels (values 0-9), one row per user.
# The row count has to equal the number of users passed to train_test_split,
# so derive it from the data instead of hard-coding 6698.
st = np.random.randint(10, size=(train_df['userID'].nunique(), 2))

In [None]:
st

In [None]:
st

In [None]:
t, v = train_test_split(train_df.groupby('userID').grade.mean(), train_size=0.7, stratify=st)

In [None]:
t

In [None]:
v

In [None]:
len(t) / (len(t) + len(v))

In [None]:
# Mean grade over all rows of the users in the validation split.
# `v` is a slice of the per-user grade means, indexed by userID; its values are
# floats, so it cannot be used as positions for `.iloc`. Select the rows by the
# userIDs in its index instead.
train_df.loc[train_df['userID'].isin(v.index)].grade.mean()

In [None]:
# Mean grade over all rows of the users in the training split.
# `t` is a slice of the per-user grade means, indexed by userID; its values are
# floats, so it cannot be used as positions for `.iloc`. Select the rows by the
# userIDs in its index instead.
train_df.loc[train_df['userID'].isin(t.index)].grade.mean()

In [None]:
train_df.groupby('userID').grade.mean()

In [None]:
test_df.head(1)

In [None]:
# grade = third character of testId, as an integer.
# Add the column before creating the groupby so the grouped object is built on
# the finished frame rather than relying on it noticing a later mutation.
test_df['grade'] = test_df.testId.str[2].astype(int)

user_group = test_df.groupby('userID')

In [None]:
user_group.last().grade.plot(kind='kde')

In [None]:
user_group.answerCode.mean().plot(kind='kde')
plt.axvline(user_group.answerCode.mean().median())

In [None]:
user_group.answerCode.mean().mean()

In [None]:
user_group.answerCode.mean().median()

In [None]:
# grade = third character of testId, as an integer.
# Add the column before creating the groupby so the grouped object is built on
# the finished frame rather than relying on it noticing a later mutation.
train_df['grade'] = train_df.testId.str[2].astype(int)

user_group = train_df.groupby('userID')

In [None]:
user_group.answerCode.mean().plot(kind='kde')
plt.axvline(user_group.answerCode.mean().median())

In [None]:
user_group.answerCode.last().plot(kind='kde')
# plt.axvline(user_group.answerCode.last().median())

In [None]:
r = user_group.answerCode.last()

In [None]:
r.describe()

In [None]:
user_group.count().grade.plot(kind='kde')

In [None]:
test_group = test_df.groupby('userID')

In [None]:
test_group.count().grade.plot(kind='kde')

In [None]:
### Match the mean per `big` category

In [None]:
train_df['big'] = train_df.testId.str[2].astype(int)
test_df['big'] = test_df.testId.str[2].astype(int)

In [None]:
train_df.groupby('big').answerCode.mean()

In [None]:
test_df.groupby('big').answerCode.mean()

In [None]:
plt.plot(train_df.groupby('big').answerCode.mean())
plt.plot(test_df.groupby('big').answerCode.mean())

In [None]:
train_df.big.value_counts()

In [None]:
train_df.big.value_counts() / len(train_df)

In [None]:
test_df.big.value_counts()

In [None]:
test_df.big.value_counts() / len(test_df)

In [None]:
test_df.groupby(['big']).answerCode.mean()

In [None]:
train_df['split'] = (train_df['big']*10 + train_df['answerCode']).astype(int)

In [None]:
len(train_df) * 0.2 * 0.047152

In [None]:
21375 / 119385

In [None]:
len(train_df) * 0.2 * 0.047152 * 0.498492

In [None]:
0.2 * 0.047152 * 0.498492

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_df.split.value_counts()

In [None]:
train_df[train_df.split == 90]

In [None]:
t, v = train_test_split(train_df[train_df.split == 90], test_size=0.17)

In [None]:
len(t)

In [None]:
len(v)

In [None]:
### Compute the ratios

In [None]:
train_len = len(train_df)

In [None]:
test_count_ratio = (test_df.big.value_counts() / len(test_df)).sort_index()

In [None]:
test_count_ratio

In [None]:
test_correct_ratio = test_df.groupby('big').answerCode.mean()

In [None]:
test_correct_ratio

In [None]:
ratio = 0.5

In [None]:
# `train_count` was referenced here without being defined anywhere in the
# notebook, so the cell raised NameError on a fresh kernel.
# NOTE(review): assumed to be the number of train rows per `big` value, since it
# is indexed by grade 1..9 — confirm this is the intended denominator.
train_count = train_df['big'].value_counts()

# Index 0 is a placeholder so both lists can be indexed directly by grade 1..9.
train_ratio = [0]
valid_ratio = [0]
for grade in range(1, 10):
    # Rows wanted for this grade: `ratio` of the train size, spread like the test set.
    grade_rows = train_len * ratio * test_count_ratio[grade]

    # Portion matching the test set's correct-answer rate for this grade.
    count_needed = grade_rows * test_correct_ratio[grade]
    train_ratio.append(count_needed / train_count[grade])

    # Portion matching the test set's incorrect-answer rate for this grade.
    count_needed = grade_rows * (1 - test_correct_ratio[grade])
    valid_ratio.append(count_needed / train_count[grade])
    

In [None]:
train_ratio

In [None]:
valid_ratio

In [None]:
grouped = train_df.groupby('userID').last()

In [None]:
len(grouped)

In [None]:
# Collect row labels for the train and validation sets, grade by grade.
# train_test_split returns (train_size portion, remainder); here the train_size
# portion is bound to `valid` and the remainder to `train`.
train_set = []
valid_set = []
for grade in range(1, 10):
    # split == grade * 10 selects rows of this grade with answerCode 0
    # (split was built as big * 10 + answerCode).
    # NOTE(review): train_ratio was derived from test_correct_ratio, yet it is
    # applied to the answerCode-0 rows here — the two ratios look swapped; confirm.
    valid, train = train_test_split( train_df[train_df['split'] == (grade * 10)].index.values.tolist() , train_size=train_ratio[grade])
    train_set += train
    valid_set += valid

    # split == grade * 10 + 1 selects rows of this grade with answerCode 1.
    valid, train = train_test_split( train_df[train_df['split'] == (grade * 10) + 1].index.values.tolist() , train_size=valid_ratio[grade])
    train_set += train
    valid_set += valid
    
    

In [None]:
train_df.iloc[train_set].shape

In [None]:
train_df.iloc[valid_set].shape

In [None]:
len(valid_set)

In [None]:
len(train_df)

In [None]:
len(valid_set) / train_len

In [None]:
len(train_set) / train_len

In [None]:
###

In [None]:
# Row labels of each user's final row in test_df.
# `groupby('userID').last().index` holds the userID values, not row labels, so
# dropping by it removed whichever rows happened to be labelled with a userID.
# `tail(1)` keeps the original row labels of the last row per user.
last_index = test_df.groupby('userID').tail(1).index

In [None]:
drop_last = test_df.drop(index=last_index)

In [None]:
drop_last.groupby('userID').answerCode.mean().mean()

In [None]:
train_df.groupby('userID').answerCode.mean().mean()

In [None]:
t, v = train_test_split(train_df.userID.unique(), train_size=0.7, random_state=0)

In [None]:
len(t)

In [None]:
len(v)

In [None]:
train_df.loc[train_df['userID'].isin(t)].groupby('userID').answerCode.mean().mean()

In [None]:
# Mean of per-user answerCode means over users NOT in `v`.
# NOTE(review): t and v partition the unique userIDs, so `~isin(v)` selects the
# same users as `isin(t)` in the previous cell; if the validation-side mean was
# intended, this should be `isin(v)` — confirm.
train_df.loc[~train_df['userID'].isin(v)].groupby('userID').answerCode.mean().mean()

In [None]:
# Search for a 70/30 user split whose training users have a mean per-user
# accuracy strictly between 0.605 and 0.615.
# With random_state fixed at 0 every iteration produced the same split, so the
# old `while True` loop could never end unless the first split qualified.
# Try a different seed on each attempt and stop after a bounded number of tries.
all_users = train_df.userID.unique()
user_accuracy = train_df.groupby('userID').answerCode.mean()  # loop-invariant
for seed in range(1000):
    t, v = train_test_split(all_users, train_size=0.7, random_state=seed)
    m = user_accuracy[~user_accuracy.index.isin(v)].mean()
    if 0.605 < m < 0.615:
        break
else:
    raise RuntimeError('no split with train mean accuracy in (0.605, 0.615) found in 1000 seeds')

In [None]:
# Per-user mean of answerCode, broadcast back to every row of that user.
# The built-in 'mean' aggregation gives the same values as the Python lambda
# but runs vectorised instead of calling Python once per user.
train_df['answer_mean'] = train_df.groupby('userID').answerCode.transform('mean')

In [None]:
lasts = train_df.groupby('userID').last().reset_index()

In [None]:
prob = 0.
user_ids = []

In [None]:
upper_index = lasts[lasts.answer_mean > 0.61].index.tolist()
under_index = lasts[lasts.answer_mean <= 0.61].index.tolist()

In [None]:
len(under_index)

In [None]:
len(upper_index)

In [None]:
# Greedily pick 20% of the users so the running mean of answer_mean hovers
# around 0.61: take from the <= 0.61 pool while the mean is above 0.61,
# otherwise from the > 0.61 pool.
# The values stored in user_ids are row labels of `lasts` (taken from its index),
# not userID values; they are used positionally via `.iloc` below.
# This cell continues from the current `prob` / `user_ids`, so re-running it
# without resetting them picks further users.
for _ in range(int(len(lasts) * 0.2)):
    if prob > 0.61:
        i = under_index.pop(0)
    else:
        i = upper_index.pop(0)
    
    user_ids.append(i)
    # Recompute the mean answer_mean over everything picked so far.
    prob = lasts.iloc[user_ids].answer_mean.mean()
    
    

In [None]:
len(user_ids)

In [None]:
len(under_index)

In [None]:
len(upper_index)

In [None]:
prob

In [None]:
a = train_df.groupby('userID').last().reset_index()

In [None]:
a

In [None]:
# Last answerCode of the picked users.
# `user_ids` holds row labels of `lasts` (it was filled from `lasts.index`), not
# userID values, so matching it against the userID column selected the wrong
# users. Select by position, as the picking loop itself does.
lasts.iloc[user_ids].answerCode

In [None]:
train_df.groupby('userID').cumcount()

In [None]:
# Running mean = cumulative sum / cumulative count.
# NOTE(review): `cum` is not defined anywhere in the visible notebook, so this
# cell fails on a fresh kernel. The indexing suggests a frame with two-level
# columns ('answer_mean' -> 'cumsum' / 'cumcount') — confirm and restore the
# cell that builds it.
cum['cummean'] = cum['answer_mean']['cumsum'] / cum['answer_mean']['cumcount']

In [None]:
cum

In [None]:
# Remove each user's final row from train_df:
# 1) record every row's index label in tmp_index,
# 2) take the per-user last values (including the last tmp_index),
# 3) drop the rows whose labels appear there.
# train_df is reassigned, so re-running this cell drops one more row per user.
train_df['tmp_index'] = train_df.index
last_index = train_df.groupby('userID').last()
train_df = train_df.drop(index=last_index['tmp_index'])

In [None]:
last_index = train_df.groupby('userID').last()

In [None]:
last_index.reset_index()