In [None]:
import pandas as pd

In [None]:
def feature_engineering(df):
    """Derive per-interaction features from a raw interaction log.

    The input frame is NOT modified: all work happens on a new frame, so the
    function can be called repeatedly on the same raw data (e.g. when the
    notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``userID``, ``assessmentItemID``, ``testId``,
        ``answerCode``, ``Timestamp`` and ``KnowledgeTag``. Any other columns
        are ignored.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``user`` then ``timestamp`` (original index labels are
        kept), with the columns:

        * ``user`` -- renamed ``userID``
        * ``category`` -- characters 1..3 of ``assessmentItemID`` as int
        * ``category_test`` -- characters 1..6 of ``assessmentItemID`` as int
        * ``category_test_problem`` -- characters 1..9 of ``assessmentItemID`` as int
        * ``problem_tag`` -- ``KnowledgeTag`` as int
        * ``problem_time`` -- seconds from this row's timestamp to the same
          user's next row (0.0 for the user's last row)
        * ``break_time`` -- seconds from the same user's previous row to this
          row (0.0 for the user's first row)
        * ``answer`` -- renamed ``answerCode``

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    """
    column_name_dict = {
        'userID': 'user',
        'answerCode': 'answer',
        'Timestamp': 'timestamp',
        'KnowledgeTag': 'problem_tag'
    }

    # Characters 1..9 of the item id (the leading character is skipped).
    item_code = df['assessmentItemID'].str[1:10]

    # drop()/rename() without inplace return new frames, so the caller's
    # DataFrame keeps its original columns and row order.
    out = (
        df.drop(columns=['assessmentItemID', 'testId'])
        .rename(columns=column_name_dict)
    )

    out['category'] = item_code.str[0:3].astype(int)
    out['category_test'] = item_code.str[0:6].astype(int)
    out['category_test_problem'] = item_code.astype(int)
    out['problem_tag'] = out['problem_tag'].astype(int)
    out['timestamp'] = pd.to_datetime(out['timestamp'])

    out = out.sort_values(by=['user', 'timestamp'])

    timestamps_by_user = out.groupby('user')['timestamp']

    # Gap to the same user's next row; NaT on each user's last row -> 0.0.
    time_to_next = timestamps_by_user.shift(-1) - out['timestamp']
    out['problem_time'] = time_to_next.dt.total_seconds().fillna(0.0)

    # Gap from the same user's previous row; NaT on each user's first row -> 0.0.
    time_since_previous = timestamps_by_user.diff()
    out['break_time'] = time_since_previous.dt.total_seconds().fillna(0.0)

    return out[['user', 'category', 'category_test', 'category_test_problem', 'problem_tag', 'problem_time', 'break_time', 'answer']]

In [None]:
# Read the raw train and test CSVs (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/opt/ml/input/data/train_data.csv',
        '/opt/ml/input/data/test_data.csv',
    )
)

In [None]:
# Tag every row with its split of origin: 1 for train rows, 0 for test rows.
# NOTE(review): feature_engineering() does not include 'train' in the columns
# it returns, so this flag is discarded once the frames are passed through it.
for _frame, _is_train in ((train, 1), (test, 0)):
    _frame['train'] = _is_train

In [None]:
# Replace both raw frames with their engineered feature frames
# (train is processed first, then test).
train, test = feature_engineering(train), feature_engineering(test)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit one LabelEncoder per categorical feature.
#
# The encoders are fitted on the labels of BOTH splits: LabelEncoder.transform
# raises ValueError for a label it did not see during fit, so fitting on train
# alone makes the later transform of `test` fail as soon as test contains a
# category / test / problem / tag that never occurs in train. When every test
# label also occurs in train, the fitted classes (and therefore the integer
# codes) are exactly the same as when fitting on train only.
le_category = LabelEncoder().fit(
    pd.concat([train['category'], test['category']], ignore_index=True)
)
le_category_test = LabelEncoder().fit(
    pd.concat([train['category_test'], test['category_test']], ignore_index=True)
)
le_category_test_problem = LabelEncoder().fit(
    pd.concat([train['category_test_problem'], test['category_test_problem']], ignore_index=True)
)
le_problem_tag = LabelEncoder().fit(
    pd.concat([train['problem_tag'], test['problem_tag']], ignore_index=True)
)

In [None]:
# Replace each categorical column with its integer code, in place, using the
# encoder fitted for that column. train is encoded first, then test.
_column_encoders = (
    ('category', le_category),
    ('category_test', le_category_test),
    ('category_test_problem', le_category_test_problem),
    ('problem_tag', le_problem_tag),
)

for _frame in (train, test):
    for _column, _encoder in _column_encoders:
        _frame[_column] = _encoder.transform(_frame[_column])

In [None]:
import pickle

In [None]:
def _user_sequences(x):
    """Pack one user's rows into a tuple of per-column value arrays.

    The tuple order is: category, category_test, category_test_problem,
    problem_tag, problem_time, break_time, answer.
    """
    sequence_columns = (
        'category',
        'category_test',
        'category_test_problem',
        'problem_tag',
        'problem_time',
        'break_time',
        'answer',
    )
    return tuple(x[column].values for column in sequence_columns)


# One entry per user: the tuple of arrays built by _user_sequences.
train_group = train.groupby('user').apply(_user_sequences)
test_group = test.groupby('user').apply(_user_sequences)

In [None]:
# Serialise the per-user groups with pickle, train first and then test.
# NOTE(review): the files carry a ".zip" suffix but are written as plain,
# uncompressed pickle streams -- confirm that whatever loads them expects that.
for _target_path, _group in (
    ("/opt/ml/input/data/train_group.pkl.zip", train_group),
    ("/opt/ml/input/data/test_group.pkl.zip", test_group),
):
    with open(_target_path, 'wb') as pick:
        pickle.dump(_group, pick)