### Setup + Read Data

In [None]:
# Imports and Constants
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
# Directory that holds the metadata CSV files
BASE = 'data/metadata/'
# Filename suffix appended to the train and question-metadata file names
EXT = '_task_1_2.csv'

In [None]:
# Get data (only do this once!)
!wget https://dqanonymousdata.blob.core.windows.net/neurips-public/data.zip
!unzip data.zip

In [None]:
# Training Data: load the answer records and sanity-check them
TRAIN = pd.read_csv('data/train_data/train' + EXT)
# Every row must carry its own AnswerId
assert TRAIN['AnswerId'].is_unique
# (The same uniqueness check on the student column was tried and fails, so it is omitted.)
# IsCorrect is a 0/1 flag ...
assert TRAIN['IsCorrect'].isin([0, 1]).all()
# ... and it agrees with comparing the given answer against the correct one
assert (TRAIN['IsCorrect'] == TRAIN['AnswerValue'].eq(TRAIN['CorrectAnswer'])).all()
TRAIN.head()

In [None]:
# Subject Data: load the subject metadata and sanity-check it
SUBJECT = pd.read_csv(BASE + 'subject_metadata.csv')
# SubjectId and Name must each identify a subject uniquely
assert SUBJECT['SubjectId'].is_unique
assert SUBJECT['Name'].is_unique
# Every ParentId must be either missing (NaN) or a whole number.
# Checked vectorised: referencing `f.is_integer` without calling it yields a
# bound method, which is always truthy and would let any value pass.
assert (SUBJECT['ParentId'].isna() | SUBJECT['ParentId'].mod(1).eq(0)).all()
SUBJECT.head()

In [None]:
# Question Data: load the question metadata and sanity-check it
QUESTION = pd.read_csv(BASE + 'question_metadata' + EXT)
# SubjectId arrives as text; drop the first and last character (the enclosing
# brackets) and parse the ', '-separated pieces as integers.
QUESTION['SubjectId'] = QUESTION['SubjectId'].map(
    lambda raw: [int(token) for token in raw[1:-1].split(', ')])
assert QUESTION['QuestionId'].is_unique
assert QUESTION['SubjectId'].map(lambda ids: type(ids) is list).all()
assert QUESTION['SubjectId'].map(bool).all()  # SubjectId not empty
QUESTION.head()

### Review Data

In [None]:
# Set DATA variable: the cells in this section inspect whichever frame DATA
# is bound to (SUBJECT here); rebind it to review a different table.
DATA = SUBJECT

In [None]:
# Basic information: the container type and the (rows, columns) shape of DATA
print(f'What is the type of the data: {type(DATA)}')
print(f'Structure of the data: {DATA.shape}')

In [None]:
# General information: DataFrame.info() prints a concise summary of DATA
# (index, columns, non-null counts, dtypes and memory usage)
DATA.info()

In [None]:
# First 5 entries (head() returns 5 rows by default); as the last expression
# of the cell the frame is rendered as the cell output
DATA.head()

In [None]:
# Continue exploring the data here ...

### Analysis: Understanding Trends

In [None]:
# Per-question accuracy: mean of IsCorrect, number of answers and number of
# correct answers, ordered from the lowest to the highest mean.
QUESTION_INFO = (TRAIN.groupby('QuestionId')['IsCorrect']
                 .agg(['mean', 'count', 'sum'])
                 .sort_values(by='mean'))
plt.plot(QUESTION_INFO['mean'].values)
plt.title('Mean of correct answers per question')
# Each x position is one question (its rank after sorting by mean), not a user
plt.xlabel('Question Rank (sorted by mean)')
plt.ylabel('Average Correct Answers')
plt.grid(color='gray', linestyle='-', linewidth=0.1)
plt.show()

### Prepare the data

In [None]:
# Encode the subjects into each question
# Lookup table: SubjectId -> subject Name
subject_map = dict(zip(SUBJECT.SubjectId, SUBJECT.Name))
# One row per (question, subject) pair, translated to the subject name
subject_names = QUESTION.SubjectId.explode().apply(lambda sid: subject_map[sid])
# Indicator columns per subject name, collapsed back to one boolean row per question
subject_flags = subject_names.str.get_dummies().groupby(level=0).sum().astype(bool)
QUESTION = QUESTION.join(subject_flags)
# Spot check: the first question's own subjects followed by two extra ids
temp = QUESTION.iloc[0]['SubjectId'] + [1189, 130]
QUESTION = QUESTION.drop(columns=['SubjectId']).set_index('QuestionId')
for subject_id in temp:  # Last two should be False
    print(QUESTION[subject_map[subject_id]].iloc[0])

In [None]:
# Create STUDENT features (average of correct answers per subject)
# Base columns: overall accuracy and number of answered questions per user
STUDENT = TRAIN.groupby('UserId')['IsCorrect'].agg(average='mean', num_questions='count')
subject_averages = []
for subject_name in subject_map.values():  # Takes about 3 minutes
    # Ids of the questions tagged with this subject (QUESTION is indexed by QuestionId)
    question_ids = QUESTION.index[QUESTION[subject_name] == True]
    in_subject = TRAIN['QuestionId'].isin(question_ids)
    # Per-user accuracy on those questions, named after the subject
    subject_averages.append(
        TRAIN.loc[in_subject].groupby('UserId')['IsCorrect'].mean().rename(subject_name))
# Concatenate once instead of growing STUDENT column-by-column inside the loop.
# Users without answers in a subject get NaN in that subject's column.
STUDENT = pd.concat([STUDENT, *subject_averages], axis=1)

In [None]:
# High use values: sizes that later cells refer to
FEATURE_COUNT = len(subject_map)  # one entry per subject in subject_map
QUESTION_COUNT = len(QUESTION)    # number of rows in QUESTION
STUDENT_COUNT = len(STUDENT)      # number of rows in STUDENT
print(FEATURE_COUNT, QUESTION_COUNT, STUDENT_COUNT)

In [None]:
# Training Data: Input (X) and Output (Y)
# The joins deliberately do NOT pass sort=True: sorting by the join key would
# order X_student by UserId and X_question by QuestionId while Y keeps TRAIN's
# order, so row i of the three objects would describe different answers.
try:  # takes 3 minutes
    # Drop the first 8 columns of the joined frame, keeping the remaining STUDENT columns
    X_student = TRAIN.join(STUDENT, on='UserId', how='inner').iloc[:, 8:]
    # Drop the first 6 columns of the joined frame, keeping the remaining QUESTION columns
    X_question = TRAIN.join(QUESTION, on='QuestionId', how='inner').iloc[:, 6:]
    Y = TRAIN['IsCorrect']
    # An inner join drops TRAIN rows whose key is missing on the other side;
    # make sure that did not desynchronise the inputs and the target.
    assert len(X_student) == len(X_question) == len(Y), 'X/Y row counts differ'
    print('Training data created')
    # NOTE(review): STUDENT holds NaN for subjects a user never answered, so
    # X_student may contain NaN - confirm how these are handled before fitting.
except Exception as err:
    # Keep the cell non-fatal as before, but report what actually went wrong
    print('Some error occurred')
    print(f'{type(err).__name__}: {err}')

### Machine Learning: Content-Based Filtering

In [None]:
# SCALE THE DATA

In [None]:
# Create the model: a user network and an item network with the same layout.
# Their L2-normalised outputs are combined with a dot product, and the model is
# compiled with Adam and binary cross-entropy.
num_outputs = 32


def _build_tower(embedding_size):
    """Return a new Sequential network: Dense 256 -> Dense 128 -> Dense embedding_size."""
    return tf.keras.models.Sequential([
        tf.keras.layers.Dense(units=256, activation='relu'),
        tf.keras.layers.Dense(units=128, activation='relu'),
        tf.keras.layers.Dense(units=embedding_size, activation='sigmoid'),
    ])


user_NN = _build_tower(num_outputs)
item_NN = _build_tower(num_outputs)
# create the user input and point to the base network
# (shape must be a tuple: `(FEATURE_COUNT)` is a plain int, `(FEATURE_COUNT,)` is a 1-tuple)
input_user = tf.keras.layers.Input(shape=(FEATURE_COUNT,))
vu = tf.linalg.l2_normalize(user_NN(input_user), axis=1)
# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(FEATURE_COUNT,))
vm = tf.linalg.l2_normalize(item_NN(input_item), axis=1)
# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])
# specify the inputs and output of the model and compile
model = keras.Model([input_user, input_item], output)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01),
              loss=keras.losses.BinaryCrossentropy())
model.summary()

In [None]:
# Generate Training Data - Kernel Crashes
# No sort=True on the joins: sorting by the join key would order X_student by
# UserId and X_question by QuestionId, so their rows would no longer line up
# with each other or with Y (which keeps TRAIN's order).
X_student = TRAIN.join(STUDENT, on='UserId', how='inner').iloc[:, 8:]
X_question = TRAIN.join(QUESTION, on='QuestionId', how='inner').iloc[:, 6:]
Y = TRAIN['IsCorrect']

In [None]:
# Train the model (takes too much time)
# Split row positions instead of the DataFrame itself: np.array_split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
# The chunk boundaries are the same 30 near-equal consecutive sections.
for rows in np.array_split(np.arange(len(TRAIN)), 30):
    if len(rows) == 0:  # fewer rows than sections: nothing to fit on
        continue
    chunk = TRAIN.iloc[rows]
    # No sort=True on the joins, so both inputs keep the chunk's row order and
    # stay aligned with Y; sorting by UserId / QuestionId would shuffle them
    # independently of each other.
    X_student = chunk.join(STUDENT, on='UserId', how='inner').iloc[:, 8:]
    X_question = chunk.join(QUESTION, on='QuestionId', how='inner').iloc[:, 6:]
    Y = chunk['IsCorrect']
    # NOTE(review): each chunk is fitted for 10 epochs before the next chunk is
    # seen - confirm this schedule is intended rather than epochs over all chunks.
    model.fit([X_student, X_question], Y, epochs=10)