### Setup: Run once

In [None]:
# Imports and Constants
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

In [None]:
# Get data
!wget https://dqanonymousdata.blob.core.windows.net/neurips-public/data.zip
!unzip data.zip

In [None]:
# Break cell execution: Credit https://stackoverflow.com/questions/24005221/ipython-notebook-early-exit-from-cell/56953105#56953105
class StopExecution(Exception):
    """Exception raised to abandon the rest of the current cell.

    Per the linked answer, the traceback hook below renders nothing, so the
    cell is meant to stop without printing an error trace.
    """

    def _render_traceback_(self):
        # Deliberately produce no traceback output.
        return None

### Read Data

In [None]:
# Training Data
TRAIN = (
    pd.read_csv('data/train_data/train_task_1_2.csv')
    .rename(columns={'UserId': 'StudentId'})
)
# Each row is one answer, so AnswerId must identify rows uniquely.
assert TRAIN['AnswerId'].is_unique
# StudentId is NOT unique (the equivalent check on it fails).
# IsCorrect is a 0/1 flag ...
assert TRAIN['IsCorrect'].isin([0, 1]).all()
# ... and it must agree with comparing the given answer to the correct one.
answer_matches = TRAIN['AnswerValue'].eq(TRAIN['CorrectAnswer'])
assert TRAIN['IsCorrect'].eq(answer_matches).all()
display(TRAIN)

In [None]:
# Subject Data
# Only subjects with Level < 3 are kept; the rest of the notebook uses exactly
# these rows as its feature set.
SUBJECT = pd.read_csv('data/metadata/subject_metadata.csv').query('Level < 3')
assert SUBJECT['SubjectId'].is_unique
assert SUBJECT['Name'].is_unique
# ParentId must be either missing or a whole number. The previous check
# referenced `f.is_integer` without calling it, so it was always truthy and
# could never fail; this vectorized form actually tests the values.
parent_ids = SUBJECT['ParentId']
assert (parent_ids.isna() | parent_ids.mod(1).eq(0)).all()
display(SUBJECT)

In [None]:
# Question Data
QUESTION = pd.read_csv('data/metadata/question_metadata_task_1_2.csv')
# Built once so every membership test below is a constant-time set lookup
# instead of a scan over the SubjectId array.
KEPT_SUBJECT_IDS = set(SUBJECT['SubjectId'])
def limit_subjects(subject_list): # Use only level<3 subjects
  """Return the ids from `subject_list` that are present in SUBJECT, order preserved."""
  return [x for x in subject_list if x in KEPT_SUBJECT_IDS]
# The SubjectId column is read as text of the form "[a, b, c]"; strip the
# brackets, split on ", " and convert each piece to int before filtering.
QUESTION['SubjectId'] = [limit_subjects(list(map(int,x[1:-1].split(', '))))
                         for x in QUESTION['SubjectId']]
assert QUESTION['QuestionId'].is_unique
assert QUESTION['SubjectId'].map(lambda ids: type(ids) == list).all()
assert QUESTION['SubjectId'].map(bool).all() # SubjectId not empty
# Every question in the metadata is expected to appear in TRAIN (count check).
assert len(TRAIN['QuestionId'].unique())==QUESTION.shape[0]
QUESTION = QUESTION.set_index('QuestionId').sort_index()
display(QUESTION)

### Prepare the data

In [None]:
# Encode the subjects into each question
# SubjectId -> subject Name lookup for the retained subjects.
subject_map = dict(zip(SUBJECT.SubjectId, SUBJECT.Name))
# One row per (question, subject) pair, labelled with the subject name ...
subject_names = QUESTION.SubjectId.explode().apply(lambda sid: subject_map[sid])
# ... turned into indicator columns and collapsed back to one boolean row per question.
subject_flags = subject_names.str.get_dummies().groupby(level=0).sum().astype(bool)
QUESTION = QUESTION.join(subject_flags)
# Spot check on the first question: its own subjects must be flagged True,
# while subjects 1189 and 130 must be False. The expected list below hard-codes
# that the first question has exactly three retained subjects.
temp = QUESTION.iloc[0]['SubjectId'] + [1189, 130]
results = []
for sid in temp:
    results.append(QUESTION[subject_map[sid]].iloc[0])
assert results == [True, True, True, False, False]
# The raw id lists are no longer needed once the indicator columns exist.
QUESTION = QUESTION.drop(columns=['SubjectId'])
n_questions, n_columns = QUESTION.shape
assert n_columns == SUBJECT.shape[0]
assert max(QUESTION.index) == n_questions - 1
assert QUESTION.any().any()
display(QUESTION)

In [None]:
# Create STUDENT features (average of correct answers per subject)
# Start with each student's overall mean correctness and answer count.
STUDENT = TRAIN.groupby('StudentId')['IsCorrect'].agg(average='mean', num_questions='count')
assert len(TRAIN['StudentId'].unique())==STUDENT.shape[0]
assert max(STUDENT.index) == STUDENT.shape[0] - 1
# Collect one named Series per subject and concatenate once at the end;
# concatenating inside the loop re-copies the growing frame on every pass.
subject_averages = []
for subject_name in sorted(subject_map.values()):  # same name-sorted order as before
    # Questions tagged with this subject.
    question_ids = QUESTION.index[QUESTION[subject_name].to_numpy()]
    in_subject = TRAIN['QuestionId'].isin(question_ids)
    # Mean correctness per student over just those questions; students with no
    # answers in the subject are absent here and become NaN after the concat.
    subject_averages.append(
        TRAIN.loc[in_subject].groupby('StudentId')['IsCorrect'].mean().rename(subject_name))
STUDENT = pd.concat([STUDENT, *subject_averages], axis=1)
assert STUDENT.shape[1] - 2 == SUBJECT.shape[0]
display(STUDENT)

In [None]:
# High use values
FEATURE_COUNT = len(subject_map)  # one entry per retained subject
QUESTION_COUNT, STUDENT_COUNT = len(QUESTION), len(STUDENT)
average = TRAIN['IsCorrect'].mean()  # overall mean of IsCorrect across TRAIN
print(FEATURE_COUNT, QUESTION_COUNT, STUDENT_COUNT, average)

### Review Data

In [None]:
# Set DATA variable
# DATA is another name for the SUBJECT frame (not a copy); the review cells
# below inspect whatever DATA is bound to.
DATA = SUBJECT

In [None]:
# Basic information
# Report the Python type and the (rows, columns) shape of the frame under review.
print(f'What is the type of the data: {type(DATA)}')
print(f'Structure of the data: {DATA.shape}')

In [None]:
# General information
# Calls info() on the frame currently bound to DATA.
DATA.info()

In [None]:
# First 5 entries
# head() is the cell's last expression, so the notebook displays its result.
DATA.head()

In [None]:
# Continue exploring the data here ...

### Analysis: Understanding Trends

In [None]:
# Naive Solution: Predictions are the average answer for a question
# With test_nn False, the "Test the model" cell takes its naive branch.
test_nn = False
print('RUN "TEST THE MODEL" CELLS TO TEST THE NAIVE SOLUTION')
# Mean of IsCorrect per QuestionId, indexed by QuestionId.
NAIVE_PRED = TRAIN.groupby('QuestionId').IsCorrect.mean()
print(NAIVE_PRED)

In [None]:
# Distribution of correct answers
# Per-question mean, response count and number of correct answers, ordered
# from the lowest mean to the highest.
QUESTION_INFO = TRAIN.groupby('QuestionId')['IsCorrect'].agg(['mean', 'count', 'sum'])
QUESTION_INFO = QUESTION_INFO.sort_values(by='mean')
fig, ax = plt.subplots()
ax.plot(QUESTION_INFO['mean'].values)
ax.set_title('Mean of correct answers per question')
# Each x position is one question's rank in the sorted order, not a user;
# the previous 'User Number' label was wrong.
ax.set_xlabel('Question rank (sorted by mean)')
ax.set_ylabel('Average Correct Answers')
ax.grid(color='gray', linestyle='-', linewidth=0.1)
plt.show()

In [None]:
# Subject Integrity: Quantity vs. Diversity
# Responses per question, kept as a Series indexed by QuestionId so the
# multiplication below aligns on QuestionId rather than on row position.
responses_per_question = TRAIN.groupby('QuestionId')['QuestionId'].count()
num_responces = responses_per_question.values  # original array form, kept under its original name
# quantity: total responses received by questions tagged with each subject.
quantity = QUESTION.mul(responses_per_question, axis=0).sum()
# diversity: number of questions tagged with each subject.
diversity = QUESTION.sum(axis=0).values
fig, ax = plt.subplots()
ax.scatter(diversity, quantity)
ax.set_title('Understanding the Subjects')
ax.set_xlabel('Number of questions about the subject')
ax.set_ylabel('Total student responses to the subject')
ax.grid(color='gray', linestyle='-', linewidth=0.1)
plt.show()

### Machine Learning: Content-Based Filtering

In [None]:
# Create the model: Credit Coursera Machine Learning Specialization
num_outputs = 32  # length of the vector each tower produces

def _build_tower():
    """Return a new 256 -> 128 -> num_outputs dense stack with its own weights."""
    return tf.keras.models.Sequential([
        tf.keras.layers.Dense(units=256, activation='relu'),
        tf.keras.layers.Dense(units=128, activation='relu'),
        tf.keras.layers.Dense(units=num_outputs, activation='sigmoid'),
    ])

# Two towers with identical architecture but independent weights.
student_NN = _build_tower()
question_NN = _build_tower()
# create the user input and point to the base network
# shape must be a tuple: `(FEATURE_COUNT)` is just an int in parentheses.
input_student = tf.keras.layers.Input(shape=(FEATURE_COUNT,))
vs = tf.linalg.l2_normalize(student_NN(input_student), axis=1)
# create the item input and point to the base network
input_question = tf.keras.layers.Input(shape=(FEATURE_COUNT,))
vq = tf.linalg.l2_normalize(question_NN(input_question), axis=1)
# compute the dot product of the two vectors vs and vq
# (both are unit length, so this is their cosine similarity)
output = tf.keras.layers.Dot(axes=1)([vs, vq])
# specify the inputs and output of the model and compile
loss_fn = keras.losses.BinaryCrossentropy()
model = keras.Model([input_student, input_question], output)
# Pass the loss object built above instead of repeating it as a string.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
              loss=loss_fn)
model.summary()

In [None]:
# Train the model
test_nn = True  # the "Test the model" cell will evaluate the neural network
# Subject-name columns of QUESTION; selecting the same names from STUDENT
# guarantees both inputs use the same feature order, with no positional offsets.
feature_cols = QUESTION.columns
assert len(feature_cols) == SUBJECT.shape[0]
for epoch in range(10):
    print("EPOCH", epoch + 1)
    # Split row positions rather than the frame itself, then slice TRAIN.
    for rows in np.array_split(np.arange(len(TRAIN)), 10):
        chunk = TRAIN.iloc[rows]
        # Look features up by id in the chunk's own row order. The previous
        # join(..., sort=True) sorted X_student by StudentId and X_question by
        # QuestionId while Y stayed in chunk order, so row i of the three no
        # longer described the same answer.
        X_student = (STUDENT.loc[chunk['StudentId'], feature_cols]
                     .fillna(average)  # student has no answers in that subject
                     .to_numpy(dtype='float32'))
        X_question = QUESTION.loc[chunk['QuestionId'], feature_cols].to_numpy(dtype='float32')
        assert X_student.shape[1] == X_question.shape[1] == SUBJECT.shape[0]
        Y = chunk['IsCorrect'].to_numpy()
        assert X_student.shape[0] == X_question.shape[0] == len(Y)
        model.fit([X_student, X_question], Y)
        print('Chunk average:', chunk['IsCorrect'].mean())
        # raise StopExecution

### Test the model

In [None]:
# Test Data
TEST = (
    pd.read_csv('data/test_data/test_public_answers_task_1.csv')
    .rename(columns={'UserId': 'StudentId'})
)
# The test set may not contain more distinct questions than QUESTION has rows.
assert len(TEST['QuestionId'].unique()) <= len(QUESTION)
# Every test student must already have a row in STUDENT.
assert TEST['StudentId'].isin(STUDENT.index).all()
display(TEST)

In [None]:
# Test the Model
pred_correct, pred_count = 0, 0
conf_mtx = [[0, 0], [0, 0]] # confusion matrix, indexed conf_mtx[true][predicted]
# Subject-name columns shared by STUDENT and QUESTION, in QUESTION's order.
feature_cols = QUESTION.columns
# Split row positions rather than the frame itself, then slice TEST.
for rows in np.array_split(np.arange(len(TEST)), 5):
    # Work on a copy so adding the Submission column never touches TEST.
    chunk = TEST.iloc[rows].copy()
    if test_nn: # Test Neural Network
        # Look features up by id in the chunk's own row order. The previous
        # join(..., sort=True) sorted each input by its join key, so the
        # predictions did not line up with the rows they were assigned to.
        X_student = (STUDENT.loc[chunk['StudentId'], feature_cols]
                     .fillna(average)  # student has no answers in that subject
                     .to_numpy(dtype='float32'))
        X_question = QUESTION.loc[chunk['QuestionId'], feature_cols].to_numpy(dtype='float32')
        assert X_student.shape[1] == X_question.shape[1] == SUBJECT.shape[0]
        assert X_student.shape[0] == X_question.shape[0] == len(chunk)
        # Flatten the model output to one value per row before rounding to 0/1.
        chunk['Submission'] = model.predict([X_student, X_question]).ravel().round()
        assert chunk['Submission'].isin([0, 1]).all()
    else: # Test Naive Model
        chunk['Submission'] = chunk['QuestionId'].map(NAIVE_PRED).round()
    # Accumulate the four (true, predicted) cell counts for this chunk.
    for true_label in (0, 1):
        for pred_label in (0, 1):
            conf_mtx[true_label][pred_label] += np.sum(
                (chunk['IsCorrect'] == true_label) & (chunk['Submission'] == pred_label))
    pred_correct += np.sum(chunk['IsCorrect']==chunk['Submission'])
    pred_count += chunk.shape[0]
    # if test_nn:
    #     raise StopExecution

In [None]:
# Print Results: Credit Eedi Starter Kit
accuracy = pred_correct / pred_count
print("accuracy:", accuracy)
print("correct:", pred_correct, "  ", "total:", pred_count)
# Turn the counts into fractions of all predictions. conf_mtx is rebound here,
# so running this cell again without re-running the test cell divides again.
conf_mtx = np.asarray(conf_mtx) / pred_count
row_labels = [f'true_{label}' for label in (0, 1)]
col_labels = [f'pred_{label}' for label in (0, 1)]
display(pd.DataFrame(conf_mtx, index=row_labels, columns=col_labels))