In [1]:
import pandas as pd
import numpy as np
import psutil

In [2]:
# Baseline reading: memory utilisation (percent) reported by psutil before any data is loaded
memory_stats = psutil.virtual_memory()
memory_stats.percent

58.9

### Pre-define optimal data types of the columns

In [3]:
%%time

# Load the four CSV files, printing the memory utilisation after each read.

# Explicit per-column dtypes passed to read_csv for the train and example-test files.
# The two flag columns use the nullable "boolean" dtype.
dtypes = {
    # identifiers and time
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    # answers
    "user_answer": "int8",
    "answered_correctly": "int8",
    # information about the previous question bundle
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean",
}


def memory_percent():
    """Return the memory utilisation percentage currently reported by psutil."""
    return psutil.virtual_memory().percent


# Interaction records of all users
df_train = pd.read_csv("data_allusers/train.csv", dtype=dtypes)
print("Momoery usage after import train.csv: ", memory_percent())

# Example test set, read with the same dtype mapping
df_test = pd.read_csv("data_allusers/example_test.csv", dtype=dtypes)
print("Momoery usage after import example_test.csv: ", memory_percent())

# Question metadata (dtypes inferred by pandas)
df_ques = pd.read_csv("data_allusers/questions.csv")
print("Momoery usage after import questions.csv: ", memory_percent())

# Lecture metadata (dtypes inferred by pandas)
df_lecs = pd.read_csv("data_allusers/lectures.csv")
print("Momoery usage after import lectures.csv: ", memory_percent())

# (rows, columns) of each dataset, in the order train, test, questions, lectures
tuple(frame.shape for frame in (df_train, df_test, df_ques, df_lecs))

Momoery usage after import train.csv:  40.2
Momoery usage after import example_test.csv:  37.9
Momoery usage after import questions.csv:  37.9
Momoery usage after import lectures.csv:  37.9
CPU times: user 3min 6s, sys: 10.2 s, total: 3min 17s
Wall time: 3min 18s


((101230332, 10), (104, 11), (13523, 5), (418, 4))

### Brief Summary of `train.csv`

In [4]:
# Preview the first five interaction rows of train.csv
df_train.head(n=5)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,False,1,3,1,,
1,1,56943,115,5716,False,2,2,1,37000.0,False
2,2,118363,115,128,False,0,0,1,55000.0,False
3,3,131167,115,7860,False,3,0,1,19000.0,False
4,4,137965,115,7922,False,4,1,1,11000.0,False


**Takeaways**
- row_id: ID for the row
- timestamp: the time between this user interaction and the first event from that user
- content_id: ID code for the user interaction: either answering a question or reviewing a lecture.
- content_type_id: 0 if the event was a question and 1 if the event was the user watching a lecture.
- task_container_id: ID code for the batch of questions or lectures.
- user_answer: the user's answer to the question. -1 if the user was watching a lecture.
- answered_correctly: if the user responded correctly. -1 if the user was watching a lecture.
- prior_question_elapsed_time: the average time it took the user to answer each question in the previous bundle, ignoring any lectures in between.

In [15]:
# Number of distinct row_id values (equal to the row count when row_id is unique)
df_train["row_id"].nunique()

101230332

In [10]:
# Number of distinct users in train.csv (roughly 390,000)
df_train["user_id"].nunique()

393656

In [11]:
# Number of distinct content_id values in train.csv (13782)
df_train["content_id"].nunique()

13782

In [16]:
# Number of interaction rows per content type (False = question, True = lecture).
# Note: this counts rows, not distinct questions or lectures.
df_train["content_type_id"].value_counts()

False    99271300
True      1959032
Name: content_type_id, dtype: Int64

In [17]:
# Number of distinct task_container_id values
df_train["task_container_id"].nunique()

10000

In [18]:
# Frequency of each user_answer value (-1 marks rows where the user watched a lecture)
df_train["user_answer"].value_counts()

 0    28186489
 1    26990007
 3    26084784
 2    18010020
-1     1959032
Name: user_answer, dtype: int64

### Brief Summary of `questions.csv`

In [5]:
# Preview the first five rows of questions.csv
df_ques.head(n=5)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [8]:
# Number of distinct question_id values in questions.csv
df_ques["question_id"].nunique()

13523

In [9]:
# Number of distinct bundle_id values in questions.csv
df_ques["bundle_id"].nunique()

9765

**Takeaways**
1. Two id columns:
    - question_id: foreign key for the train/test content_id column, when the content type is 0.
        - There are 13523 unique questions in the dataset

### Brief Summary of `lectures.csv`

In [13]:
# Preview the first five rows of lectures.csv
df_lecs.head(n=5)

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question


In [14]:
# Number of distinct lecture_id values in lectures.csv
df_lecs["lecture_id"].nunique()

418

### Brief Summary of `example_test.csv`

In [12]:
# Preview the first five rows of example_test.csv
df_test.head(n=5)

Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,False,0,,,[],[]
1,1,0,13309898705,554169193,12010,False,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,False,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,False,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,False,162,72400.0,True,,


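**Takeaways**
- Extra columns compared to train.csv: `group_num`, `prior_group_answers_correct`, `prior_group_responses`.
- Columns in train.csv that are absent here: `user_answer`, `answered_correctly`.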

In [25]:
# Collect the distinct user ids that appear in the example test dataset
user_ids_test = pd.unique(df_test['user_id'])

# Report how many distinct users that is
print(f"There are {user_ids_test.shape[0]} users in the example test dataset")

# Display the ids themselves
user_ids_test

There are 42 users in the example test dataset


array([ 275030867,  554169193, 1720860329,  288641214, 1728340777,
       1364159702, 1521618396, 1317245193, 1700555100,  998511398,
       1422853669, 1096784725,  385471210, 1202386221, 2018567473,
       1233875513,  891955351, 1981166446, 1637273633, 2030979309,
        319060572,   98059812,  674533997,  555691277,  775113212,
       1219481379, 1148874033, 1281335472, 2002570769,  706626847,
       1357500007, 1599808246, 1305988022, 1310228392, 2093197291,
       1468996389, 1838324752, 2103436554,  311890082, 1817433235,
       1900527744,    7792299], dtype=int32)

In [None]:
# Extract the history of the users in the example test dataset

for user in user_ids_test:
    if df_train.