In [13]:
import pandas as pd
import numpy as np
import psutil
from tqdm import tqdm

In [2]:
# Show the share of system memory currently in use (percent) before loading any data
memory_snapshot = psutil.virtual_memory()
memory_snapshot.percent

37.0

### Pre-define optimal data types of the columns

In [3]:
%%time

# Read the train.csv dataset

# Define the data types
dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16", 
    "content_type_id": "boolean", 
    "task_container_id": "int16",
    "user_answer": "int8", 
    "answered_correctly": "int8", 
    "prior_question_elapsed_time": "float32", 
    "prior_question_had_explanation": "boolean"    
}

# Import the records of all users
df_train = pd.read_csv("data_allusers/train.csv", dtype=dtypes)

# Print the memory use
print("Momoery usage after import train.csv: ", psutil.virtual_memory().percent)

# Read the example_test.csv dataset
df_test = pd.read_csv("data_allusers/example_test.csv", dtype=dtypes)

# Print the memory use
print("Momoery usage after import example_test.csv: ", psutil.virtual_memory().percent)

# Read the questions.csv
df_ques = pd.read_csv("data_allusers/questions.csv")

# Print the memory use
print("Momoery usage after import questions.csv: ", psutil.virtual_memory().percent)

# Read the lectures.csv
df_lecs = pd.read_csv("data_allusers/lectures.csv")

# Print the memory use
print("Momoery usage after import lectures.csv: ", psutil.virtual_memory().percent)

# Print the number of rows and columns in all datasets
df_train.shape, df_test.shape, df_ques.shape, df_lecs.shape

Momoery usage after import train.csv:  70.6
Momoery usage after import example_test.csv:  70.4
Momoery usage after import questions.csv:  70.5
Momoery usage after import lectures.csv:  70.5
Wall time: 5min 11s


((101230332, 10), (104, 11), (13523, 7), (418, 4))

### Brief Summary of `train.csv`

In [4]:
# Preview the first 5 rows of train.csv
df_train.head(5)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,False,1,3,1,,
1,1,56943,115,5716,False,2,2,1,37000.0,False
2,2,118363,115,128,False,0,0,1,55000.0,False
3,3,131167,115,7860,False,3,0,1,19000.0,False
4,4,137965,115,7922,False,4,1,1,11000.0,False


In [5]:
# Preview the last 5 rows of train.csv
df_train.tail(5)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
101230327,101230327,428564420,2147482888,3586,False,22,0,1,18000.0,True
101230328,101230328,428585000,2147482888,6341,False,23,3,1,14000.0,True
101230329,101230329,428613475,2147482888,4212,False,24,3,1,14000.0,True
101230330,101230330,428649406,2147482888,6343,False,25,1,0,22000.0,True
101230331,101230331,428692118,2147482888,7995,False,26,3,1,29000.0,True


**Takeaways**
- **row_id**: ID for the row
- **timestamp**: the time between this user interaction and the first event from that user
- **content_id**: ID code for the user interaction: either answering a question or reviewing a lecture.
- **content_type_id**: 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.
- **task_container_id**: ID code for the batch of questions or lectures.
- **user_answer**: the user's answer to the question. -1 if the user was watching a lecture.
- **answered_correctly**: if the user responded correctly. -1 if the user was watching a lecture.
- **prior_question_elapsed_time**: the average time it took a user to answer each question in the previous bundle, ignoring any lectures in between.

In [6]:
# Number of distinct row ids; this matches the row count reported by df_train.shape
df_train["row_id"].nunique()

101230332

In [7]:
# Number of distinct users in train.csv (about 390,000)
df_train["user_id"].nunique()

393656

In [8]:
# Number of distinct content ids in train.csv (13782)
df_train["content_id"].nunique()

13782

In [9]:
# Count the interaction rows per content type (False = question, True = lecture).
# These are row counts, not counts of unique questions or lectures.
df_train["content_type_id"].value_counts()

False    99271300
True      1959032
Name: content_type_id, dtype: Int64

In [10]:
# Number of distinct task containers in train.csv
df_train["task_container_id"].nunique()

10000

In [11]:
# Count the interaction rows per user answer (-1 marks lecture rows)
df_train["user_answer"].value_counts()

 0    28186489
 1    26990007
 3    26084784
 2    18010020
-1     1959032
Name: user_answer, dtype: int64

### Brief Summary of `questions.csv`

In [12]:
# Preview the first 5 rows of questions.csv
df_ques.head(5)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


**Takeaways**
- question_id: foreign key for the train/test content_id column, when the content type is 0.
- bundle_id: code for which questions are served together.
- correct_answer: the answer to the question. Can be compared with the train `user_answer` column to check if the user was right.
- part: the relevant section of the TOEIC test.
- tags: one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

In [19]:
# Number of distinct question ids in questions.csv
df_ques["question_id"].nunique()

13523

In [20]:
# Number of distinct bundle ids in questions.csv
df_ques["bundle_id"].nunique()

9765

In [21]:
# Number of questions per correct-answer option
df_ques["correct_answer"].value_counts()

0    3716
3    3544
1    3478
2    2785
Name: correct_answer, dtype: int64

In [22]:
# Number of questions in each part (one output row per part present in questions.csv)
df_ques["part"].value_counts()

5    5511
2    1647
3    1562
4    1439
6    1212
7    1160
1     992
Name: part, dtype: int64

#### Generate a column to hold the total number of tags for each question
- Hypothesis 1: the number of tags reflects how ??? the question is.
- Hypothesis 2: the more ??? the question is, the lower the accuracy on that question.

In [52]:
# Split the space-separated tags string into one column per tag slot,
# label the slots 1-6, and mark empty slots with the filler value -1
# (not 0, so a filler can never be confused with a real tag code).
df_tags = (
    df_ques["tags"]
    .str.split(" ", expand=True)
    .set_axis([1, 2, 3, 4, 5, 6], axis=1)
    .fillna(-1)
)

# Display the per-slot tag table
df_tags

Unnamed: 0,1,2,3,4,5,6
0,51,131,162,38,-1,-1
1,131,36,81,-1,-1,-1
2,131,101,162,92,-1,-1
3,131,149,162,29,-1,-1
4,131,5,162,38,-1,-1
...,...,...,...,...,...,...
13518,14,-1,-1,-1,-1,-1
13519,8,-1,-1,-1,-1,-1
13520,73,-1,-1,-1,-1,-1
13521,125,-1,-1,-1,-1,-1


In [53]:
# Turn every tag slot into a presence flag: 1 where the slot holds a tag,
# 0 where it holds the -1 filler.
#
# One vectorized comparison over the whole frame replaces the per-column
# Python-level apply, whose lambda parameter `i` shadowed the loop variable `i`.
# NOTE: this cell expects df_tags to still hold the raw split values; running it
# a second time would flag every slot as 1, so re-run the split cell first.
df_tags = df_tags.ne(-1).astype("int64")

# Display the flag table
df_tags

Unnamed: 0,1,2,3,4,5,6
0,1,1,1,1,0,0
1,1,1,1,0,0,0
2,1,1,1,1,0,0
3,1,1,1,1,0,0
4,1,1,1,1,0,0
...,...,...,...,...,...,...
13518,1,0,0,0,0,0
13519,1,0,0,0,0,0
13520,1,0,0,0,0,0
13521,1,0,0,0,0,0


In [54]:
# Sum the presence flags across each row to get the number of tags per question,
# then keep only that total by dropping the six per-slot columns.
df_tags = (
    df_tags
    .assign(tag_count=df_tags.sum(axis=1))
    .drop(columns=[1, 2, 3, 4, 5, 6])
)

# Display the tag counts
df_tags

Unnamed: 0,tag_count
0,4
1,3
2,4
3,4
4,4
...,...
13518,1
13519,1
13520,1
13521,1


In [57]:
# Attach the tag_count column to the questions table.
# assign() aligns on the row index and overwrites an existing tag_count column,
# so re-running this cell does not append a duplicate column the way
# pd.concat(..., axis=1) did.
df_ques = df_ques.assign(tag_count=df_tags["tag_count"])

# Preview the first 5 rows of df_ques
df_ques.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,tag_count
0,0,0,0,1,51 131 162 38,4
1,1,1,1,1,131 36 81,3
2,2,2,0,1,131 101 162 92,4
3,3,3,0,1,131 149 162 29,4
4,4,4,3,1,131 5 162 38,4


In [59]:
# # Save it to the csv
# df_ques.to_csv("questions.csv")

In [61]:
# Number of questions for each tag_count value
df_ques["tag_count"].value_counts()

1    6560
3    3976
4    2021
5     686
2     171
6     108
0       1
Name: tag_count, dtype: int64

### Brief Summary of `lectures.csv`

In [64]:
# Preview the first 5 rows of lectures.csv
df_lecs.head(5)

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question


In [65]:
# Number of distinct lecture ids (418)
df_lecs["lecture_id"].nunique()

418

In [72]:
# Number of distinct lecture tags (151)
df_lecs["tag"].nunique()

151

In [66]:
# Number of lectures per lecture type
df_lecs["type_of"].value_counts()

concept             222
solving question    186
intention             7
starter               3
Name: type_of, dtype: int64

In [68]:
# Number of lectures in each part
df_lecs["part"].value_counts()

5    143
6     83
2     56
1     54
7     32
4     31
3     19
Name: part, dtype: int64

In [71]:
# Add a human-readable name for each part number.

# Part number (1-7) -> part name; enumerate assigns the keys in list order
map_dict = dict(
    enumerate(
        [
            "Photographs",
            "Question-Response",
            "Conversations",
            "Talks",
            "Incomplete Sentences",
            "Text Completion",
            "Passages",
        ],
        start=1,
    )
)

df_lecs["part_content"] = df_lecs["part"].map(map_dict)

# Confirm the new column is present
df_lecs.head(5)

Unnamed: 0,lecture_id,tag,part,type_of,part_content
0,89,159,5,concept,Incomplete Sentences
1,100,70,1,concept,Photographs
2,185,45,6,concept,Text Completion
3,192,79,5,solving question,Incomplete Sentences
4,317,156,5,solving question,Incomplete Sentences


In [74]:
# # Save as csv file
# df_lecs.to_csv("lectures.csv")

### Brief Summary of `example_test.csv`

In [75]:
# Preview the first 5 rows of example_test.csv
df_test.head(5)

Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,False,0,,,[],[]
1,1,0,13309898705,554169193,12010,False,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,False,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,False,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,False,162,72400.0,True,,


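**Takeaways**
- Extra columns compared to train.csv: `group_num`, `prior_group_answers_correct`, and `prior_group_responses`; the `user_answer` and `answered_correctly` columns are not present.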

### Extract the History of Users in the Test Dataset

In [10]:
# Collect the distinct user ids of the example test dataset as a plain list
user_ids_test = list(df_test.user_id.unique())

# Report how many distinct users the example test dataset contains
print(f"There are {len(user_ids_test)} users in the example test dataset")

# Show the ids themselves
print(user_ids_test)

There are 42 users in the example test dataset
[275030867, 554169193, 1720860329, 288641214, 1728340777, 1364159702, 1521618396, 1317245193, 1700555100, 998511398, 1422853669, 1096784725, 385471210, 1202386221, 2018567473, 1233875513, 891955351, 1981166446, 1637273633, 2030979309, 319060572, 98059812, 674533997, 555691277, 775113212, 1219481379, 1148874033, 1281335472, 2002570769, 706626847, 1357500007, 1599808246, 1305988022, 1310228392, 2093197291, 1468996389, 1838324752, 2103436554, 311890082, 1817433235, 1900527744, 7792299]


In [15]:
%%time
# Compute how long my laptop will complete the task

# Subtract the history of the users in the example test dataset

# Create an empty dataframe
df = pd.DataFrame(columns=df_train.columns)

for user_id in tqdm(user_ids_test):
    if (df_train.user_id == user_id).sum() > 0: 
        mask = df_train.user_id == user_id
        user = df_train[mask]
        df = df.append(user, ignore_index=True)
    else:
        continue

# Print the shape of the df
print(df.shape)

# Print the first 5 rows in df
df.head()

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:14<00:00,  2.86it/s]

(72719, 10)
Wall time: 14.7 s





Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,25924315,0,554169193,3927,False,0,1,1,,
1,25924316,24904,554169193,939,False,1,0,1,23000.0,False
2,25924317,59273,554169193,6683,False,2,1,0,19000.0,False
3,25924318,95917,554169193,5658,False,3,3,1,30000.0,False
4,25924319,112132,554169193,6202,False,4,2,1,31000.0,False


In [72]:
# # Save df as csv
# df.to_csv("users_history_test.csv")

### The Details of the Time-series API
- The API provides user interactions groups in the order in which they occurred. Each group will contain interactions from many different users, but no more than one `task_container_id` of questions from any single user. Each group has between 1 and 1000 users.
- Expected to see roughly 2.5 million questions in the hidden test set. 