Importing Libraries

In [2]:
import pandas as pd # used for dataframes
import numpy as np 
import xgboost as xgb # Gradient Boosting Algorithm
import matplotlib.pyplot as plt
import seaborn as sns
import gc as gc # Garbage Collector required to extract unused and residual data and variables from memory
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
fields1 = ['username', 'course_id', 'action', 'truth'] # specific columns to load into the dataframe
fields2 = ['username', 'course_id', 'time']
gc.enable()

Loading the data

In [4]:
data_train_action = pd.read_csv('archive/train/train.csv', usecols = fields1, nrows = 14582760) # load specific columns from train.csv
data_train_action.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14582760 entries, 0 to 14582759
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   username   int64 
 1   course_id  object
 2   action     object
 3   truth      int64 
dtypes: int64(2), object(2)
memory usage: 445.0+ MB


In [5]:
data_train_time = pd.read_csv('archive/train/train.csv', usecols = fields2, nrows = 14582760) # load specific columns from train.csv
data_train_time.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14582760 entries, 0 to 14582759
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   username   int64 
 1   course_id  object
 2   time       object
dtypes: int64(1), object(2)
memory usage: 333.8+ MB


In [6]:
data_test_action = pd.read_csv('archive/test/test.csv', usecols = fields1, nrows = 6472430) # load specific columns from test.csv
data_test_action.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6472430 entries, 0 to 6472429
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   username   int64 
 1   course_id  object
 2   action     object
 3   truth      int64 
dtypes: int64(2), object(2)
memory usage: 197.5+ MB


In [7]:
data_test_time = pd.read_csv('archive/test/test.csv', usecols = fields2, nrows = 6472430) # load specific columns from test.csv
data_test_time.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6472430 entries, 0 to 6472429
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   username   int64 
 1   course_id  object
 2   time       object
dtypes: int64(1), object(2)
memory usage: 148.1+ MB


Data Wrangling and Feature Engineering

In [8]:
data_train_action = pd.get_dummies(data_train_action, columns = ['action']) # Getting dummies of 'action' column to convert 'object' type data into float values
data_test_action = pd.get_dummies(data_test_action, columns = ['action'])

In [9]:
data_train_time['Datetime'] = pd.to_datetime(data_train_time['time']) # Converting 'time' column of 'object' type
data_test_time['Datetime'] = pd.to_datetime(data_test_time['time'])

In [10]:
data_train_time = data_train_time.drop(['time'], axis = 1) # Dropping 'time' column to reduce memory usage
data_test_time = data_test_time.drop(['time'], axis = 1)

In [11]:
gc.collect()

0

In [12]:
data_train_time['timestamp'] = data_train_time.Datetime.values.astype(np.int64) // 10 ** 9 # Converting data from 'datetime' type to timestamp
data_test_time['timestamp'] = data_test_time.Datetime.values.astype(np.int64) // 10 ** 9

In [13]:
data_train_time = data_train_time.drop(['Datetime'], axis = 1) # Dropping 'Datetime' column to reduce memory usage
data_test_time = data_test_time.drop(['Datetime'], axis = 1)

In [14]:
gc.collect()

0

In [15]:
data_train_time['time_difference'] = pd.DataFrame(data_train_time.timestamp.diff()) # Calculating difference in timestamps of consecutive activities
data_test_time['time_difference'] = pd.DataFrame(data_test_time.timestamp.diff())

In [16]:
data_train_time = data_train_time.groupby(['username', 'course_id']).sum() # Grouping data into unique user-course pairs
data_train_time = pd.DataFrame(data_train_time.reset_index())

data_test_time = data_test_time.groupby(['username', 'course_id']).sum()
data_test_time = pd.DataFrame(data_test_time.reset_index())

data_train_action = pd.DataFrame(data_train_action.groupby(['username', 'course_id']).sum())
data_train_action = pd.DataFrame(data_train_action.reset_index())

data_test_action = pd.DataFrame(data_test_action.groupby(['username', 'course_id']).sum())
data_test_action = pd.DataFrame(data_test_action.reset_index())

In [17]:
data_train = pd.merge(data_train_action, data_train_time, left_index = True, right_index = True)
data_train# merging data_train_time and data_train_action into a single dataframe

Unnamed: 0,username_x,course_id_x,truth,action_click_about,action_click_courseware,action_click_forum,action_click_info,action_click_progress,action_close_courseware,action_close_forum,...,action_problem_check_incorrect,action_problem_get,action_problem_save,action_reset_problem,action_seek_video,action_stop_video,username_y,course_id_y,timestamp,time_difference
0,5,TsinghuaX/60510102X/_,11,0,0,0,0,0,1,0,...,0,0,0,0,7,0,5,TsinghuaX/60510102X/_,15891279255,-458434.0
1,5,TsinghuaX/70240183x/2015_T2,43,0,0,0,0,0,3,0,...,0,1,0,0,19,3,5,TsinghuaX/70240183x/2015_T2,62075438024,-467607.0
2,5,TsinghuaX/80511503X/2015_T2,59,0,0,0,0,0,9,0,...,0,0,0,0,0,2,5,TsinghuaX/80511503X/2015_T2,85396836502,780540.0
3,5,TsinghuaX/80515522X/2015_T1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,5,TsinghuaX/80515522X/2015_T1,4333944105,2159964.0
4,5,course-v1:TsinghuaX+00804723X+2016_T1,0,3,2,0,2,0,1,0,...,0,0,0,0,8,0,5,course-v1:TsinghuaX+00804723X+2016_T1,39547264222,-969609.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79972,6909928,course-v1:TsinghuaX+80512073X+2017-T1,24,5,5,0,4,0,1,0,...,0,1,1,0,1,0,6909928,course-v1:TsinghuaX+80512073X+2017-T1,35793721308,1878215.0
79973,6929776,course-v1:TsinghuaX+80000271X+2017_T1,76,2,8,0,2,0,4,0,...,0,0,0,0,30,2,6929776,course-v1:TsinghuaX+80000271X+2017_T1,113372945178,-329748.0
79974,6935761,course-v1:TsinghuaX+80000271X+2017_T1,6,3,0,0,0,0,0,0,...,0,0,0,0,0,0,6935761,course-v1:TsinghuaX+80000271X+2017_T1,8951615687,1738804.0
79975,6937204,course-v1:TsinghuaX+80000271X+2017_T1,160,4,24,0,4,0,12,0,...,1,4,0,0,11,7,6937204,course-v1:TsinghuaX+80000271X+2017_T1,238720827574,1261966.0


In [18]:
del data_train_action
del data_train_time

In [19]:
gc.collect()

0

In [20]:
data_train

Unnamed: 0,username_x,course_id_x,truth,action_click_about,action_click_courseware,action_click_forum,action_click_info,action_click_progress,action_close_courseware,action_close_forum,...,action_problem_check_incorrect,action_problem_get,action_problem_save,action_reset_problem,action_seek_video,action_stop_video,username_y,course_id_y,timestamp,time_difference
0,5,TsinghuaX/60510102X/_,11,0,0,0,0,0,1,0,...,0,0,0,0,7,0,5,TsinghuaX/60510102X/_,15891279255,-458434.0
1,5,TsinghuaX/70240183x/2015_T2,43,0,0,0,0,0,3,0,...,0,1,0,0,19,3,5,TsinghuaX/70240183x/2015_T2,62075438024,-467607.0
2,5,TsinghuaX/80511503X/2015_T2,59,0,0,0,0,0,9,0,...,0,0,0,0,0,2,5,TsinghuaX/80511503X/2015_T2,85396836502,780540.0
3,5,TsinghuaX/80515522X/2015_T1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,5,TsinghuaX/80515522X/2015_T1,4333944105,2159964.0
4,5,course-v1:TsinghuaX+00804723X+2016_T1,0,3,2,0,2,0,1,0,...,0,0,0,0,8,0,5,course-v1:TsinghuaX+00804723X+2016_T1,39547264222,-969609.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79972,6909928,course-v1:TsinghuaX+80512073X+2017-T1,24,5,5,0,4,0,1,0,...,0,1,1,0,1,0,6909928,course-v1:TsinghuaX+80512073X+2017-T1,35793721308,1878215.0
79973,6929776,course-v1:TsinghuaX+80000271X+2017_T1,76,2,8,0,2,0,4,0,...,0,0,0,0,30,2,6929776,course-v1:TsinghuaX+80000271X+2017_T1,113372945178,-329748.0
79974,6935761,course-v1:TsinghuaX+80000271X+2017_T1,6,3,0,0,0,0,0,0,...,0,0,0,0,0,0,6935761,course-v1:TsinghuaX+80000271X+2017_T1,8951615687,1738804.0
79975,6937204,course-v1:TsinghuaX+80000271X+2017_T1,160,4,24,0,4,0,12,0,...,1,4,0,0,11,7,6937204,course-v1:TsinghuaX+80000271X+2017_T1,238720827574,1261966.0


In [21]:
data_test = pd.merge(data_test_action, data_test_time, left_index = True, right_index = True) # merging data_test_time and data_test_action into a single dataframe

In [22]:
del data_test_action
del data_test_time

In [23]:
gc.collect()

0

In [24]:
data_train['truth'] = np.where(data_train['truth'] >= 1, 1,0) # Converting all non-zero values into 1, sine the XGBoost Classifier algorithm requires binary (0 or 1) as 
# input data
"""
data_train['action_click_about'] = np.where(data_train['action_click_about'] >= 1, 1,0)
data_train['action_click_courseware'] = np.where(data_train['action_click_courseware'] >= 1, 1,0)
data_train['action_click_forum'] = np.where(data_train['action_click_forum'] >= 1, 1,0)
data_train['action_click_info'] = np.where(data_train['action_click_info'] >= 1, 1,0)
data_train['action_click_progress'] = np.where(data_train['action_click_progress'] >= 1, 1,0)
data_train['action_close_courseware'] = np.where(data_train['action_close_courseware'] >= 1, 1,0)
data_train['action_delete_comment'] = np.where(data_train['action_delete_comment'] >= 1, 1,0)
data_train['action_load_video'] = np.where(data_train['action_load_video'] >= 1, 1,0)
data_train['action_pause_video'] = np.where(data_train['action_pause_video'] >= 1, 1,0)
data_train['action_play_video'] = np.where(data_train['action_play_video'] >= 1, 1,0)
data_train['action_problem_check_correct'] = np.where(data_train['action_problem_check_correct'] >= 1, 1,0)
data_train['action_problem_get'] = np.where(data_train['action_problem_get'] >= 1, 1,0)
data_train['action_problem_save'] = np.where(data_train['action_problem_save'] >= 1, 1,0)
data_train['action_seek_video'] = np.where(data_train['action_seek_video'] >= 1, 1,0)
"""

"\ndata_train['action_click_about'] = np.where(data_train['action_click_about'] >= 1, 1,0)\ndata_train['action_click_courseware'] = np.where(data_train['action_click_courseware'] >= 1, 1,0)\ndata_train['action_click_forum'] = np.where(data_train['action_click_forum'] >= 1, 1,0)\ndata_train['action_click_info'] = np.where(data_train['action_click_info'] >= 1, 1,0)\ndata_train['action_click_progress'] = np.where(data_train['action_click_progress'] >= 1, 1,0)\ndata_train['action_close_courseware'] = np.where(data_train['action_close_courseware'] >= 1, 1,0)\ndata_train['action_delete_comment'] = np.where(data_train['action_delete_comment'] >= 1, 1,0)\ndata_train['action_load_video'] = np.where(data_train['action_load_video'] >= 1, 1,0)\ndata_train['action_pause_video'] = np.where(data_train['action_pause_video'] >= 1, 1,0)\ndata_train['action_play_video'] = np.where(data_train['action_play_video'] >= 1, 1,0)\ndata_train['action_problem_check_correct'] = np.where(data_train['action_problem_

In [25]:
data_test['truth'] = np.where(data_test['truth'] >= 1, 1,0)
"""
data_test['action_click_about'] = np.where(data_test['action_click_about'] >= 1, 1,0)
data_test['action_click_courseware'] = np.where(data_test['action_click_courseware'] >= 1, 1,0)
data_test['action_click_forum'] = np.where(data_test['action_click_forum'] >= 1, 1,0)
data_test['action_click_info'] = np.where(data_test['action_click_info'] >= 1, 1,0)
data_test['action_click_progress'] = np.where(data_test['action_click_progress'] >= 1, 1,0)
data_test['action_close_courseware'] = np.where(data_test['action_close_courseware'] >= 1, 1,0)
data_test['action_delete_comment'] = np.where(data_test['action_delete_comment'] >= 1, 1,0)
data_test['action_load_video'] = np.where(data_test['action_load_video'] >= 1, 1,0)
data_test['action_pause_video'] = np.where(data_test['action_pause_video'] >= 1, 1,0)
data_test['action_play_video'] = np.where(data_test['action_play_video'] >= 1, 1,0)
data_test['action_problem_check_correct'] = np.where(data_test['action_problem_check_correct'] >= 1, 1,0)
data_test['action_problem_get'] = np.where(data_test['action_problem_get'] >= 1, 1,0)
data_test['action_problem_save'] = np.where(data_test['action_problem_save'] >= 1, 1,0)
data_test['action_seek_video'] = np.where(data_test['action_seek_video'] >= 1, 1,0)
"""

"\ndata_test['action_click_about'] = np.where(data_test['action_click_about'] >= 1, 1,0)\ndata_test['action_click_courseware'] = np.where(data_test['action_click_courseware'] >= 1, 1,0)\ndata_test['action_click_forum'] = np.where(data_test['action_click_forum'] >= 1, 1,0)\ndata_test['action_click_info'] = np.where(data_test['action_click_info'] >= 1, 1,0)\ndata_test['action_click_progress'] = np.where(data_test['action_click_progress'] >= 1, 1,0)\ndata_test['action_close_courseware'] = np.where(data_test['action_close_courseware'] >= 1, 1,0)\ndata_test['action_delete_comment'] = np.where(data_test['action_delete_comment'] >= 1, 1,0)\ndata_test['action_load_video'] = np.where(data_test['action_load_video'] >= 1, 1,0)\ndata_test['action_pause_video'] = np.where(data_test['action_pause_video'] >= 1, 1,0)\ndata_test['action_play_video'] = np.where(data_test['action_play_video'] >= 1, 1,0)\ndata_test['action_problem_check_correct'] = np.where(data_test['action_problem_check_correct'] >= 1, 

In [26]:
data_train.head(10)

Unnamed: 0,username_x,course_id_x,truth,action_click_about,action_click_courseware,action_click_forum,action_click_info,action_click_progress,action_close_courseware,action_close_forum,...,action_problem_check_incorrect,action_problem_get,action_problem_save,action_reset_problem,action_seek_video,action_stop_video,username_y,course_id_y,timestamp,time_difference
0,5,TsinghuaX/60510102X/_,1,0,0,0,0,0,1,0,...,0,0,0,0,7,0,5,TsinghuaX/60510102X/_,15891279255,-458434.0
1,5,TsinghuaX/70240183x/2015_T2,1,0,0,0,0,0,3,0,...,0,1,0,0,19,3,5,TsinghuaX/70240183x/2015_T2,62075438024,-467607.0
2,5,TsinghuaX/80511503X/2015_T2,1,0,0,0,0,0,9,0,...,0,0,0,0,0,2,5,TsinghuaX/80511503X/2015_T2,85396836502,780540.0
3,5,TsinghuaX/80515522X/2015_T1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,5,TsinghuaX/80515522X/2015_T1,4333944105,2159964.0
4,5,course-v1:TsinghuaX+00804723X+2016_T1,0,3,2,0,2,0,1,0,...,0,0,0,0,8,0,5,course-v1:TsinghuaX+00804723X+2016_T1,39547264222,-969609.0
5,5,course-v1:TsinghuaX+70800232X+2016_T1,1,2,8,0,2,0,4,0,...,0,0,0,0,40,0,5,course-v1:TsinghuaX+70800232X+2016_T1,113924515149,1734114.0
6,26,course-v1:TsinghuaX+00690863X+2017_T1,1,2,0,2,2,0,0,0,...,0,0,0,0,0,0,26,course-v1:TsinghuaX+00690863X+2017_T1,11910565689,-718378.0
7,35,course-v1:TsinghuaX+30240184+2015_T2,1,0,2,0,1,0,2,0,...,0,0,0,0,0,0,35,course-v1:TsinghuaX+30240184+2015_T2,11546675773,-2050776.0
8,36,course-v1:TsinghuaX+20220214X_2015_2+2015_T2,1,1,2,0,1,0,0,0,...,0,0,0,0,0,0,36,course-v1:TsinghuaX+20220214X_2015_2+2015_T2,10104347863,1811268.0
9,36,course-v1:TsinghuaX+80512073X_2015_2+2015_T2,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,36,course-v1:TsinghuaX+80512073X_2015_2+2015_T2,4334035214,-1982756.0


In [27]:
data_test.head(10)

Unnamed: 0,username_x,course_id_x,truth,action_click_about,action_click_courseware,action_click_forum,action_click_info,action_click_progress,action_close_courseware,action_create_comment,...,action_problem_check_incorrect,action_problem_get,action_problem_save,action_reset_problem,action_seek_video,action_stop_video,username_y,course_id_y,timestamp,time_difference
0,5,course-v1:Tsinghua+20150001+2015_T2,1,6,6,0,2,0,3,0,...,0,0,0,0,7,0,5,course-v1:Tsinghua+20150001+2015_T2,42081403537,-1186617.0
1,5,course-v1:TsinghuaX+30240184+2015_T2,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,5,course-v1:TsinghuaX+30240184+2015_T2,1443116557,-1288543.0
2,5,course-v1:UQx+Crime101x+_,1,2,23,0,6,0,11,0,...,1,3,0,0,0,2,5,course-v1:UQx+Crime101x+_,126816691568,-358296.0
3,26,TsinghuaX/80511503X/2015_T2,1,0,0,0,0,0,3,0,...,0,0,0,0,0,1,26,TsinghuaX/80511503X/2015_T2,15895867836,-2799802.0
4,32,course-v1:TsinghuaX+30640014+2016_T2,1,2,0,0,2,0,0,0,...,0,0,0,0,0,0,32,course-v1:TsinghuaX+30640014+2016_T2,8806813078,-471991.0
5,55,TsinghuaX/AP000004X/2015_T2,1,0,0,0,0,0,51,4,...,3,140,0,0,3,0,55,TsinghuaX/AP000004X/2015_T2,345415184208,407008.0
6,60,course-v1:TsinghuaX+00690212X+2015_T2,1,0,0,0,0,0,0,0,...,0,6,5,0,0,0,60,course-v1:TsinghuaX+00690212X+2015_T2,15902082368,-1297662.0
7,97,TsinghuaX/00690342X/2015_T1,1,0,0,0,0,0,4,0,...,0,0,0,0,0,0,97,TsinghuaX/00690342X/2015_T1,10099975233,454094.0
8,116,TsinghuaX/10450012X/2015_T2,1,0,0,0,0,0,4,1,...,0,0,0,0,0,0,116,TsinghuaX/10450012X/2015_T2,8655170699,-1212262.0
9,116,TsinghuaX/70240183x/2015_T2,1,0,0,0,0,0,5,0,...,1,7,0,0,2,8,116,TsinghuaX/70240183x/2015_T2,115493170251,-2024793.0


In [28]:
train_length = len(data_train)
print(train_length)

79977


In [29]:
data_train.head(10)

Unnamed: 0,username_x,course_id_x,truth,action_click_about,action_click_courseware,action_click_forum,action_click_info,action_click_progress,action_close_courseware,action_close_forum,...,action_problem_check_incorrect,action_problem_get,action_problem_save,action_reset_problem,action_seek_video,action_stop_video,username_y,course_id_y,timestamp,time_difference
0,5,TsinghuaX/60510102X/_,1,0,0,0,0,0,1,0,...,0,0,0,0,7,0,5,TsinghuaX/60510102X/_,15891279255,-458434.0
1,5,TsinghuaX/70240183x/2015_T2,1,0,0,0,0,0,3,0,...,0,1,0,0,19,3,5,TsinghuaX/70240183x/2015_T2,62075438024,-467607.0
2,5,TsinghuaX/80511503X/2015_T2,1,0,0,0,0,0,9,0,...,0,0,0,0,0,2,5,TsinghuaX/80511503X/2015_T2,85396836502,780540.0
3,5,TsinghuaX/80515522X/2015_T1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,5,TsinghuaX/80515522X/2015_T1,4333944105,2159964.0
4,5,course-v1:TsinghuaX+00804723X+2016_T1,0,3,2,0,2,0,1,0,...,0,0,0,0,8,0,5,course-v1:TsinghuaX+00804723X+2016_T1,39547264222,-969609.0
5,5,course-v1:TsinghuaX+70800232X+2016_T1,1,2,8,0,2,0,4,0,...,0,0,0,0,40,0,5,course-v1:TsinghuaX+70800232X+2016_T1,113924515149,1734114.0
6,26,course-v1:TsinghuaX+00690863X+2017_T1,1,2,0,2,2,0,0,0,...,0,0,0,0,0,0,26,course-v1:TsinghuaX+00690863X+2017_T1,11910565689,-718378.0
7,35,course-v1:TsinghuaX+30240184+2015_T2,1,0,2,0,1,0,2,0,...,0,0,0,0,0,0,35,course-v1:TsinghuaX+30240184+2015_T2,11546675773,-2050776.0
8,36,course-v1:TsinghuaX+20220214X_2015_2+2015_T2,1,1,2,0,1,0,0,0,...,0,0,0,0,0,0,36,course-v1:TsinghuaX+20220214X_2015_2+2015_T2,10104347863,1811268.0
9,36,course-v1:TsinghuaX+80512073X_2015_2+2015_T2,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,36,course-v1:TsinghuaX+80512073X_2015_2+2015_T2,4334035214,-1982756.0


In [30]:
data_train.tail(10)

Unnamed: 0,username_x,course_id_x,truth,action_click_about,action_click_courseware,action_click_forum,action_click_info,action_click_progress,action_close_courseware,action_close_forum,...,action_problem_check_incorrect,action_problem_get,action_problem_save,action_reset_problem,action_seek_video,action_stop_video,username_y,course_id_y,timestamp,time_difference
79967,6900650,course-v1:TsinghuaX+80512073X+2017-T1,0,6,8,0,0,0,4,0,...,7,11,2,5,0,0,6900650,course-v1:TsinghuaX+80512073X+2017-T1,108847731514,1974888.0
79968,6906225,course-v1:TsinghuaX+80000271X+2017_T1,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,6906225,course-v1:TsinghuaX+80000271X+2017_T1,5965050645,968704.0
79969,6908448,course-v1:TsinghuaX+80000271X+2017_T1,0,0,0,0,0,0,0,0,...,2,4,0,0,0,0,6908448,course-v1:TsinghuaX+80000271X+2017_T1,19389857557,1586506.0
79970,6908931,course-v1:TsinghuaX+80512073X+2017-T1,0,2,14,4,10,4,6,0,...,0,0,0,0,2,2,6908931,course-v1:TsinghuaX+80512073X+2017-T1,104394430315,119785.0
79971,6909756,course-v1:TsinghuaX+80000271X+2017_T1,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,6909756,course-v1:TsinghuaX+80000271X+2017_T1,5965604051,615124.0
79972,6909928,course-v1:TsinghuaX+80512073X+2017-T1,1,5,5,0,4,0,1,0,...,0,1,1,0,1,0,6909928,course-v1:TsinghuaX+80512073X+2017-T1,35793721308,1878215.0
79973,6929776,course-v1:TsinghuaX+80000271X+2017_T1,1,2,8,0,2,0,4,0,...,0,0,0,0,30,2,6929776,course-v1:TsinghuaX+80000271X+2017_T1,113372945178,-329748.0
79974,6935761,course-v1:TsinghuaX+80000271X+2017_T1,1,3,0,0,0,0,0,0,...,0,0,0,0,0,0,6935761,course-v1:TsinghuaX+80000271X+2017_T1,8951615687,1738804.0
79975,6937204,course-v1:TsinghuaX+80000271X+2017_T1,1,4,24,0,4,0,12,0,...,1,4,0,0,11,7,6937204,course-v1:TsinghuaX+80000271X+2017_T1,238720827574,1261966.0
79976,6939280,course-v1:TsinghuaX+80000271X+2017_T1,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,6939280,course-v1:TsinghuaX+80000271X+2017_T1,4476239606,1524231.0


In [31]:
data_train1 = data_train.loc[:int(train_length/2)] # Splitting data_train into two halves to make training efficient

In [32]:
data_train1.head(10)

Unnamed: 0,username_x,course_id_x,truth,action_click_about,action_click_courseware,action_click_forum,action_click_info,action_click_progress,action_close_courseware,action_close_forum,...,action_problem_check_incorrect,action_problem_get,action_problem_save,action_reset_problem,action_seek_video,action_stop_video,username_y,course_id_y,timestamp,time_difference
0,5,TsinghuaX/60510102X/_,1,0,0,0,0,0,1,0,...,0,0,0,0,7,0,5,TsinghuaX/60510102X/_,15891279255,-458434.0
1,5,TsinghuaX/70240183x/2015_T2,1,0,0,0,0,0,3,0,...,0,1,0,0,19,3,5,TsinghuaX/70240183x/2015_T2,62075438024,-467607.0
2,5,TsinghuaX/80511503X/2015_T2,1,0,0,0,0,0,9,0,...,0,0,0,0,0,2,5,TsinghuaX/80511503X/2015_T2,85396836502,780540.0
3,5,TsinghuaX/80515522X/2015_T1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,5,TsinghuaX/80515522X/2015_T1,4333944105,2159964.0
4,5,course-v1:TsinghuaX+00804723X+2016_T1,0,3,2,0,2,0,1,0,...,0,0,0,0,8,0,5,course-v1:TsinghuaX+00804723X+2016_T1,39547264222,-969609.0
5,5,course-v1:TsinghuaX+70800232X+2016_T1,1,2,8,0,2,0,4,0,...,0,0,0,0,40,0,5,course-v1:TsinghuaX+70800232X+2016_T1,113924515149,1734114.0
6,26,course-v1:TsinghuaX+00690863X+2017_T1,1,2,0,2,2,0,0,0,...,0,0,0,0,0,0,26,course-v1:TsinghuaX+00690863X+2017_T1,11910565689,-718378.0
7,35,course-v1:TsinghuaX+30240184+2015_T2,1,0,2,0,1,0,2,0,...,0,0,0,0,0,0,35,course-v1:TsinghuaX+30240184+2015_T2,11546675773,-2050776.0
8,36,course-v1:TsinghuaX+20220214X_2015_2+2015_T2,1,1,2,0,1,0,0,0,...,0,0,0,0,0,0,36,course-v1:TsinghuaX+20220214X_2015_2+2015_T2,10104347863,1811268.0
9,36,course-v1:TsinghuaX+80512073X_2015_2+2015_T2,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,36,course-v1:TsinghuaX+80512073X_2015_2+2015_T2,4334035214,-1982756.0


In [33]:
data_train1.tail(10)

Unnamed: 0,username_x,course_id_x,truth,action_click_about,action_click_courseware,action_click_forum,action_click_info,action_click_progress,action_close_courseware,action_close_forum,...,action_problem_check_incorrect,action_problem_get,action_problem_save,action_reset_problem,action_seek_video,action_stop_video,username_y,course_id_y,timestamp,time_difference
39979,1084862,course-v1:TsinghuaX+10421094X_2015_2+2015_T2,1,0,6,0,2,0,6,0,...,0,0,0,0,49,8,1084862,course-v1:TsinghuaX+10421094X_2015_2+2015_T2,127092438710,-1261788.0
39980,1084862,course-v1:TsinghuaX+10610183X_2015_T2+2015_T2,0,1,73,0,8,0,72,0,...,0,64,0,0,96,4602,1084862,course-v1:TsinghuaX+10610183X_2015_T2+2015_T2,7291541289380,-579573.0
39981,1084862,course-v1:TsinghuaX+10610224X+2017_T1,1,6,10,0,2,0,4,0,...,0,0,0,0,0,3,1084862,course-v1:TsinghuaX+10610224X+2017_T1,67068053861,486928.0
39982,1084862,course-v1:TsinghuaX+20120143X_2015_T2+2015_T2,1,0,4,0,1,0,4,0,...,0,0,0,0,20,2,1084862,course-v1:TsinghuaX+20120143X_2015_T2+2015_T2,59209333943,140292.0
39983,1084994,course-v1:TsinghuaX+10430494X_2015_2+2015_T2,1,1,9,1,2,2,7,0,...,3,33,1,0,2,4,1084994,course-v1:TsinghuaX+10430494X_2015_2+2015_T2,154477285827,-692411.0
39984,1084994,course-v1:TsinghuaX+20220053X_2015_T2+2015_T2,1,1,6,0,1,0,6,0,...,3,13,6,0,1,1,1084994,course-v1:TsinghuaX+20220053X_2015_T2+2015_T2,103917127091,596072.0
39985,1084994,course-v1:TsinghuaX+20250064+2015_T2,1,2,21,0,6,0,8,0,...,7,21,0,0,3,4,1084994,course-v1:TsinghuaX+20250064+2015_T2,183683761819,-1714754.0
39986,1085115,course-v1:TsinghuaX+10610183X_2015_T2+2015_T2,0,3,156,3,4,7,124,0,...,0,117,0,0,45,183,1085115,course-v1:TsinghuaX+10610183X_2015_T2+2015_T2,1292031134831,1085749.0
39987,1085156,course-v1:TsinghuaX+10610183X_2015_T2+2015_T2,0,2,47,0,7,4,35,0,...,0,59,17,0,25,696,1085156,course-v1:TsinghuaX+10610183X_2015_T2+2015_T2,1443386405132,-703603.0
39988,1085156,course-v1:TsinghuaX+30640014+2016_T2,1,26,301,2,25,16,147,0,...,7,153,33,0,29,78,1085156,course-v1:TsinghuaX+30640014+2016_T2,2023459083557,237678.0


In [34]:
data_train2 = data_train.loc[(int(train_length/2) + 1):]

In [35]:
data_train2.head(10)

Unnamed: 0,username_x,course_id_x,truth,action_click_about,action_click_courseware,action_click_forum,action_click_info,action_click_progress,action_close_courseware,action_close_forum,...,action_problem_check_incorrect,action_problem_get,action_problem_save,action_reset_problem,action_seek_video,action_stop_video,username_y,course_id_y,timestamp,time_difference
39989,1085393,course-v1:TsinghuaX+10610224X+2017_T1,1,0,4,0,0,0,1,0,...,1,1,2,0,1,0,1085393,course-v1:TsinghuaX+10610224X+2017_T1,25313070242,1302127.0
39990,1085628,TsinghuaX/20740042X/2015_T2,1,0,0,0,0,0,13,0,...,0,6,0,0,8,8,1085628,TsinghuaX/20740042X/2015_T2,93827704875,-1240561.0
39991,1085628,course-v1:TsinghuaX+00670122X+2016_T1,1,0,10,0,2,0,5,0,...,0,0,0,0,0,3,1085628,course-v1:TsinghuaX+00670122X+2016_T1,55664749581,1518220.0
39992,1085628,course-v1:TsinghuaX+20440333_2015X+2015_T2,1,2,3,1,4,0,3,0,...,0,2,0,0,0,1,1085628,course-v1:TsinghuaX+20440333_2015X+2015_T2,36089215477,615325.0
39993,1085642,course-v1:TsinghuaX+00680082_1X+2016_T2,1,3,4,3,9,0,1,0,...,0,2,0,0,0,0,1085642,course-v1:TsinghuaX+00680082_1X+2016_T2,39830996834,-337330.0
39994,1085642,course-v1:TsinghuaX+10610224X+2017_T1,1,4,80,0,4,0,35,0,...,1,2,0,0,0,22,1085642,course-v1:TsinghuaX+10610224X+2017_T1,376960593575,-583863.0
39995,1085654,course-v1:TsinghuaX+00680082_1X+2016_T1,0,2,179,6,4,14,88,0,...,2,152,0,0,5,22,1085654,course-v1:TsinghuaX+00680082_1X+2016_T1,1236173251029,-1306989.0
39996,1085654,course-v1:TsinghuaX+00690342X+2016_T1,1,6,101,0,4,4,48,0,...,0,32,0,0,38,18,1085654,course-v1:TsinghuaX+00690342X+2016_T1,1082220725309,-692793.0
39997,1085654,course-v1:TsinghuaX+10610183X_2015_T2+2015_T2,0,15,80,6,14,8,45,0,...,3,35,0,0,8,26,1085654,course-v1:TsinghuaX+10610183X_2015_T2+2015_T2,679984226460,-1917441.0
39998,1085654,course-v1:TsinghuaX+10610224X+2017_T1,1,8,91,0,8,2,45,0,...,0,2,0,0,0,32,1085654,course-v1:TsinghuaX+10610224X+2017_T1,495774239107,1002984.0


In [36]:
data_train2.tail(10)

Unnamed: 0,username_x,course_id_x,truth,action_click_about,action_click_courseware,action_click_forum,action_click_info,action_click_progress,action_close_courseware,action_close_forum,...,action_problem_check_incorrect,action_problem_get,action_problem_save,action_reset_problem,action_seek_video,action_stop_video,username_y,course_id_y,timestamp,time_difference
79967,6900650,course-v1:TsinghuaX+80512073X+2017-T1,0,6,8,0,0,0,4,0,...,7,11,2,5,0,0,6900650,course-v1:TsinghuaX+80512073X+2017-T1,108847731514,1974888.0
79968,6906225,course-v1:TsinghuaX+80000271X+2017_T1,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,6906225,course-v1:TsinghuaX+80000271X+2017_T1,5965050645,968704.0
79969,6908448,course-v1:TsinghuaX+80000271X+2017_T1,0,0,0,0,0,0,0,0,...,2,4,0,0,0,0,6908448,course-v1:TsinghuaX+80000271X+2017_T1,19389857557,1586506.0
79970,6908931,course-v1:TsinghuaX+80512073X+2017-T1,0,2,14,4,10,4,6,0,...,0,0,0,0,2,2,6908931,course-v1:TsinghuaX+80512073X+2017-T1,104394430315,119785.0
79971,6909756,course-v1:TsinghuaX+80000271X+2017_T1,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,6909756,course-v1:TsinghuaX+80000271X+2017_T1,5965604051,615124.0
79972,6909928,course-v1:TsinghuaX+80512073X+2017-T1,1,5,5,0,4,0,1,0,...,0,1,1,0,1,0,6909928,course-v1:TsinghuaX+80512073X+2017-T1,35793721308,1878215.0
79973,6929776,course-v1:TsinghuaX+80000271X+2017_T1,1,2,8,0,2,0,4,0,...,0,0,0,0,30,2,6929776,course-v1:TsinghuaX+80000271X+2017_T1,113372945178,-329748.0
79974,6935761,course-v1:TsinghuaX+80000271X+2017_T1,1,3,0,0,0,0,0,0,...,0,0,0,0,0,0,6935761,course-v1:TsinghuaX+80000271X+2017_T1,8951615687,1738804.0
79975,6937204,course-v1:TsinghuaX+80000271X+2017_T1,1,4,24,0,4,0,12,0,...,1,4,0,0,11,7,6937204,course-v1:TsinghuaX+80000271X+2017_T1,238720827574,1261966.0
79976,6939280,course-v1:TsinghuaX+80000271X+2017_T1,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,6939280,course-v1:TsinghuaX+80000271X+2017_T1,4476239606,1524231.0


In [37]:
del data_train

In [38]:
gc.collect()

0

Setting Features and Labels from dataframes for Training

In [39]:
train_labels1 = data_train1['truth']
train_features1 = data_train1[['timestamp', 'time_difference',
                            'action_click_about', 'action_click_courseware', 'action_click_forum', 
                             'action_click_info', 'action_click_progress', 'action_close_courseware', 
                             'action_delete_comment', 'action_load_video', 'action_pause_video', 'action_play_video', 'action_problem_check_correct',
                             'action_problem_get', 'action_problem_save', 'action_seek_video']]

x_train1 = train_features1
y_train1 = np.ravel(train_labels1)

In [40]:
train_labels2 = data_train2['truth']
train_features2 = data_train2[['timestamp', 'time_difference',
                            'action_click_about', 'action_click_courseware', 'action_click_forum', 
                             'action_click_info', 'action_click_progress', 'action_close_courseware', 
                             'action_delete_comment', 'action_load_video', 'action_pause_video', 'action_play_video', 'action_problem_check_correct',
                             'action_problem_get', 'action_problem_save', 'action_seek_video']]

x_train2 = train_features2
y_train2 = np.ravel(train_labels2)

In [41]:
test_length = len(data_test)
print(test_length)

34343


In [42]:
data_test1 = data_test.loc[:int(test_length/4)] # Splitting data_train to make testing efficient

Setting Features and Labels from dataframes for Testing

In [43]:
test_labels1 = data_test1['truth']
test_features1 = data_test1[['timestamp', 'time_difference',
                            'action_click_about', 'action_click_courseware', 'action_click_forum', 
                             'action_click_info', 'action_click_progress', 'action_close_courseware', 
                             'action_delete_comment', 'action_load_video', 'action_pause_video', 'action_play_video', 'action_problem_check_correct',
                             'action_problem_get', 'action_problem_save', 'action_seek_video']]

x_test1 = test_features1
y_test1 = np.ravel(test_labels1)

Creating and Training a XGBClassifier() binary model

In [44]:
model1 = xgb.XGBClassifier(
    tree_method = "hist", device = "cuda"  # THE MAGICAL PARAMETER THAT INTEGRATES KAGGLE'S GPU ACCELERATED KERNEL
)
%time model1.fit(x_train1, y_train1) # Fitting the data into the model

CPU times: total: 1.41 s
Wall time: 626 ms


In [45]:
# model1.save_model('model1.model')

Testing the trained model

In [46]:
%time y_pred1 = model1.predict(x_test1)
accuracy1 = accuracy_score(y_test1, y_pred1)
print("Model 1 Accuracy: %.2f%%" % (accuracy1 * 100.0))

CPU times: total: 62.5 ms
Wall time: 12 ms
Model 1 Accuracy: 82.42%


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [47]:
'''model2 = xgb.XGBClassifier()
model2.fit(x_train2, y_train2)
y_pred2 = model2.predict(x_test2)
accuracy2 = accuracy_score(y_test2, predictions2)
print("Model 2 Accuracy: %.2f%%" % (accuracy2 * 100.0))'''

'model2 = xgb.XGBClassifier()\nmodel2.fit(x_train2, y_train2)\ny_pred2 = model2.predict(x_test2)\naccuracy2 = accuracy_score(y_test2, predictions2)\nprint("Model 2 Accuracy: %.2f%%" % (accuracy2 * 100.0))'

In [48]:
# model2_update = 

In [49]:
'''y_pred2_update = model2_update.predict(x_test2)
accuracy2 = accuracy_score(y_test2, predictions2)
print("Model 2 Accuracy: %.2f%%" % (accuracy2 * 100.0))'''

'y_pred2_update = model2_update.predict(x_test2)\naccuracy2 = accuracy_score(y_test2, predictions2)\nprint("Model 2 Accuracy: %.2f%%" % (accuracy2 * 100.0))'

In [50]:
'''model_loaded = xgb.XGBClassifier()
booster = xgb.Booster()
booster.load_model('../input/mooc-final/model1.model')
model_loaded._Booster = booster

%time y_pred1 = model_loaded.predict(x_test1) '''

"model_loaded = xgb.XGBClassifier()\nbooster = xgb.Booster()\nbooster.load_model('../input/mooc-final/model1.model')\nmodel_loaded._Booster = booster\n\n%time y_pred1 = model_loaded.predict(x_test1) "