    Team ID:16  Sem:V  Div:A  Dept:Computer Science
    Title ID:5DMACP03       
    Project Title:Riiid! Answer Correctness Prediction

    Problem Statement: Analyze the information a complete education app would have i.e. a student's historic performance, the performance of other students on the same question, metadata about the question, etc. and predict whether students are able to answer their next questions correctly. 

        Names              USN            Roll Number
        ATUL KUMAR         01FE18BCS056     156		
        ISHA BHANDARY	  01FE18BCS063     163
        DEEPIKA KULKARNI   01FE18BCS070     170
        ARPITA MATTIHAL	  01FE18BCS048     148



In [1]:
#Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [2]:
#Loading the csv files
dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "int8",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32", 
    "prior_question_had_explanation": "boolean"
}

train = pd.read_csv('./DMA/train.csv',dtype=dtypes,nrows=10**6)
lectures = pd.read_csv('./DMA/lectures.csv')
questions = pd.read_csv('./DMA/questions.csv')
test = pd.read_csv('./DMA/example_test.csv')

<h3>DATA CLEANING</h3>

In [3]:
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False


In [4]:
#Checking the null values in lectures
(lectures.isnull().sum()/lectures.shape[0])*100

lecture_id    0.0
tag           0.0
part          0.0
type_of       0.0
dtype: float64

In [5]:
#Checking the null values in train
(train.isnull().sum()/train.shape[0])*100

row_id                            0.0000
timestamp                         0.0000
user_id                           0.0000
content_id                        0.0000
content_type_id                   0.0000
task_container_id                 0.0000
user_answer                       0.0000
answered_correctly                0.0000
prior_question_elapsed_time       2.3723
prior_question_had_explanation    0.3816
dtype: float64

In [6]:
#Checking the null values in questions
(questions.isnull().sum()/questions.shape[0])*100

question_id       0.000000
bundle_id         0.000000
correct_answer    0.000000
part              0.000000
tags              0.007395
dtype: float64

In [7]:
#Finding the question with no tags
questions[questions.tags.isnull()]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
10033,10033,10033,2,6,


In [8]:
#Filling the missing tag value with the 6 most frequent tags that appear with part 6
List=questions.query(‘part==6’)
tag_n=list.tag.value_counts()[:6]
Print(tag_n)
questions.tags.fillna('27 53 73 1 179 96',inplace=True)

<b>Filling the missing values in prior question elapsed time with -1.
Here, the value could be null if it is the user's first interaction or in case the current or previous interaction is a lecture.

In [9]:
train.prior_question_elapsed_time.replace(np.nan,-1,inplace=True)
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,-1.0,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False


<b>Filling the missing values in prior question had explanation with false.
Here, the value could be null if it is the user's first interaction.

In [10]:
train.prior_question_had_explanation.replace(np.nan,'False',inplace=True)
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,-1.0,False
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False


<h3>Data Transformation and Feature Extraction</h3>

<b>Discretization for prior_question_elapsed_time

In [11]:
train['prior_question_elapsed_time']=pd.cut(train.prior_question_elapsed_time,bins=[0,15000,30000,50000,500000], labels=['very quick','quick','moderate','slow'])
train.prior_question_elapsed_time.replace(np.nan,-1,inplace=True)
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,False
1,1,56943,115,5716,0,2,2,1,moderate,False
2,2,118363,115,128,0,0,0,1,slow,False
3,3,131167,115,7860,0,3,0,1,quick,False
4,4,137965,115,7922,0,4,1,1,very quick,False


<b>Ordinal encoding for prior_question_elapsed_time

In [15]:
import category_encoders as ce

# create object of Ordinalencoding
encoder= ce.OrdinalEncoder(cols=['prior_question_elapsed_time'])
train.prior_question_elapsed_time = encoder.fit_transform(train)
train.prior_question_elapsed_time.replace(5,-1,inplace=True)
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,0,False
1,1,56943,115,5716,0,2,2,1,1,False
2,2,118363,115,128,0,0,0,1,2,False
3,3,131167,115,7860,0,3,0,1,3,False
4,4,137965,115,7922,0,4,1,1,4,False


<b>Changing the datatype of tags in questions.csv to string and then splitting into 6 columns:tag1,tag2,tag3,tag4,tag5,tag6

In [17]:
questions.tags=questions.tags.astype(str)
questions[['Tag1', 'Tag2','Tag3','Tag4','Tag5','Tag6']] = questions['tags'].apply(lambda x: pd.Series(x.split(' ')))
questions.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6
0,0,0,0,1,51 131 162 38,51,131,162,38.0,,
1,1,1,1,1,131 36 81,131,36,81,,,
2,2,2,0,1,131 101 162 92,131,101,162,92.0,,
3,3,3,0,1,131 149 162 29,131,149,162,29.0,,
4,4,4,3,1,131 5 162 38,131,5,162,38.0,,


<b>Changing the datatype of prior_question_had_explanation from bool to int

In [19]:
train.prior_question_had_explanation=(train['prior_question_had_explanation'] == 'TRUE').astype(np.int8)
train.head()

  res_values = method(rvalues)


Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,0,0
1,1,56943,115,5716,0,2,2,1,1,0
2,2,118363,115,128,0,0,0,1,2,0
3,3,131167,115,7860,0,3,0,1,3,0
4,4,137965,115,7922,0,4,1,1,4,0


<h3>Data Reduction</h3>

In [24]:
questions.drop(columns=['correct_answer','bundle_id','tags'],axis=1,inplace=True)
questions.head()

Unnamed: 0,question_id,part,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6
0,0,1,51,131,162,38.0,,
1,1,1,131,36,81,,,
2,2,1,131,101,162,92.0,,
3,3,1,131,149,162,29.0,,
4,4,1,131,5,162,38.0,,


In [22]:
lectures.drop(columns=['type_of'],axis=1,inplace=True)
lectures.head()

Unnamed: 0,lecture_id,tag,part
0,89,159,5
1,100,70,1
2,185,45,6
3,192,79,5
4,317,156,5


In [26]:
train.drop(columns=['user_answer','task_container_id','row_id'],axis=1,inplace=True)
train.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,115,5692,0,1,0,0
1,56943,115,5716,0,1,1,0
2,118363,115,128,0,1,2,0
3,131167,115,7860,0,1,3,0
4,137965,115,7922,0,1,4,0


<h3>Data Integration</h3>

In [40]:
#creating lecture_id and question_id in train to merge with lectures and questions
q_id=[]
l_id=[]
i=0
for typec in train.content_type_id:
    if(typec==0):
        q_id.append(train.content_id[i])
        l_id.append(np.nan)
    else:
        q_id.append(np.nan)
        l_id.append(train.content_id[i])
    i=i+1

In [41]:
train['lecture_id']=l_id
train['question_id']=q_id

In [42]:
#merge questions and lectures with train
tqmerge=pd.merge(train,questions,how='left', on='question_id')

In [43]:
tlmerge=pd.merge(tqmerge,lectures,how='left', on='lecture_id')

In [44]:
tlmerge.columns

Index(['timestamp', 'user_id', 'content_id', 'content_type_id',
       'answered_correctly', 'prior_question_elapsed_time',
       'prior_question_had_explanation', 'lecture_id', 'question_id', 'part_x',
       'Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5', 'Tag6', 'tag', 'part_y'],
      dtype='object')

In [45]:
tlmerge.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,question_id,part_x,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6,tag,part_y
0,0,115,5692,0,1,0,0,,5692.0,5.0,151,,,,,,,
1,56943,115,5716,0,1,1,0,,5716.0,5.0,168,,,,,,,
2,118363,115,128,0,1,2,0,,128.0,1.0,131,149.0,92.0,,,,,
3,131167,115,7860,0,1,3,0,,7860.0,1.0,131,104.0,81.0,,,,,
4,137965,115,7922,0,1,4,0,,7922.0,1.0,131,149.0,92.0,,,,,


<b>Combining question part and lecture part into same column and also question tag 1 and lecture tag

In [47]:
tlmerge.part_x.replace(np.nan,' ',inplace=True)
tlmerge.part_y.replace(np.nan,' ',inplace=True)
tlmerge.Tag1.replace(np.nan,' ',inplace=True)
tlmerge.tag.replace(np.nan,' ',inplace=True)
tlmerge['part']= tlmerge["part_x"].astype(str) +" "+ tlmerge['part_y'].astype(str)
tlmerge['Tag1']= tlmerge["Tag1"].astype(str) +" "+ tlmerge['tag'].astype(str)

In [49]:
#drop the redundant columns
tlmerge.drop(columns=['tag','part_x','part_y','lecture_id','question_id'],axis=1, inplace=True)
tlmerge.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6,part
0,0,115,5692,0,1,0,0,151,,,,,,5.0
1,56943,115,5716,0,1,1,0,168,,,,,,5.0
2,118363,115,128,0,1,2,0,131,149.0,92.0,,,,1.0
3,131167,115,7860,0,1,3,0,131,104.0,81.0,,,,1.0
4,137965,115,7922,0,1,4,0,131,149.0,92.0,,,,1.0


In [54]:
tlmerge.Tag1.replace(np.nan,-1,inplace=True)
tlmerge.Tag2.replace(np.nan,-1,inplace=True)
tlmerge.Tag3.replace(np.nan,-1,inplace=True)
tlmerge.Tag4.replace(np.nan,-1,inplace=True)
tlmerge.Tag5.replace(np.nan,-1,inplace=True)
tlmerge.Tag6.replace(np.nan,-1,inplace=True)

In [56]:
tlmerge.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6,part
0,0,115,5692,0,1,0,0,151,-1,-1,-1,-1,-1,5.0
1,56943,115,5716,0,1,1,0,168,-1,-1,-1,-1,-1,5.0
2,118363,115,128,0,1,2,0,131,149,92,-1,-1,-1,1.0
3,131167,115,7860,0,1,3,0,131,104,81,-1,-1,-1,1.0
4,137965,115,7922,0,1,4,0,131,149,92,-1,-1,-1,1.0
