### Importing Required Packages

In [1]:
import os
import pandas as pd
import matplotlib as plt
import numpy as np

### Read In Lecture Data

In [42]:
# Load the lecture table from disk, report its (rows, columns) shape,
# and preview the first five rows.
lect_data = pd.read_csv("Data/contents/lectures.csv")
print(lect_data.shape)
lect_data.head(n=5)

(1021, 5)


Unnamed: 0,lecture_id,part,tags,video_length,deployed_at
0,l520,5,142,-1,-1
1,l592,6,142,-1,-1
2,l1259,1,222,359000,1570424729123
3,l1260,1,220,487000,1570424738105
4,l1261,1,221,441000,1570424743162


#### Removing Invalid Values from the Dataset

In [59]:
# Feature selection on lect_data:
# drop every row whose tag equals -1 and keep only the three columns
# lecture_id, part and tags, in a single .loc selection.
lect_data = lect_data.loc[lect_data['tags'] != -1, ['lecture_id', 'part', 'tags']]
print(lect_data.shape)
lect_data.head(n=5)

(584, 3)


Unnamed: 0,lecture_id,part,tags
0,l520,5,142
1,l592,6,142
2,l1259,1,222
3,l1260,1,220
4,l1261,1,221


In [60]:
# Smallest and largest tag id found in the filtered lecture data.
# These bounds drive the later loop that matches question_ids with tags.
smallest_tag, largest_tag = lect_data['tags'].min(), lect_data['tags'].max()
print(f"Largest tag: {largest_tag}\n")
print(f"Smallest tag: {smallest_tag}\n")

Largest tag: 298

Smallest tag: 1



### Reading in Question Data

In [53]:
# Load the question table; surrounding white space is stripped from every
# column header as part of the load so later column lookups match exactly.
question_data = pd.read_csv("Data/contents/questions.csv").rename(columns=str.strip)
print(question_data.shape)
question_data.head(n=5)

(13169, 7)


Unnamed: 0,question_id,bundle_id,explanation_id,correct_answer,part,tags,deployed_at
0,q1,b1,e1,b,1,1;2;179;181,1558090000000.0
1,q2,b2,e2,a,1,15;2;182,1558090000000.0
2,q3,b3,e3,b,1,14;2;179;183,1558090000000.0
3,q4,b4,e4,b,1,9;2;179;184,1558090000000.0
4,q5,b5,e5,c,1,8;2;179;181,1558090000000.0


In [54]:
# Narrow the question table to the four columns used from here on.
question_data = question_data[['question_id', 'explanation_id', 'part', 'tags']]
question_data.head(n=5)

Unnamed: 0,question_id,explanation_id,part,tags
0,q1,e1,1,1;2;179;181
1,q2,e2,1,15;2;182
2,q3,e3,1,14;2;179;183
3,q4,e4,1,9;2;179;184
4,q5,e5,1,8;2;179;181


## Combining Data Sets
#### Steps:
    1. Loop through question_data and create a new dataframe
        - the new dataframe will separate each tag into its own row and hold a list of the questions associated with that tag
        - tag: 2 -> question_id: q1, q2, etc...
    2. The new dataframe can then be compared with what the students studied and their answers to the questions
    
    Goal:
    Find the average question score for each tag
        - might be able to provide more useful info for the model to learn from, since it could show that some 
          tags have more difficult questions than others

#### Reading in combined_data Set

In [55]:
# Load the combined dataset from disk and preview the first five rows.
combined_data = pd.read_csv("Data/combined_dataset.csv")
combined_data.head(n=5)

Unnamed: 0,student_id,question_id,bundle_id,tags,elapsed_time,correct
0,1,7,7,11;7;179;183,26000,1.0
1,1,7,7,11;7;179;183,37000,0.0
2,1,10,10,17;7;182,28000,0.0
3,1,10,10,17;7;182,26000,0.0
4,1,11,11,15;2;181,42000,0.0


#### Step 1: Combining the tags with the question_ids

In [70]:
# Build the tag -> question_ids lookup table: one row per tag id, with an
# empty (None) placeholder for the question ids to be attached to that tag.
#
# range() excludes its stop value, so the upper bound is largest_tag + 1;
# without the +1 the table would have no row for largest_tag itself.
tags = list(range(smallest_tag, largest_tag + 1))
question_ids = [None] * len(tags)
lec_ques = pd.DataFrame({'tag': tags, 'question_ids': question_ids})

lec_ques.head()

Unnamed: 0,tag,question_ids
0,1,
1,2,
2,3,
3,4,
4,5,
