# Video Processing
This notebook is used for generating weekly video statistics for each person.

## Getting learners' information about video interactions from DelftX database
We use following SQL statements getting information about video interactions of all passers. The generated table is named as "CTB3365STx_1T2016_pass_videointeractions.csv"

```sql
SELECT 
	video_interaction.course_learner_id AS course_learner_id,
    video_interaction.video_id AS video_id,
    video_interaction.duration AS duration,
    video_interaction.duration_backward_seek AS duration_backward_seek,
    video_interaction.duration_forward_seek AS duration_forward_seek,
    video_interaction.duration_pause AS duration_pause,
    video_interaction.times_backward_seek AS times_backward_seek,
    video_interaction.times_forward_seek AS times_forward_seek,
    video_interaction.times_pause AS times_pause,
    video_interaction.times_speed_down AS times_speed_down,
    video_interaction.times_speed_up AS times_speed_up,
    video_interaction.start_time AS start_time,
    video_interaction.end_time AS end_time
FROM 
	DelftX2.video_interaction AS video_interaction
	JOIN DelftX2.learner_index AS learner_index
	ON learner_index.course_learner_id = video_interaction.course_learner_id
WHERE # select all the pass user
	video_interaction.course_learner_id IN (
		SELECT 
			course_learner_id 
		FROM 
			DelftX2.course_learner
		WHERE 
			DelftX2.course_learner.certificate_status <> "notpassing")
    AND learner_index.course_id = "course-v1:DelftX+CTB3365STx+1T2016"
```

In [57]:
import pandas as pd
import numpy as np
import os

## Read .csv files 
Read related csv files generated by previous SQL query and the first series of notebook (1_2_Learner_Grouping.ipynb)

In [58]:
# OutputFolder
outputfolder = "video_interaction/"
if not os.path.exists(outputfolder):
    os.makedirs(outputfolder)

# EX101x relevant csv
video_interactions = "EX101x_3T2015_pass_videointeractions.csv"
course_elements = "CourseElements_FPEX.csv"
learner_group = "EX101x_3T2015_group.csv"
totol_videolength_byweek = [1238, 1517, 1836, 1176, 1503, 2006, 1873, 1255]

# # FP101x relevant csv
# video_interactions = "FP101x_3T2015_pass_videointeractions.csv"
# course_elements = "CourseElements_FPEX.csv"
# learner_group = "FP101x_3T2015_group.csv"
# totol_videolength_byweek = [2950, 4625, 4105, 2957, 4384, 5326, 1705, 252]

# # CTB3365STx
# video_interactions = "CTB3365STx_1T2016_pass_videointeractions.csv"
# course_elements = "CTB3365STx_1T2016_chapter_split.csv"
# learner_group = "CTB3365STx_1T2016_group.csv"
# totol_videolength_byweek = [789, 8212, 5268, 6358, 5928, 8496, 8360, 457]

# # RI101x
# video_interactions = "RI101x_1T2016_pass_videointeractions.csv"
# course_elements = "RI101x_1T2016_chapter_split.csv"
# learner_group = "RI101x_1T2016_group.csv"
# totol_videolength_byweek = [3721, 4409, 5622, 5615, 2688, 3280, 4046]

In [59]:
df_video_interactions = pd.read_csv(video_interactions)
# df_video_interactions.rename(columns={'question_id': 'element_id'}, inplace=True)
print df_video_interactions.head(10)

                     course_learner_id                          video_id  \
0  course-v1:DelftX+EX101x+3T2015_2316  200ef1f3dc5f48fb91c9b2fba9e2b7fd   
1  course-v1:DelftX+EX101x+3T2015_2316  954766b2836947669134f49b298799a7   
2  course-v1:DelftX+EX101x+3T2015_2316  72826337d06640aabd57611b72e0dabb   
3  course-v1:DelftX+EX101x+3T2015_2316  bc2166d1dfa6403fada43743622abf2d   
4  course-v1:DelftX+EX101x+3T2015_2316  2570b2af894540b0b2814075f13c2a53   
5  course-v1:DelftX+EX101x+3T2015_2316  2a50d7e9c1f84b97ba062b028831adef   
6  course-v1:DelftX+EX101x+3T2015_2316  b7ebf89db2434d7eb96df0d85f380883   
7  course-v1:DelftX+EX101x+3T2015_2316  80b9a1591cbe449aba76440f36ab4484   
8  course-v1:DelftX+EX101x+3T2015_2316  107df0dc581442aeb1274c79bf950c96   
9  course-v1:DelftX+EX101x+3T2015_2316  bed1c055dbff49c0a2b60967355a4a4b   

   duration  duration_backward_seek  duration_forward_seek  duration_pause  \
0       100                   0.000                  0.000               0   
1      

## Merge and aggregate durations of video interactions by week for each learner

In [60]:
# Get course id
course_id = df_video_interactions["course_learner_id"][0].split("_")[0].split(":")[1]
# print course_id
# change the format of video id
df_video_interactions["video_id"] = df_video_interactions["video_id"].apply(lambda x: "block-v1:" + course_id + "+type@video+block@" + x)
# print df_video_interactions.head(10)
# change the column name for merging in the next step
df_video_interactions.rename(columns={'video_id': 'element_id'}, inplace=True)
print df_video_interactions.head(10)

                     course_learner_id  \
0  course-v1:DelftX+EX101x+3T2015_2316   
1  course-v1:DelftX+EX101x+3T2015_2316   
2  course-v1:DelftX+EX101x+3T2015_2316   
3  course-v1:DelftX+EX101x+3T2015_2316   
4  course-v1:DelftX+EX101x+3T2015_2316   
5  course-v1:DelftX+EX101x+3T2015_2316   
6  course-v1:DelftX+EX101x+3T2015_2316   
7  course-v1:DelftX+EX101x+3T2015_2316   
8  course-v1:DelftX+EX101x+3T2015_2316   
9  course-v1:DelftX+EX101x+3T2015_2316   

                                          element_id  duration  \
0  block-v1:DelftX+EX101x+3T2015+type@video+block...       100   
1  block-v1:DelftX+EX101x+3T2015+type@video+block...        36   
2  block-v1:DelftX+EX101x+3T2015+type@video+block...       270   
3  block-v1:DelftX+EX101x+3T2015+type@video+block...       112   
4  block-v1:DelftX+EX101x+3T2015+type@video+block...       142   
5  block-v1:DelftX+EX101x+3T2015+type@video+block...        33   
6  block-v1:DelftX+EX101x+3T2015+type@video+block...       240   
7  block-

In [61]:
# Merge the data
df_course_elements = pd.read_csv(course_elements)
df_course_elements.rename(columns={'week': 'rel_week'}, inplace=True)
# print df_course_elements.head(10)
df_video_interactions = df_video_interactions.merge(df_course_elements, on="element_id")
# print df_video_interactions.shape
# print df_video_interactions.head(10)

In [62]:
# Aggregate the data
df_video_interactions = df_video_interactions[['course_learner_id','rel_week','duration']]
df_video_interactions = df_video_interactions.groupby(['course_learner_id', 'rel_week'], as_index=False).sum()

In [63]:
# replace relevant week values
df_video_interactions['rel_week'] = df_video_interactions['rel_week'].astype(str)
replace_week_name = lambda x: 'Week_' + x
df_video_interactions['rel_week'] = df_video_interactions['rel_week'].apply(replace_week_name)
print df_video_interactions.head(10)

                        course_learner_id rel_week  duration
0  course-v1:DelftX+EX101x+3T2015_1021198   Week_1       848
1  course-v1:DelftX+EX101x+3T2015_1021198   Week_2      1105
2  course-v1:DelftX+EX101x+3T2015_1021198   Week_3      1883
3  course-v1:DelftX+EX101x+3T2015_1021198   Week_4      1191
4  course-v1:DelftX+EX101x+3T2015_1021198   Week_5      1532
5  course-v1:DelftX+EX101x+3T2015_1021198   Week_6      1495
6  course-v1:DelftX+EX101x+3T2015_1021198   Week_7      1989
7  course-v1:DelftX+EX101x+3T2015_1021198   Week_8      1193
8  course-v1:DelftX+EX101x+3T2015_1023643   Week_1       951
9  course-v1:DelftX+EX101x+3T2015_1023643   Week_2      1055


In [64]:
video_interactions_byweek = df_video_interactions.pivot(index='course_learner_id', columns='rel_week', values='duration')
video_interactions_byweek_alt = df_video_interactions.pivot(index='course_learner_id', columns='rel_week', values='duration')
video_interactions_byweek = video_interactions_byweek.fillna(0)
video_interactions_byweek_alt = video_interactions_byweek_alt.fillna(0)
print video_interactions_byweek.head(10)

rel_week                                Week_1  Week_2  Week_3  Week_4  \
course_learner_id                                                        
course-v1:DelftX+EX101x+3T2015_1021198   848.0  1105.0  1883.0  1191.0   
course-v1:DelftX+EX101x+3T2015_1023643   951.0  1055.0   556.0   591.0   
course-v1:DelftX+EX101x+3T2015_1024625  1578.0  2406.0  1733.0  1603.0   
course-v1:DelftX+EX101x+3T2015_1029720  1378.0  1408.0  1891.0  1258.0   
course-v1:DelftX+EX101x+3T2015_1030769   747.0  1617.0  2329.0  1011.0   
course-v1:DelftX+EX101x+3T2015_1034392  2000.0  4011.0  2590.0  3703.0   
course-v1:DelftX+EX101x+3T2015_10394    1526.0  1595.0  1085.0   968.0   
course-v1:DelftX+EX101x+3T2015_1053433  1175.0  1517.0  1534.0  1183.0   
course-v1:DelftX+EX101x+3T2015_1055102  1410.0  2140.0  2214.0  1341.0   
course-v1:DelftX+EX101x+3T2015_1060759  1160.0  1053.0  1179.0   913.0   

rel_week                                Week_5  Week_6  Week_7  Week_8  
course_learner_id                     

In [65]:
# Output
course_id_alt = video_interactions.split("_")[0] + "_" + video_interactions.split("_")[1]
outputfile = outputfolder + course_id_alt + "_video_duration_byweek.csv"
video_interactions_byweek.to_csv(outputfile)

In [66]:
# Normalize the value of durations by total length of videos in corresponding weeks
weeklist = video_interactions_byweek_alt.columns.values.tolist()
# print weeklist
for (week, totol_videolength) in zip(weeklist, totol_videolength_byweek):
    print week
    print totol_videolength
    video_interactions_byweek_alt[week] = video_interactions_byweek_alt[week] / totol_videolength
        
print video_interactions_byweek_alt.head(10)

Week_1
1238
Week_2
1517
Week_3
1836
Week_4
1176
Week_5
1503
Week_6
2006
Week_7
1873
Week_8
1255
rel_week                                  Week_1    Week_2    Week_3  \
course_learner_id                                                      
course-v1:DelftX+EX101x+3T2015_1021198  0.684976  0.728411  1.025599   
course-v1:DelftX+EX101x+3T2015_1023643  0.768174  0.695452  0.302832   
course-v1:DelftX+EX101x+3T2015_1024625  1.274637  1.586025  0.943900   
course-v1:DelftX+EX101x+3T2015_1029720  1.113086  0.928148  1.029956   
course-v1:DelftX+EX101x+3T2015_1030769  0.603393  1.065920  1.268519   
course-v1:DelftX+EX101x+3T2015_1034392  1.615509  2.644034  1.410675   
course-v1:DelftX+EX101x+3T2015_10394    1.232633  1.051417  0.590959   
course-v1:DelftX+EX101x+3T2015_1053433  0.949111  1.000000  0.835512   
course-v1:DelftX+EX101x+3T2015_1055102  1.138934  1.410679  1.205882   
course-v1:DelftX+EX101x+3T2015_1060759  0.936995  0.694133  0.642157   

rel_week                               

In [67]:
course_id_alt = video_interactions.split("_")[0] + "_" + video_interactions.split("_")[1]
outputfile_alt = outputfolder + course_id_alt + "_video_duration_byweek_alt.csv"
video_interactions_byweek_alt.to_csv(outputfile_alt)

## Calculate average durations and standard deviations of each group of passer 

In [70]:
# Merge with group we calculate in grade processing
df_learner_group = pd.read_csv(learner_group)
print df_learner_group.head(10)
print video_interactions_byweek_alt.head(10)
video_interactions_byweek_alt_grouped = video_interactions_byweek_alt.merge(df_learner_group, on="course_learner_id")

                        course_learner_id   group
0  course-v1:DelftX+EX101x+3T2015_1021198  Week_5
1  course-v1:DelftX+EX101x+3T2015_1023643  Week_5
2  course-v1:DelftX+EX101x+3T2015_1024625  Week_5
3  course-v1:DelftX+EX101x+3T2015_1029720  Week_6
4  course-v1:DelftX+EX101x+3T2015_1030769  Week_5
5  course-v1:DelftX+EX101x+3T2015_1034392  Week_5
6    course-v1:DelftX+EX101x+3T2015_10394  Week_7
7  course-v1:DelftX+EX101x+3T2015_1053433  Week_6
8  course-v1:DelftX+EX101x+3T2015_1055102  Week_6
9   course-v1:DelftX+EX101x+3T2015_106677  Week_5
rel_week                                  Week_1    Week_2    Week_3  \
course_learner_id                                                      
course-v1:DelftX+EX101x+3T2015_1021198  0.684976  0.728411  1.025599   
course-v1:DelftX+EX101x+3T2015_1023643  0.768174  0.695452  0.302832   
course-v1:DelftX+EX101x+3T2015_1024625  1.274637  1.586025  0.943900   
course-v1:DelftX+EX101x+3T2015_1029720  1.113086  0.928148  1.029956   
course-v1:DelftX+E

KeyError: 'course_learner_id'

In [None]:
# Generate a new dataframe whose columns are course_learner_id, duration, rel_week, group
def trans_video_byweek(df, weeklist):
    df_trans = pd.DataFrame()
    for week in weeklist:
        data = df.loc[: , ["course_learner_id", week, "group"]]
        data['rel_week'] = week
        data.rename(columns={week: 'duration'}, inplace=True)
        df_trans = df_trans.append(data)
    
    df_trans = df_trans.sort_values(['course_learner_id', "rel_week"], ascending=[True, True])
    return df_trans

In [None]:
video_interactions_distribution = trans_video_byweek(video_interactions_byweek_alt_grouped, weeklist)
# video_interactions_distribution.to_csv("FP101x_3T2015_grade_distribution.csv", index=False)
print video_interactions_distribution.head(10)