# Video Processing
This notebook is used for generating weekly video statistics for each person.

## Getting learners' information about video interactions from DelftX database
We use following SQL statements getting information about video interactions of all passers. The generated table is named as "CTB3365STx_1T2016_pass_videointeractions.csv"

```sql
SELECT 
	video_interaction.course_learner_id AS course_learner_id,
    video_interaction.video_id AS video_id,
    video_interaction.duration AS duration,
    video_interaction.duration_backward_seek AS duration_backward_seek,
    video_interaction.duration_forward_seek AS duration_forward_seek,
    video_interaction.duration_pause AS duration_pause,
    video_interaction.times_backward_seek AS times_backward_seek,
    video_interaction.times_forward_seek AS times_forward_seek,
    video_interaction.times_pause AS times_pause,
    video_interaction.times_speed_down AS times_speed_down,
    video_interaction.times_speed_up AS times_speed_up,
    video_interaction.start_time AS start_time,
    video_interaction.end_time AS end_time
FROM 
	DelftX2.video_interaction AS video_interaction
	JOIN DelftX2.learner_index AS learner_index
	ON learner_index.course_learner_id = video_interaction.course_learner_id
WHERE # select all the pass user
	video_interaction.course_learner_id IN (
		SELECT 
			course_learner_id 
		FROM 
			DelftX2.course_learner
		WHERE 
			DelftX2.course_learner.certificate_status <> "notpassing")
    AND learner_index.course_id = "course-v1:DelftX+CTB3365STx+1T2016"
```

In [159]:
import pandas as pd
import numpy as np
import os

## Read .csv files 
Read related csv files generated by previous SQL query and the first series of notebook (1_2_Learner_Grouping.ipynb)

In [160]:
# OutputFolder
outputfolder = "video_interaction/"
if not os.path.exists(outputfolder):
    os.makedirs(outputfolder)

# # EX101x relevant csv
# video_interactions = "EX101x_3T2015_pass_videointeractions.csv"
# course_elements = "CourseElements_FPEX.csv"
# learner_group = "EX101x_3T2015_group.csv"
# totol_videolength_byweek = [1238, 1517, 1836, 1176, 1503, 2006, 1873, 1255]

# # FP101x relevant csv
# video_interactions = "FP101x_3T2015_pass_videointeractions.csv"
# course_elements = "CourseElements_FPEX.csv"
# learner_group = "FP101x_3T2015_group.csv"
# totol_videolength_byweek = [2950, 4625, 4105, 2957, 4384, 5326, 1705, 252]

# # CTB3365STx
# video_interactions = "CTB3365STx_1T2016_pass_videointeractions.csv"
# course_elements = "CTB3365STx_1T2016_chapter_split.csv"
# learner_group = "CTB3365STx_1T2016_group.csv"
# totol_videolength_byweek = [789, 8212, 5268, 6358, 5928, 8496, 8360, 457]

# RI101x
video_interactions = "RI101x_1T2016_pass_videointeractions.csv"
course_elements = "RI101x_1T2016_chapter_split.csv"
learner_group = "RI101x_1T2016_group.csv"
totol_videolength_byweek = [3721, 4409, 5622, 5615, 2688, 3280, 4046]

In [161]:
df_video_interactions = pd.read_csv(video_interactions)
# df_video_interactions.rename(columns={'question_id': 'element_id'}, inplace=True)
print df_video_interactions.head(10)

                       course_learner_id                          video_id  \
0  course-v1:DelftX+RI101x+1T2016_167415  a232c214e0d14d8c88dcacbf94b078e5   
1  course-v1:DelftX+RI101x+1T2016_167415  0c29fa2e376d4119bf171683c1cf5007   
2  course-v1:DelftX+RI101x+1T2016_167415  24668d411e674b3292df8b318aadbe99   
3  course-v1:DelftX+RI101x+1T2016_167415  ffa477ea558440eda9d562a4cd7e43dc   
4  course-v1:DelftX+RI101x+1T2016_167415  44ccdd52a74f4c78b49a511a4e2be782   
5  course-v1:DelftX+RI101x+1T2016_167415  2b4570ac1195430eae8cad2c099d2abe   
6  course-v1:DelftX+RI101x+1T2016_167415  5f0f2d6bea004590bb8f202a4110e2e1   
7  course-v1:DelftX+RI101x+1T2016_167415  2cab8f18a3084b989a2cd8e516db02ac   
8  course-v1:DelftX+RI101x+1T2016_167415  44ccdd52a74f4c78b49a511a4e2be782   
9  course-v1:DelftX+RI101x+1T2016_167415  667472df86cd4f9aa812c82102b4c1ca   

   duration  duration_backward_seek  duration_forward_seek  duration_pause  \
0        23                     0.0                    0.0     

## Merge and aggregate durations of video interactions by week for each learner

In [162]:
# Get course id
course_id = df_video_interactions["course_learner_id"][0].split("_")[0].split(":")[1]
print course_id
# change the format of video id
df_video_interactions["video_id"] = df_video_interactions["video_id"].astype(str)
df_video_interactions["video_id"] = df_video_interactions["video_id"].apply(lambda x: "block-v1:" + course_id + "+type@video+block@" + x)
# print df_video_interactions.head(10)
# change the column name for merging in the next step
df_video_interactions.rename(columns={'video_id': 'element_id'}, inplace=True)
print df_video_interactions.head(10)

DelftX+RI101x+1T2016
                       course_learner_id  \
0  course-v1:DelftX+RI101x+1T2016_167415   
1  course-v1:DelftX+RI101x+1T2016_167415   
2  course-v1:DelftX+RI101x+1T2016_167415   
3  course-v1:DelftX+RI101x+1T2016_167415   
4  course-v1:DelftX+RI101x+1T2016_167415   
5  course-v1:DelftX+RI101x+1T2016_167415   
6  course-v1:DelftX+RI101x+1T2016_167415   
7  course-v1:DelftX+RI101x+1T2016_167415   
8  course-v1:DelftX+RI101x+1T2016_167415   
9  course-v1:DelftX+RI101x+1T2016_167415   

                                          element_id  duration  \
0  block-v1:DelftX+RI101x+1T2016+type@video+block...        23   
1  block-v1:DelftX+RI101x+1T2016+type@video+block...       107   
2  block-v1:DelftX+RI101x+1T2016+type@video+block...       557   
3  block-v1:DelftX+RI101x+1T2016+type@video+block...       379   
4  block-v1:DelftX+RI101x+1T2016+type@video+block...        73   
5  block-v1:DelftX+RI101x+1T2016+type@video+block...       194   
6  block-v1:DelftX+RI101x+1T2016

In [163]:
# Merge the data
df_course_elements = pd.read_csv(course_elements)
# Set the column name to the same
df_course_elements.rename(columns={'week': 'rel_week'}, inplace=True)
df_course_elements.rename(columns={'relevant_week': 'rel_week'}, inplace=True)
# print df_course_elements.head(10)
df_video_interactions = df_video_interactions.merge(df_course_elements, on="element_id")
# print df_video_interactions.shape
print df_video_interactions.head(10)

                        course_learner_id  \
0   course-v1:DelftX+RI101x+1T2016_167415   
1   course-v1:DelftX+RI101x+1T2016_169086   
2  course-v1:DelftX+RI101x+1T2016_1048861   
3  course-v1:DelftX+RI101x+1T2016_1048861   
4  course-v1:DelftX+RI101x+1T2016_1048861   
5  course-v1:DelftX+RI101x+1T2016_1048861   
6  course-v1:DelftX+RI101x+1T2016_1048861   
7  course-v1:DelftX+RI101x+1T2016_1906592   
8  course-v1:DelftX+RI101x+1T2016_1906592   
9  course-v1:DelftX+RI101x+1T2016_1954245   

                                          element_id  duration  \
0  block-v1:DelftX+RI101x+1T2016+type@video+block...        23   
1  block-v1:DelftX+RI101x+1T2016+type@video+block...       656   
2  block-v1:DelftX+RI101x+1T2016+type@video+block...       693   
3  block-v1:DelftX+RI101x+1T2016+type@video+block...        12   
4  block-v1:DelftX+RI101x+1T2016+type@video+block...       195   
5  block-v1:DelftX+RI101x+1T2016+type@video+block...       149   
6  block-v1:DelftX+RI101x+1T2016+type@vide

In [164]:
# Aggregate the data
df_video_interactions = df_video_interactions[['course_learner_id','rel_week','duration']]
df_video_interactions = df_video_interactions.groupby(['course_learner_id', 'rel_week'], as_index=False).sum()

In [165]:
# replace relevant week values
df_video_interactions['rel_week'] = df_video_interactions['rel_week'].astype(str)
replace_week_name = lambda x: 'Week_' + x
df_video_interactions['rel_week'] = df_video_interactions['rel_week'].apply(replace_week_name)
print df_video_interactions.head(10)

                         course_learner_id rel_week  duration
0  course-v1:DelftX+RI101x+1T2016_10047608   Week_1      4476
1  course-v1:DelftX+RI101x+1T2016_10047608   Week_2      3689
2  course-v1:DelftX+RI101x+1T2016_10047608   Week_3      4553
3  course-v1:DelftX+RI101x+1T2016_10047608   Week_4      3984
4  course-v1:DelftX+RI101x+1T2016_10047608   Week_5      2298
5  course-v1:DelftX+RI101x+1T2016_10047608   Week_6      2028
6  course-v1:DelftX+RI101x+1T2016_10047608   Week_7      1959
7  course-v1:DelftX+RI101x+1T2016_10143370   Week_1      3950
8  course-v1:DelftX+RI101x+1T2016_10143370   Week_2      3097
9  course-v1:DelftX+RI101x+1T2016_10143370   Week_3      3915


In [166]:
video_interactions_byweek = df_video_interactions.pivot(index='course_learner_id', columns='rel_week', values='duration')
video_interactions_byweek_alt = df_video_interactions.pivot(index='course_learner_id', columns='rel_week', values='duration')
video_interactions_byweek = video_interactions_byweek.fillna(0)
video_interactions_byweek_alt = video_interactions_byweek_alt.fillna(0)
print video_interactions_byweek.head(10)

rel_week                                 Week_1  Week_2  Week_3  Week_4  \
course_learner_id                                                         
course-v1:DelftX+RI101x+1T2016_10047608  4476.0  3689.0  4553.0  3984.0   
course-v1:DelftX+RI101x+1T2016_10143370  3950.0  3097.0  3915.0  3130.0   
course-v1:DelftX+RI101x+1T2016_10167505  2751.0  1129.0  4075.0  4067.0   
course-v1:DelftX+RI101x+1T2016_10192342  4379.0  5372.0  3925.0   756.0   
course-v1:DelftX+RI101x+1T2016_10215694  6314.0  4335.0  4713.0  5230.0   
course-v1:DelftX+RI101x+1T2016_10234721  3277.0  6716.0  5508.0  5567.0   
course-v1:DelftX+RI101x+1T2016_10252808  1808.0  3201.0     0.0     0.0   
course-v1:DelftX+RI101x+1T2016_10263285  3629.0  5868.0  4648.0  3861.0   
course-v1:DelftX+RI101x+1T2016_10282364  3419.0  5277.0  5467.0  4995.0   
course-v1:DelftX+RI101x+1T2016_10306096   254.0     0.0    20.0     0.0   

rel_week                                 Week_5  Week_6  Week_7  
course_learner_id                

In [167]:
# Output
course_id_alt = video_interactions.split("_")[0] + "_" + video_interactions.split("_")[1]
outputfile = outputfolder + course_id_alt + "_video_duration_byweek.csv"
video_interactions_byweek.to_csv(outputfile)

In [168]:
# Normalize the value of durations by total length of videos in corresponding weeks
weeklist = video_interactions_byweek_alt.columns.values.tolist()
# print weeklist
for (week, totol_videolength) in zip(weeklist, totol_videolength_byweek):
    print week
    print totol_videolength
    video_interactions_byweek_alt[week] = video_interactions_byweek_alt[week] / totol_videolength
        
print video_interactions_byweek_alt.head(10)

Week_1
3721
Week_2
4409
Week_3
5622
Week_4
5615
Week_5
2688
Week_6
3280
Week_7
4046
rel_week                                   Week_1    Week_2    Week_3  \
course_learner_id                                                       
course-v1:DelftX+RI101x+1T2016_10047608  1.202902  0.836698  0.809854   
course-v1:DelftX+RI101x+1T2016_10143370  1.061543  0.702427  0.696371   
course-v1:DelftX+RI101x+1T2016_10167505  0.739317  0.256067  0.724831   
course-v1:DelftX+RI101x+1T2016_10192342  1.176834  1.218417  0.698150   
course-v1:DelftX+RI101x+1T2016_10215694  1.696856  0.983216  0.838314   
course-v1:DelftX+RI101x+1T2016_10234721  0.880677  1.523248  0.979723   
course-v1:DelftX+RI101x+1T2016_10252808  0.485891  0.726015  0.000000   
course-v1:DelftX+RI101x+1T2016_10263285  0.975275  1.330914  0.826752   
course-v1:DelftX+RI101x+1T2016_10282364  0.918839  1.196870  0.972430   
course-v1:DelftX+RI101x+1T2016_10306096  0.068261  0.000000  0.003557   

rel_week                               

In [169]:
course_id_alt = video_interactions.split("_")[0] + "_" + video_interactions.split("_")[1]
outputfile_alt = outputfolder + course_id_alt + "_video_duration_byweek_alt.csv"
video_interactions_byweek_alt.to_csv(outputfile_alt)

## Calculate average durations and standard deviations of each group of passer 

In [170]:
# Merge with group we calculate in grade processing
df_learner_group = pd.read_csv(learner_group)
print df_learner_group.head(10)
print video_interactions_byweek_alt.head(10)
video_interactions_byweek_alt.reset_index(level=0, inplace=True)
video_interactions_byweek_alt_grouped = video_interactions_byweek_alt.merge(df_learner_group, on="course_learner_id")


                         course_learner_id   group
0  course-v1:DelftX+RI101x+1T2016_10047608  Week_5
1  course-v1:DelftX+RI101x+1T2016_10143370  Week_6
2  course-v1:DelftX+RI101x+1T2016_10167505  Week_5
3  course-v1:DelftX+RI101x+1T2016_10192342  Week_6
4  course-v1:DelftX+RI101x+1T2016_10215694  Week_5
5  course-v1:DelftX+RI101x+1T2016_10234721  Week_5
6  course-v1:DelftX+RI101x+1T2016_10252808  Week_7
7  course-v1:DelftX+RI101x+1T2016_10263285  Week_5
8  course-v1:DelftX+RI101x+1T2016_10282364  Week_5
9  course-v1:DelftX+RI101x+1T2016_10306096  Week_6
rel_week                                   Week_1    Week_2    Week_3  \
course_learner_id                                                       
course-v1:DelftX+RI101x+1T2016_10047608  1.202902  0.836698  0.809854   
course-v1:DelftX+RI101x+1T2016_10143370  1.061543  0.702427  0.696371   
course-v1:DelftX+RI101x+1T2016_10167505  0.739317  0.256067  0.724831   
course-v1:DelftX+RI101x+1T2016_10192342  1.176834  1.218417  0.698150   
c

In [171]:
# Generate a new dataframe whose columns are course_learner_id, duration, rel_week, group
def trans_video_byweek(df, weeklist):
    df_trans = pd.DataFrame()
    for week in weeklist:
        data = df.loc[: , ["course_learner_id", week, "group"]]
        data['rel_week'] = week
        data.rename(columns={week: 'duration'}, inplace=True)
        df_trans = df_trans.append(data)
    
    df_trans = df_trans.sort_values(['course_learner_id', "rel_week"], ascending=[True, True])
    return df_trans

In [172]:
video_interactions_distribution = trans_video_byweek(video_interactions_byweek_alt_grouped, weeklist)
video_interactions_distribution.reset_index()
print video_interactions_distribution.head(100)
outputfile_video_distribution = outputfolder + course_id_alt + "_video_duration_distribution.csv"
video_interactions_distribution.to_csv(outputfile_video_distribution, index=False)

rel_week                        course_learner_id  duration   group rel_week
0         course-v1:DelftX+RI101x+1T2016_10047608  1.202902  Week_5   Week_1
0         course-v1:DelftX+RI101x+1T2016_10047608  0.836698  Week_5   Week_2
0         course-v1:DelftX+RI101x+1T2016_10047608  0.809854  Week_5   Week_3
0         course-v1:DelftX+RI101x+1T2016_10047608  0.709528  Week_5   Week_4
0         course-v1:DelftX+RI101x+1T2016_10047608  0.854911  Week_5   Week_5
0         course-v1:DelftX+RI101x+1T2016_10047608  0.618293  Week_5   Week_6
0         course-v1:DelftX+RI101x+1T2016_10047608  0.484182  Week_5   Week_7
1         course-v1:DelftX+RI101x+1T2016_10143370  1.061543  Week_6   Week_1
1         course-v1:DelftX+RI101x+1T2016_10143370  0.702427  Week_6   Week_2
1         course-v1:DelftX+RI101x+1T2016_10143370  0.696371  Week_6   Week_3
1         course-v1:DelftX+RI101x+1T2016_10143370  0.557435  Week_6   Week_4
1         course-v1:DelftX+RI101x+1T2016_10143370  0.996280  Week_6   Week_5

In [173]:
print video_interactions_distribution.columns.values.tolist()

['course_learner_id', 'duration', 'group', 'rel_week']


## Calculate mean and std by group and relevant weeks

In [174]:
video_interactions_avg = video_interactions_distribution[["duration","group","rel_week"]].groupby(['group', 'rel_week'], as_index=False).mean()
video_interactions_avg.rename(columns={'duration': 'avgduration'}, inplace=True)
# print video_interactions_avg.head(10)

video_interactions_std = video_interactions_distribution[["duration","group","rel_week"]].groupby(['group', 'rel_week'], as_index=False).agg(lambda x: np.std(x, ddof=1))
video_interactions_std.rename(columns={'duration': 'stdduration'}, inplace=True)
# print video_interactions_std.head(10)

video_interactions_group_byweek = video_interactions_avg.merge(video_interactions_std, on=["group", "rel_week"])
print video_interactions_group_byweek.head(100)

outputfile_video_group_byweek = outputfolder + course_id_alt + "_video_duration_avg_groupbyweek.csv"
video_interactions_group_byweek.to_csv(outputfile_video_group_byweek, index=False)

rel_week   group rel_week  avgduration  stdduration
0         Week_4   Week_1     0.449461     0.373716
1         Week_4   Week_2     0.466999     0.435750
2         Week_4   Week_3     0.264418     0.353580
3         Week_4   Week_4     0.166142     0.352895
4         Week_4   Week_5     0.287078     0.379109
5         Week_4   Week_6     0.150610     0.298287
6         Week_4   Week_7     0.228209     0.315795
7         Week_5   Week_1     0.682951     0.327228
8         Week_5   Week_2     0.672525     0.392354
9         Week_5   Week_3     0.480367     0.325044
10        Week_5   Week_4     0.472009     0.331770
11        Week_5   Week_5     0.689488     0.435188
12        Week_5   Week_6     0.547596     0.378753
13        Week_5   Week_7     0.449033     0.412150
14        Week_6   Week_1     0.625717     0.653373
15        Week_6   Week_2     0.465412     0.537502
16        Week_6   Week_3     0.453272     0.556572
17        Week_6   Week_4     0.227542     0.289485
18        We