In [2]:
from data_loader import train
from preprocessing import *

In [3]:
studentInfo = train.studentInfo.copy()

In [6]:
studentInfo = preprocessing.preprocessing_studentInfo(studentInfo)

In [7]:
studentInfo.shape

(21333, 30)

In [10]:
identifiers = ['code_module', 'code_presentation', 'id_student']

In [12]:
studentInfo[identifiers].duplicated().sum()

0

so, preprocessed studentInfo is uniquely identified by `['code_module', 'code_presentation', 'id_student']`

In [16]:
studentVle = train.studentVle.copy()
vle = train.vle.copy()

In [17]:
studentVle = preprocessing.preprocessing_studentVle(vle, studentVle)

the result of merging for studentVle and vle is 
 both          7035828
right_only         75
left_only           0
Name: _merge, dtype: int64


In [19]:
identifiers = ['code_module', 'code_presentation', 'id_student', 'temperal_seg']

In [20]:
studentVle[identifiers].duplicated().sum()

0

so, the preprocessed studentVle is uniquely identified by `['code_module', 'code_presentation', 'id_student', 'temperal_seg']`

In [21]:
studentVle.head()

Unnamed: 0,code_module,code_presentation,id_student,temperal_seg,sum_click,dataplus,dualpane,externalquiz,folder,forumng,...,day 27.0,day 28.0,day 29.0,day 30.0,peak,variation,kurtosis,longest_zeros,longest_ones,entropy
0,AAA,2013J,11391.0,-0.0,147.0,0.0,0.0,0.0,0.0,0.115646,...,0.0,0.0,0.0,0.333333,0.666667,-4.016958,18.773201,24,1,0.918296
1,AAA,2013J,11391.0,1.0,277.0,0.0,0.0,0.0,0.0,0.144404,...,0.0,0.0,0.064982,0.018051,0.458484,-4.745019,15.540183,10,2,2.200965
2,AAA,2013J,11391.0,2.0,105.0,0.0,0.0,0.0,0.0,0.390476,...,0.0,0.0,0.0,0.0,0.495238,-4.665687,18.715248,6,2,2.157186
3,AAA,2013J,11391.0,3.0,58.0,0.0,0.0,0.0,0.0,0.482759,...,0.034483,0.0,0.0,0.0,0.327586,-4.889854,5.744577,8,2,2.094886
4,AAA,2013J,11391.0,4.0,41.0,0.0,0.0,0.0,0.0,0.463415,...,0.0,0.0,0.0,0.0,0.658537,-4.136472,22.26102,13,1,1.247285


In [26]:
def preprocessing_studentVle(vle, studentVle, assessments, 
                             using_testing_dates=0, user_defined_range=[]):
    
    def extract_test_dates(df, num_dates=3):
        return pd.Series({'dates': sorted(df['date'].tolist())[:num_dates]})
    
    def filter_according_to_range(df, using_testing_dates=using_testing_dates,
                                  user_defined_range=user_defined_range):
        if user_defined_range:
            start, end = user_defined_range
        else:
            start = 0 if using_test_dates == 0 else df['dates'][using_test_dates - 1]
            end = df['dates'][using_testing_dates]

        return start <= df['date'] <= end
    
    vle = vle.copy()
    studentVle = studentVle.copy()
    assessments = assessments.copy()
    
    # merge wiht assessments dates
    identifiers = ['code_module', 'code_presentation']
    assessments = assessments.groupby(identifiers).apply(extract_test_dates)
    assessments.reset_index(inplace=True)
    studentVle = studentVle.merge(assessments, on=identifiers, how='left')

    # filter dates
    mask = studentVle.apply(filter_according_to_range, axis=1)
    studentVle = studentVle[mask].copy()
    
    # merge two
    studentVle = studentVle.merge(
        vle,
        on=['code_module', 'code_presentation', 'id_site'],
        how='left', indicator=True)
    print('the result of merging for studentVle and vle is \n {}'
          .format(studentVle['_merge'].value_counts()))
    studentVle.drop('_merge', axis=1, inplace=True)

    # create identifiers
    identifiers = ['code_module', 'code_presentation', 'id_student']

    # effort
    effort = studentVle.groupby(identifiers)[
        'sum_click'].agg('sum').reset_index()

    # marginal distribution on resource
    resource_dist = studentVle.groupby(identifiers + ['activity_type'])[
        'sum_click'].agg('sum').unstack(-1)
    resource_dist.columns = list(resource_dist.columns)
    resource_dist = resource_dist.div(resource_dist.sum(axis=1), axis=0)
    assert np.allclose(resource_dist.sum(axis=1), 1)
    assert resource_dist.sum(axis=1).shape[0] == resource_dist.shape[0]
    resource_dist.reset_index(inplace=True)
    resource_dist.fillna(0, inplace=True)

    # marginal distribution on time:
    studentVle['temperal_seq'] = studentVle['date'] - (
        studentVle['temperal_seg'] - 1) * 30
    temporal_dist = studentVle.groupby(
        identifiers + ['temperal_seq'])['sum_click'].agg('sum').unstack(-1)
    day_seq = ['day ' + str(d) for d in list(temporal_dist.columns)]
    temporal_dist.columns = day_seq
    temporal_dist.fillna(0, inplace=True)
    temporal_dist = temporal_dist.div(temporal_dist.sum(axis=1), axis=0)
    assert np.allclose(temporal_dist.sum(axis=1), 1)
    assert temporal_dist.sum(axis=1).shape[0] == temporal_dist.shape[0]
    temporal_dist.reset_index(inplace=True)

    # build non-linear summary features for temporal_dist
    temporal_dist['peak'] = temporal_dist[day_seq].max(axis=1)
    temporal_dist['variation'] = temporal_dist[day_seq].var(axis=1).apply(math.log)
    temporal_dist['kurtosis'] = temporal_dist[day_seq].kurtosis(axis=1)
    temporal_dist['longest_zeros'] = temporal_dist[day_seq].apply(lambda x: preprocessing.longest_run(x, 0), axis=1)
    temporal_dist['longest_ones'] = temporal_dist[day_seq].apply(lambda x: preprocessing.longest_run(x, 1), axis=1)
    temporal_dist['entropy'] = temporal_dist[day_seq].apply(lambda x: scipy.stats.entropy(x, base=2),axis=1)

    engagement = effort.merge(resource_dist, on=identifiers, how='left')
    engagement = engagement.merge(temporal_dist, on=identifiers, how='left')
    assert engagement[identifiers].duplicated().sum() == 0

    # ignore temperal_seg larger than 8
    return engagement[(engagement['temperal_seg'] < 9)]

In [63]:
vle = train.vle.copy()
studentVle = train.studentVle.copy()
assessments = train.assessments.copy()

In [64]:
using_testing_dates = 0
user_defined_range = []

In [65]:
def extract_test_dates(df, num_dates=3):
    return pd.Series({'dates': sorted(df['date'].tolist())[:num_dates]})

def extract_timestamps_and_weights(df):
    return pd.Series({
        'timestamps': df['date'].tolist(),
        'weights': df['sum_click'].tolist()})

def filter_according_to_range(df, using_testing_dates=using_testing_dates,
                              user_defined_range=user_defined_range):
    if user_defined_range:
        start, end = user_defined_range
    else:
        start = 0 if using_testing_dates == 0 else df['dates'][using_testing_dates - 1]
        end = df['dates'][using_testing_dates]

    return start <= df['date'] <= end


vle = vle.copy()
studentVle = studentVle.copy()
assessments = assessments.copy()

# merge wiht assessments dates
identifiers = ['code_module', 'code_presentation']
assessments = assessments.groupby(identifiers).apply(extract_test_dates)
assessments.reset_index(inplace=True)
studentVle = studentVle.merge(assessments, on=identifiers, how='left')

# filter dates
mask = studentVle.apply(filter_according_to_range, axis=1)
studentVle = studentVle[mask].copy()

# merge two
studentVle = studentVle.merge(
    vle,
    on=['code_module', 'code_presentation', 'id_site'],
    how='left', indicator=True)
print('the result of merging for studentVle and vle is \n {}'
      .format(studentVle['_merge'].value_counts()))
studentVle.drop('_merge', axis=1, inplace=True)

# create identifiers
identifiers = ['code_module', 'code_presentation', 'id_student']

# effort
effort = studentVle.groupby(identifiers)[
    'sum_click'].agg('sum').reset_index()

# marginal distribution on resource
resource_dist = studentVle.groupby(identifiers + ['activity_type'])[
    'sum_click'].agg('sum').unstack(-1)
resource_dist.columns = list(resource_dist.columns)
resource_dist = resource_dist.div(resource_dist.sum(axis=1), axis=0)
assert np.allclose(resource_dist.sum(axis=1), 1)
assert resource_dist.sum(axis=1).shape[0] == resource_dist.shape[0]
resource_dist.reset_index(inplace=True)
resource_dist.fillna(0, inplace=True)

# 
# apply function
identifiers = ['code_module', 'code_presentation', 'id_student', 'dates']
studentVle = studentVle.groupby(identifiers + ['date'])['sum_click'].agg('sum').reset_index()
studentVle.sort_values(identifiers, inplace=True)
studentVle = studentVle.groupby(
    identifiers).apply(extract_timestamps_and_weights).reset_index()

the result of merging for studentVle and vle is 
 both          1239046
right_only          0
left_only           0
Name: _merge, dtype: int64


TypeError: unhashable type: 'list'

In [None]:
def get_statistics(df):
    ts = df['timestamps']
    

In [None]:
studentVle.head()

In [52]:
resource_dist

Unnamed: 0,code_module,code_presentation,id_student,dataplus,dualpane,externalquiz,forumng,glossary,homepage,oucollaborate,oucontent,ouelluminate,ouwiki,page,questionnaire,quiz,resource,sharedsubpage,subpage,url
0,AAA,2013J,11391,0.0,0.0,0.0,0.174917,0.000000,0.115512,0.000000,0.643564,0.0,0.0,0.0,0.0,0.0,0.029703,0.0,0.033003,0.003300
1,AAA,2013J,28400,0.0,0.0,0.0,0.413669,0.000000,0.230216,0.000000,0.197842,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.100719,0.057554
2,AAA,2013J,30268,0.0,0.0,0.0,0.486034,0.000000,0.150838,0.000000,0.290503,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.055866,0.016760
3,AAA,2013J,31604,0.0,0.0,0.0,0.323636,0.000000,0.200000,0.000000,0.309091,0.0,0.0,0.0,0.0,0.0,0.003636,0.0,0.112727,0.050909
4,AAA,2013J,32885,0.0,0.0,0.0,0.308824,0.000000,0.176471,0.000000,0.500000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.014706,0.000000
5,AAA,2013J,38053,0.0,0.0,0.0,0.497561,0.000000,0.251220,0.000000,0.197561,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.034146,0.019512
6,AAA,2013J,45462,0.0,0.0,0.0,0.090909,0.000000,0.159091,0.007576,0.632576,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.087121,0.022727
7,AAA,2013J,45642,0.0,0.0,0.0,0.385965,0.000000,0.168421,0.003509,0.291228,0.0,0.0,0.0,0.0,0.0,0.003509,0.0,0.105263,0.042105
8,AAA,2013J,52130,0.0,0.0,0.0,0.238095,0.000000,0.226190,0.007937,0.380952,0.0,0.0,0.0,0.0,0.0,0.007937,0.0,0.115079,0.023810
9,AAA,2013J,53025,0.0,0.0,0.0,0.628125,0.000000,0.100000,0.000000,0.196875,0.0,0.0,0.0,0.0,0.0,0.003125,0.0,0.046875,0.025000
