In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
import math

%matplotlib inline

from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

from scipy import sparse
from scipy.linalg import sqrtm 


import networkx as nx


In [3]:
# from google.colab import drive
# drive.mount('/content/drive/')

# %cd /content/drive/My Drive/ACT_MOOC/


In [2]:
# Load the MOOC action log: a tab-separated file with the columns
# ACTIONID, USERID, TARGETID and TIMESTAMP.
actions = pd.read_csv('mooc_actions.tsv', sep='\t')

# Original author's notes on the intended notation:
# G(0, 1, 0) = exp(-gamma*6.0)
# G(node_start, node_end, time_jump_num) = scaled_time

# Downcast the two columns to save memory. An integer downcast wraps around
# silently when a value does not fit, so check the ranges first rather than
# risk corrupting ids or timestamps without noticing.
narrow_dtypes = {'TARGETID': 'int16', 'TIMESTAMP': 'int32'}
for column, dtype in narrow_dtypes.items():
    limits = np.iinfo(dtype)
    assert actions[column].between(limits.min, limits.max).all(), (
        f"{column} has values that do not fit in {dtype}"
    )
actions = actions.astype(narrow_dtypes)

print(actions.dtypes)

ACTIONID     int64
USERID       int64
TARGETID     int16
TIMESTAMP    int32
dtype: object


### Whole dataset

In [3]:
# Switch for the cells below: True keeps only the last 100 actions of each user
# (and saves to the "_last100" pickle); False keeps every action ("_all" pickle).
only_100_actions = False

In [4]:
# Build one row per user holding that user's targets and timestamps as two
# parallel lists. groupby keeps the original row order inside each group, so
# the lists follow the order of the rows in `actions`.
actions_by_user = actions.groupby('USERID')

targetid_time_df = actions_by_user['TARGETID'].apply(list).reset_index(name="TARGETID")

# Column assignment aligns on index labels. The grouped Series is indexed by
# USERID while `targetid_time_df` has a positional 0..n-1 index, so drop the
# USERID index first; otherwise the rows only line up when the user ids happen
# to be exactly 0..n-1. Both results come from the same groupby, so their row
# order is identical.
targetid_time_df["TIMESTAMP"] = (
    actions_by_user['TIMESTAMP'].apply(list).reset_index(drop=True)
)

if only_100_actions:
    # Keep only the trailing MAX_ACTIONS_PER_USER entries of every list.
    # Slicing a shorter list returns it unchanged, so no length check is needed.
    # Assigning whole columns avoids chained `.iloc` assignment, which does not
    # write back to the frame under pandas Copy-on-Write.
    # NOTE(review): "last" means last in row order -- this assumes `actions`
    # is already chronological within each user; confirm for new data.
    MAX_ACTIONS_PER_USER = 100
    for column in ('TARGETID', 'TIMESTAMP'):
        targetid_time_df[column] = targetid_time_df[column].map(
            lambda seq: seq[-MAX_ACTIONS_PER_USER:]
        )

# Per-user timestamp lists, iterated by the timestamp-shifting cell below.
times = targetid_time_df['TIMESTAMP']

targetid_time_df.head()

Unnamed: 0,USERID,TARGETID,TIMESTAMP
0,0,"[0, 1, 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 2, ...","[0, 6, 41, 49, 51, 55, 59, 62, 65, 113, 226, 9..."
1,1,"[10, 1, 2, 7, 0, 11, 12, 11, 12, 12, 12, 12, 5...","[7262, 7266, 7273, 7289, 7299, 7319, 7839, 784..."
2,2,"[1, 10, 3, 13, 8, 7, 3, 13, 4, 15, 5, 16, 5, 6...","[37868, 78761, 78894, 82108, 105180, 105187, 1..."
3,3,"[1, 10, 3, 13, 8, 3]","[37953, 38113, 38126, 38246, 776948, 776953]"
4,4,"[1, 3, 4, 13, 8, 5, 5, 16, 16]","[37969, 38018, 38724, 38725, 38736, 38753, 396..."


##### Sorting out the timestamps:
* shifting each user's timestamps so that every sequence begins at 0 seconds (a per-user offset, not a rescaling)
* finding the most actions taken

In [5]:
# Shift every user's timestamps so that the user's first action is at t = 0.

# First recorded timestamp of each user, in the same order as `times`.
base_nums = [user_times[0] for user_times in times]

# Subtract each user's own starting time from all of that user's timestamps.
new_timestamps = [
    [stamp - start for stamp in user_times]
    for start, user_times in zip(base_nums, times)
]

targetid_time_df["SCALED_TIMESTAMP"] = new_timestamps

targetid_time_df.head()

Unnamed: 0,USERID,TARGETID,TIMESTAMP,SCALED_TIMESTAMP
0,0,"[0, 1, 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 2, ...","[0, 6, 41, 49, 51, 55, 59, 62, 65, 113, 226, 9...","[0, 6, 41, 49, 51, 55, 59, 62, 65, 113, 226, 9..."
1,1,"[10, 1, 2, 7, 0, 11, 12, 11, 12, 12, 12, 12, 5...","[7262, 7266, 7273, 7289, 7299, 7319, 7839, 784...","[0, 4, 11, 27, 37, 57, 577, 584, 64802, 65203,..."
2,2,"[1, 10, 3, 13, 8, 7, 3, 13, 4, 15, 5, 16, 5, 6...","[37868, 78761, 78894, 82108, 105180, 105187, 1...","[0, 40893, 41026, 44240, 67312, 67319, 67348, ..."
3,3,"[1, 10, 3, 13, 8, 3]","[37953, 38113, 38126, 38246, 776948, 776953]","[0, 160, 173, 293, 738995, 739000]"
4,4,"[1, 3, 4, 13, 8, 5, 5, 16, 16]","[37969, 38018, 38724, 38725, 38736, 38753, 396...","[0, 49, 755, 756, 767, 784, 1661, 1661, 1913]"


In [6]:
# The raw timestamps are redundant now that SCALED_TIMESTAMP has been added.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone, and a plain drop would raise KeyError.
targetid_time_df = targetid_time_df.drop(columns=['TIMESTAMP'], errors='ignore')

targetid_time_df.head()

Unnamed: 0,USERID,TARGETID,SCALED_TIMESTAMP
0,0,"[0, 1, 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 2, ...","[0, 6, 41, 49, 51, 55, 59, 62, 65, 113, 226, 9..."
1,1,"[10, 1, 2, 7, 0, 11, 12, 11, 12, 12, 12, 12, 5...","[0, 4, 11, 27, 37, 57, 577, 584, 64802, 65203,..."
2,2,"[1, 10, 3, 13, 8, 7, 3, 13, 4, 15, 5, 16, 5, 6...","[0, 40893, 41026, 44240, 67312, 67319, 67348, ..."
3,3,"[1, 10, 3, 13, 8, 3]","[0, 160, 173, 293, 738995, 739000]"
4,4,"[1, 3, 4, 13, 8, 5, 5, 16, 16]","[0, 49, 755, 756, 767, 784, 1661, 1661, 1913]"


In [7]:
# Pick the output file that matches the truncation setting.
file_name = (
    "targetid_and_scaled_time_last100.pkl"
    if only_100_actions
    else "targetid_and_scaled_time_all.pkl"
)

# Pickle protocol 4 allows the file to be loaded on Google Colab
# (per the original author's note).
targetid_time_df.to_pickle(file_name, protocol=4)

# Read the file straight back so the printed frame is what was stored on disk.
targetid_time_df = pd.read_pickle(file_name)
print(targetid_time_df)

      USERID                                           TARGETID  \
0          0  [0, 1, 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 2, ...   
1          1  [10, 1, 2, 7, 0, 11, 12, 11, 12, 12, 12, 12, 5...   
2          2  [1, 10, 3, 13, 8, 7, 3, 13, 4, 15, 5, 16, 5, 6...   
3          3                               [1, 10, 3, 13, 8, 3]   
4          4                     [1, 3, 4, 13, 8, 5, 5, 16, 16]   
...      ...                                                ...   
7042    7042                                   [1, 3, 1, 4, 15]   
7043    7043  [1, 2, 1, 1, 10, 1, 10, 1, 2, 7, 3, 13, 4, 5, ...   
7044    7044                              [1, 2, 2, 1, 4, 4, 4]   
7045    7045               [1, 10, 10, 1, 2, 1, 3, 3, 4, 5, 16]   
7046    7046  [1, 2, 1, 1, 3, 8, 3, 4, 3, 5, 3, 25, 4, 3, 4, 5]   

                                       SCALED_TIMESTAMP  
0     [0, 6, 41, 49, 51, 55, 59, 62, 65, 113, 226, 9...  
1     [0, 4, 11, 27, 37, 57, 577, 584, 64802, 65203,...  
2     [0, 40893, 4102