In [75]:
import pandas as pd
import sqlite3 

In [76]:
# Open the SQLite database file used by every later cell. The path is
# relative, so it is resolved against the kernel's working directory.
connection = sqlite3.connect(
    '../data/checking-logs.sqlite'
)

In [77]:
# Build the datamart: one row per (uid, labname) with the user's first commit
# for that lab and the user's first recorded page view.
#   * first_commits - earliest timestamp among checker rows with
#     status = 'ready' and numTrials = 1, limited to the listed labs.
#   * first_views   - earliest pageviews.datetime per user.
# The LEFT JOIN keeps users that have no pageviews row; their first_view_ts
# comes back as NULL and is parsed to NaT.
# In LIKE, "_" is a single-character wildcard, so it is escaped ('!_' together
# with ESCAPE '!') to require a literal underscore after "user".
query = """
    WITH first_commits AS (
    SELECT 
        uid,
        labname,
        MIN(timestamp) AS first_commit_ts
    FROM checker
    WHERE 
        status = 'ready'
        AND numTrials = 1
        AND labname IN ('laba04', 'laba04s', 'laba05', 'laba06', 'laba06s', 'project1')
        AND uid LIKE 'user!_%' ESCAPE '!'
    GROUP BY uid, labname
    ),
    first_views AS (
        SELECT 
            uid,
            MIN(datetime) AS first_view_ts
        FROM pageviews
        WHERE uid LIKE 'user!_%' ESCAPE '!'
        GROUP BY uid
    )
    SELECT 
        fc.uid,
        fc.labname,
        fc.first_commit_ts,
        fv.first_view_ts
    FROM first_commits fc
    LEFT JOIN first_views fv ON fc.uid = fv.uid
"""
# pd.read_sql is the documented public entry point (pd.io.sql is an internal
# module path). Both timestamp columns are parsed into datetime64 values.
datamart = pd.read_sql(
    query,
    connection,
    parse_dates=['first_commit_ts', 'first_view_ts'],
)
datamart

Unnamed: 0,uid,labname,first_commit_ts,first_view_ts
0,user_1,laba04,2020-04-26 17:06:18.462708,2020-04-26 21:53:59.624136
1,user_1,laba04s,2020-04-26 17:12:11.843671,2020-04-26 21:53:59.624136
2,user_1,laba05,2020-05-02 19:15:18.540185,2020-04-26 21:53:59.624136
3,user_1,laba06,2020-05-17 16:26:35.268534,2020-04-26 21:53:59.624136
4,user_1,laba06s,2020-05-20 12:23:37.289724,2020-04-26 21:53:59.624136
...,...,...,...,...
135,user_8,laba04s,2020-04-19 10:22:35.761944,NaT
136,user_8,laba05,2020-05-02 13:28:07.705193,NaT
137,user_8,laba06,2020-05-16 17:56:15.755553,NaT
138,user_8,laba06s,2020-05-16 20:01:07.900727,NaT


In [78]:
# Split the datamart by whether the user has any recorded page view:
#   test    - rows with a known first_view_ts
#   control - rows where first_view_ts is missing (NaT)
has_view = datamart['first_view_ts'].notna()
test = datamart[has_view]
control = datamart[~has_view]

# Give the control rows a reference moment: the mean first_view_ts of the test
# rows. The fill is restricted to that one column so a missing value in any
# other column is never overwritten with a timestamp.
control = control.fillna({'first_view_ts': test['first_view_ts'].mean()})

In [79]:
# Store each group as its own table in the open database.
# if_exists='replace' overwrites a table left by an earlier run, and
# index=False keeps the DataFrame index out of the stored columns.
test.to_sql(
    name='test',
    con=connection,
    if_exists='replace',
    index=False,
)
control.to_sql(
    name='control',
    con=connection,
    if_exists='replace',
    index=False,
)

81

In [80]:
# Release the database handle once both tables have been written.
connection.close()