In [1]:
import pandas as pd
import sqlite3

In [2]:
# Open the course activity database and inspect the schema of the `test` table.
DB_name = '../data/checking-logs.sqlite'
# Open through a read-only URI: a plain sqlite3.connect(path) silently creates an
# empty database when the path is wrong, which only surfaces later as
# "no such table". mode=ro raises immediately instead, and also protects the
# source data from accidental writes (every query in this notebook is a SELECT).
conn = sqlite3.connect(f"file:{DB_name}?mode=ro", uri=True)
# PRAGMA table_info yields one row per column of the table
# (cid, name, type, notnull, dflt_value, pk).
schema_df = pd.read_sql("PRAGMA table_info(test)", conn)
schema_df


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,uid,TEXT,0,,0
1,1,labname,TEXT,0,,0
2,2,first_commit_ts,TIMESTAMP,0,,0
3,3,first_view_ts,TIMESTAMP,0,,0


In [3]:
# Load every row of the `pageviews` table and preview the first ten.
df_test = pd.read_sql(sql="SELECT * FROM pageviews", con=conn)
df_test.head(n=10)

Unnamed: 0,index,uid,datetime
0,0,admin_1,2020-04-17 12:01:08.463179
1,1,admin_1,2020-04-17 12:01:23.743946
2,2,admin_3,2020-04-17 12:17:39.287778
3,3,admin_3,2020-04-17 12:17:40.001768
4,4,admin_1,2020-04-17 12:27:30.646665
5,5,admin_1,2020-04-17 12:35:44.884757
6,6,admin_1,2020-04-17 12:35:52.735016
7,7,admin_3,2020-04-17 12:36:21.401412
8,8,admin_3,2020-04-17 12:36:22.023355
9,9,admin_1,2020-04-17 13:55:19.129243


In [4]:

# Per-user minimum of (first commit - lab deadline), expressed in hours.
# Negative values mean the commit happened before the deadline, so the minimum
# is the commit made furthest ahead of its deadline.
#
# `deadlines.deadlines` is read as a Unix timestamp, hence the 'unixepoch' modifier.
# julianday(...) converts a datetime to a fractional day count measured from a
# fixed point in time, so the difference of two julianday values is in days;
# multiplying by 24 converts it to hours.
#
# The inner SELECT keeps one row per (uid, labname) with the earliest
# first_commit_ts and excludes the 'project1' lab.

query =  """

SELECT 
    test.uid,
    MIN((julianday(test.first_commit_ts) - julianday(deadlines.deadlines, 'unixepoch')) * 24) AS min_delta_hours
FROM
    (SELECT
        uid,
        labname,
        MIN(first_commit_ts) AS first_commit_ts
    FROM
        test
    WHERE
        labname != 'project1'
    GROUP BY
        uid, labname
) test
JOIN
    deadlines on test.labname = deadlines.labs
GROUP BY
    test.uid;
    
"""
# NOTE: the query uses an inner JOIN, so rows whose labname has no match in
# `deadlines` are dropped. A LEFT JOIN would be required to keep them.

df_min = pd.read_sql(query,conn)
df_min

Unnamed: 0,uid,min_delta_hours
0,user_1,-175.556592
1,user_10,-132.341698
2,user_14,-200.766302
3,user_17,-81.591403
4,user_18,-10.973375
5,user_19,-148.916028
6,user_21,-126.199587
7,user_25,-150.869725
8,user_28,-174.852984
9,user_3,-182.055144


In [5]:
# Per-user maximum of (first commit - lab deadline), expressed in hours.
# Negative values mean the commit happened before the deadline, so the maximum
# is the commit made closest to (or furthest past) its deadline.
# The inner SELECT keeps one row per (uid, labname) with the latest
# first_commit_ts and excludes the 'project1' lab; the inner JOIN drops any
# labname that has no row in `deadlines`.
query = """
SELECT 
    test.uid,
    MAX((julianday(test.first_commit_ts) - julianday(deadlines.deadlines, 'unixepoch')) * 24) AS max_delta_hours
FROM (
        SELECT
            uid,
            labname,
            MAX(first_commit_ts) AS first_commit_ts
        FROM
            test
        WHERE 
            labname != 'project1'
        GROUP BY
            uid, labname
) test
JOIN
    deadlines on test.labname = deadlines.labs
GROUP BY
    test.uid

"""
df_max = pd.read_sql(query,conn)
df_max

Unnamed: 0,uid,max_delta_hours
0,user_1,-6.796432
1,user_10,-39.367888
2,user_14,-84.448466
3,user_17,-34.643043
4,user_18,-3.933907
5,user_19,-32.729282
6,user_21,-33.905274
7,user_25,-2.867236
8,user_28,-8.103915
9,user_3,-60.511392


In [6]:
# Overall average of (first commit - lab deadline) in hours, across every
# (uid, labname) pair that has a matching deadline. There is no outer GROUP BY,
# so the result is a single row. 'project1' is excluded.
# NOTE(review): the subquery takes MAX(first_commit_ts) here, while the per-user
# average query that also joins `pageviews` takes MIN(first_commit_ts).
# Confirm which one is intended; they only agree if `test` holds a single row
# per (uid, labname).
query_avg = """
SELECT 
    AVG((julianday(test.first_commit_ts) - julianday(deadlines.deadlines, 'unixepoch')) * 24) AS avg_delta_hours
FROM (
        SELECT
            uid,
            labname,
            MAX(first_commit_ts) AS first_commit_ts
        FROM
            test
        WHERE 
            labname != 'project1'
        GROUP BY
            uid, labname
) test
JOIN
    deadlines ON test.labname = deadlines.labs
"""
df_avg = pd.read_sql(query_avg, conn)
df_avg


Unnamed: 0,avg_delta_hours
0,-89.687686


In [7]:
# One row per user: the average (first commit - lab deadline) in hours over that
# user's labs ('project1' excluded), next to the user's total page-view count.
#
# - The first subquery reduces `test` to one row per (uid, labname).
# - The second subquery counts rows of the `pageviews` table per uid; its alias
#   `pageviews` shadows the table name in the outer query.
# - Both joins are inner joins, so only users present in `test`, `deadlines`
#   (via labname) and `pageviews` appear in the result.
# - uid is grouped on but not selected, so the result contains only the two
#   numeric columns avg_diff and pageviews.
query = """
SELECT 
    AVG((julianday(test.first_commit_ts) - julianday(deadlines.deadlines, 'unixepoch')) * 24) AS avg_diff,
    pageviews.pageviews
FROM (
    SELECT 
        uid, 
        labname, 
        MIN(first_commit_ts) AS first_commit_ts
    FROM test
    WHERE labname != 'project1'
    GROUP BY uid, labname
) AS test
JOIN deadlines ON test.labname = deadlines.labs
JOIN (
    SELECT 
        uid, 
        COUNT(*) AS pageviews
    FROM pageviews
    GROUP BY uid
) AS pageviews ON test.uid = pageviews.uid
GROUP BY test.uid
"""

views_diff = pd.read_sql(query, conn)
views_diff


Unnamed: 0,avg_diff,pageviews
0,-65.119644,28
1,-75.24231,89
2,-159.568696,143
3,-62.207513,47
4,-6.367907,3
5,-99.440298,16
6,-96.111041,10
7,-93.474751,179
8,-86.793652,149
9,-105.738041,317


In [8]:
# Pairwise Pearson correlation matrix of the columns in views_diff.
views_diff.corr(method="pearson")

Unnamed: 0,avg_diff,pageviews
avg_diff,1.0,-0.279143
pageviews,-0.279143,1.0


In [9]:
# Pearson correlation between page-view count and average commit-to-deadline
# delta, computed Series-to-Series so the result is a single scalar.
correlation = views_diff["pageviews"].corr(views_diff["avg_diff"], method="pearson")
print("correlation: ", correlation)

correlation:  -0.27914309109251906


In [10]:
# Release the SQLite connection now that all queries have been run.
conn.close()