## Импорт библиотек

In [1]:
import pandas as pd
import sqlite3

## Соединение с базой данных и схема таблицы

In [2]:
# Open the SQLite database file used by every later cell.
connection = sqlite3.connect('../data/checking-logs.sqlite')

# Ask SQLite for the column layout of the `test` table and print it as text.
query = "PRAGMA table_info(test);"
scheme = pd.read_sql(sql=query, con=connection)
print(scheme)

# Load the first ten rows of `test`; the bare name on the last line makes
# the notebook render the frame as the cell output.
first10_test = pd.read_sql(sql="SELECT * FROM test LIMIT 10", con=connection)
first10_test

   cid             name       type  notnull dflt_value  pk
0    0              uid       TEXT        0       None   0
1    1          labname       TEXT        0       None   0
2    2  first_commit_ts  TIMESTAMP        0       None   0
3    3    first_view_ts  TIMESTAMP        0       None   0


Unnamed: 0,uid,labname,first_commit_ts,first_view_ts
0,user_1,laba04,2020-04-26 17:06:18.462708,2020-04-26 21:53:59.624136
1,user_10,laba04,2020-04-25 08:24:52.696624,2020-04-18 12:19:50.182714
2,user_14,laba04,2020-04-18 15:14:00.312338,2020-04-18 10:53:52.623447
3,user_17,laba04,2020-04-23 14:24:29.947554,2020-04-18 10:56:55.833899
4,user_18,laba04,2020-04-26 19:48:11.822365,2020-04-26 22:49:29.243278
5,user_19,laba04,2020-04-20 19:05:01.297780,2020-04-21 20:30:38.034966
6,user_21,laba04,2020-04-21 17:48:00.487806,2020-04-22 22:40:36.824081
7,user_25,laba04,2020-04-20 19:16:50.673054,2020-05-09 23:54:54.260791
8,user_28,laba04,2020-04-22 21:47:19.707242,2020-05-10 21:07:50.350946
9,user_3,laba04,2020-04-23 20:29:14.054364,2020-05-08 10:53:47.123832


## Минимальное значение дельты между первым коммитом и дедлайном

In [3]:
# Smallest delta between first commit and lab deadline.
#   * inner `ch`: uid / labname / first_commit_ts from `test`, labs matching 'laba%'
#   * inner `dl`: deadlines converted from Unix epoch via DATETIME(..., 'unixepoch')
#   * JULIANDAY difference is in days, so * 24 gives hours; ROUND makes it whole hours
#   * ascending ORDER BY + LIMIT 1 keeps only the row with the lowest hours_diff
# The leading padding inside each fragment is kept on purpose so the SQL text
# sent to SQLite is exactly the same as before.
min_delta_query = (
    "SELECT ch.uid, ROUND((JULIANDAY(first_commit) - JULIANDAY(deadline_date)) * 24) AS hours_diff "
    "    FROM (SELECT uid, labname, first_commit_ts AS first_commit FROM test WHERE labname LIKE 'laba%' GROUP BY uid, labname) AS ch "
    "        JOIN (SELECT labs, DATETIME(deadlines, 'unixepoch') AS deadline_date FROM deadlines) AS dl ON ch.labname = dl.labs "
    "            GROUP BY uid, labname "
    "                ORDER BY hours_diff "
    "                    LIMIT 1"
)
df_min = pd.read_sql(sql=min_delta_query, con=connection)
df_min

Unnamed: 0,uid,hours_diff
0,user_30,-202.0


## Максимальное значение дельты между первым коммитом и дедлайном

In [4]:
# Largest delta between first commit and lab deadline.
#   * inner `ch`: uid / labname / first_commit_ts from `test`, labs matching 'laba%'
#   * inner `dl`: deadlines converted from Unix epoch via DATETIME(..., 'unixepoch')
#   * JULIANDAY difference is in days, so * 24 gives hours; ROUND makes it whole hours
#   * descending ORDER BY + LIMIT 1 keeps only the row with the highest hours_diff
# The leading padding inside each fragment is kept on purpose so the SQL text
# sent to SQLite is exactly the same as before.
max_delta_query = (
    "SELECT ch.uid, ROUND((JULIANDAY(first_commit) - JULIANDAY(deadline_date)) * 24) AS hours_diff "
    "    FROM (SELECT uid, labname, first_commit_ts AS first_commit FROM test WHERE labname LIKE 'laba%' GROUP BY uid, labname) AS ch "
    "        JOIN (SELECT labs, DATETIME(deadlines, 'unixepoch') AS deadline_date FROM deadlines) AS dl ON ch.labname = dl.labs "
    "            GROUP BY uid, labname "
    "                ORDER BY hours_diff DESC "
    "                    LIMIT 1"
)
df_max = pd.read_sql(sql=max_delta_query, con=connection)
df_max

Unnamed: 0,uid,hours_diff
0,user_25,-3.0


## Среднее значение дельты между первым коммитом и дедлайном

In [5]:
# Average delta between first commit and lab deadline.
# The subquery produces one rounded hours_diff per (uid, labname) group for
# labs matching 'laba%'; the outer SELECT takes AVG over those values.
# The leading padding inside each fragment is kept on purpose so the SQL text
# sent to SQLite is exactly the same as before.
avg_delta_query = (
    "SELECT AVG(hours_diff) FROM ("
    "    SELECT ch.uid, ROUND((JULIANDAY(first_commit) - JULIANDAY(deadline_date)) * 24) AS hours_diff "
    "    FROM (SELECT uid, labname, first_commit_ts AS first_commit FROM test WHERE labname LIKE 'laba%' GROUP BY uid, labname) AS ch "
    "        JOIN (SELECT labs, DATETIME(deadlines, 'unixepoch') AS deadline_date FROM deadlines) AS dl ON ch.labname = dl.labs "
    "            GROUP BY uid, labname "
    "                )"
)
df_avg = pd.read_sql(sql=avg_delta_query, con=connection)
df_avg

Unnamed: 0,AVG(hours_diff)
0,-89.729167


## Гипотеза

In [6]:
# Build the `views` table (only if it does not exist yet) with one row per user:
#   * avg_diff  - AVG of the rounded hours between first commit and deadline,
#                 over labs matching 'laba%' (subquery `mn`, grouped by uid)
#   * pageviews - COUNT(datetime) from `pageviews` per uid (subquery `cnt`)
# NOTE: in SQL LIKE, `_` is itself a single-character wildcard, so 'user_%'
# matches any uid that starts with "user" followed by at least one character.
# The leading padding inside each fragment is kept on purpose so the SQL text
# sent to SQLite is exactly the same as before.
new_table_query = (
    "CREATE TABLE IF NOT EXISTS views AS SELECT mn.uid, mn.hours_diff AS avg_diff, cnt.counter AS pageviews "
    "    FROM (SELECT ch.uid, AVG(ROUND((JULIANDAY(first_commit) - JULIANDAY(deadline_date)) * 24)) AS hours_diff "
    "        FROM (SELECT uid, labname, first_commit_ts AS first_commit FROM test WHERE labname LIKE 'laba%' GROUP BY uid, labname) AS ch "
    "            JOIN (SELECT labs, DATETIME(deadlines, 'unixepoch') AS deadline_date FROM deadlines) AS dl ON ch.labname = dl.labs GROUP BY uid) AS mn "
    "                JOIN ( SELECT uid, COUNT(datetime) AS counter FROM pageviews WHERE uid LIKE 'user_%' GROUP BY uid) AS cnt ON mn.uid = cnt.uid"
)
connection.execute(new_table_query)

# Read the table back and correlate the two per-user columns.
views_diff = pd.read_sql(sql="SELECT * FROM views", con=connection)
avg_diff_col = views_diff['avg_diff']
pageviews_col = views_diff['pageviews']
correlation = avg_diff_col.corr(pageviews_col)
correlation

-0.27977589529262725

## Закрытие соединения с базой данных

In [7]:
# Release the SQLite connection opened at the start of the notebook.
connection.close()