## Exercise 03 : Aggregations

Import libraries

In [1]:
import pandas as pd
import sqlite3

# Relative path to the SQLite file that sqlite3.connect() opens below.
database_file = "../data/checking-logs.sqlite"

* create a connection to the database using the library sqlite3

In [2]:
# Open the SQLite database; this connection is what pandas receives as `con`.
connection_obj = sqlite3.connect(database_file)
# A raw cursor on the same connection (not used by the cells visible here).
cursor_obj = connection_obj.cursor()

* get the schema of the table test

In [3]:
# PRAGMA table_info returns one row per column of the table `test`.
# Call the public pd.read_sql entry point rather than reaching into the
# pd.io.sql submodule; it is the same function the other cells use.
test_schema = pd.read_sql(sql="PRAGMA table_info(test)", con=connection_obj)
test_schema

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,uid,TEXT,0,,0
2,2,labname,TEXT,0,,0
3,3,first_commit_ts,TIMESTAMP,0,,0
4,4,first_view_ts,TIMESTAMP,0,,0


* get only the first 10 rows of the table test to check what the table looks like

In [4]:
# The task asks for only the first 10 rows, so limit the result in SQL
# instead of loading the whole table and trimming it in pandas.
test_df = pd.read_sql(sql="SELECT * FROM test LIMIT 10", con=connection_obj)
display(test_df)

Unnamed: 0,index,uid,labname,first_commit_ts,first_view_ts
0,0,user_1,laba04,2020-04-26 17:06:18.462708,2020-04-26 21:53:59.624136
1,1,user_1,laba04s,2020-04-26 17:12:11.843671,2020-04-26 21:53:59.624136
2,2,user_1,laba05,2020-05-02 19:15:18.540185,2020-04-26 21:53:59.624136
3,3,user_1,laba06,2020-05-17 16:26:35.268534,2020-04-26 21:53:59.624136
4,4,user_1,laba06s,2020-05-20 12:23:37.289724,2020-04-26 21:53:59.624136
5,5,user_1,project1,2020-05-14 20:56:08.898880,2020-04-26 21:53:59.624136
6,6,user_10,laba04,2020-04-25 08:24:52.696624,2020-04-18 12:19:50.182714
7,7,user_10,laba04s,2020-04-25 08:37:54.604222,2020-04-18 12:19:50.182714
8,8,user_10,laba05,2020-05-01 19:27:26.063245,2020-04-18 12:19:50.182714
9,9,user_10,laba06,2020-05-19 11:39:28.885637,2020-04-18 12:19:50.182714


* find among all the users the minimum value of the delta between the first commit
of the user and the deadline of the corresponding lab using only one query
  * do this by joining the table with the table deadlines
  * the difference should be displayed in hours
  * do not take the lab 'project1' into account: it has longer deadlines and would be
  an outlier
  * the value should be stored in the dataframe df_min with the corresponding
  uid

* do the same thing, but for the maximum, using only one query, the dataframe name is df_max

* do the same thing but for the average, using only one query, this time your dataframe should not include the uid column, and the dataframe name is df_avg

In [5]:
def get_aggregated_df(
    agg_func: str,
    col_name: str,
    con: sqlite3.Connection,
    include_uid_labname: bool = True,
) -> pd.DataFrame:
    """Aggregate the first-commit-to-deadline delta over all labs but 'project1'.

    Joins ``test`` with ``deadlines`` on the lab name, computes
    ``(strftime('%s', first_commit_ts) - deadlines) / 3600`` for every row and
    applies ``agg_func`` to it in a single query.

    Args:
        agg_func: Name of the SQL aggregate function to apply (e.g. ``"MIN"``,
            ``"MAX"``, ``"AVG"``). Must be a plain identifier.
        col_name: Alias of the aggregated column in the result. Must be a
            plain identifier.
        con: Open database connection passed on to ``pd.read_sql``.
        include_uid_labname: When true, ``t.uid`` and ``t.labname`` are selected
            next to the aggregate. There is no GROUP BY, so these are bare
            columns: per the SQLite documentation they come from the row that
            holds the MIN/MAX value, and from an arbitrary row for other
            aggregates.

    Returns:
        A DataFrame with the optional ``uid``/``labname`` columns followed by
        ``col_name``.

    Raises:
        ValueError: If ``agg_func`` or ``col_name`` is not a plain identifier.

    Note:
        The delta is divided by the integer 3600; assuming ``deadlines`` holds
        integer Unix timestamps (TODO confirm against the table schema), SQLite
        performs integer division, so hours are truncated before aggregation.
    """
    # Both values are spliced into the SQL text (identifiers cannot be bound
    # as query parameters), so refuse anything that is not a bare identifier
    # instead of sending malformed or injected SQL to the database.
    for label, value in (("agg_func", agg_func), ("col_name", col_name)):
        if not isinstance(value, str) or not value.isidentifier():
            raise ValueError(f"{label} must be a plain identifier, got {value!r}")

    select_fields = ["t.uid", "t.labname"] if include_uid_labname else []
    select_fields.append(
        f"{agg_func}((strftime('%s', t.first_commit_ts) - d.deadlines) / 3600) AS {col_name}"
    )

    select_clause = ", ".join(select_fields)
    query = f"""
        SELECT {select_clause}
        FROM test t
        JOIN deadlines d ON t.labname = d.labs
        WHERE t.labname <> 'project1'
    """

    return pd.read_sql(sql=query, con=con)


# Minimum and maximum keep uid/labname (the helper's default); the average
# is requested without them, so it is a single value.
df_min = get_aggregated_df("MIN", "min_delta_hours", connection_obj)
df_max = get_aggregated_df("MAX", "max_delta_hours", connection_obj)
df_avg = get_aggregated_df(
    "AVG", "avg_delta_hours", connection_obj, include_uid_labname=False
)

dfs = dict(df_min=df_min, df_max=df_max, df_avg=df_avg)

# Show each frame preceded by the name of the variable that holds it.
for name, df in dfs.items():
    print(name)
    display(df)

df_min


Unnamed: 0,uid,labname,min_delta_hours
0,user_30,laba04,-202


df_max


Unnamed: 0,uid,labname,max_delta_hours
0,user_25,laba04s,-2


df_avg


Unnamed: 0,avg_delta_hours
0,-89.125


* we want to test the hypothesis that the users who visited the newsfeed just a few
times have a lower delta between the first commit and the deadline. To do this,
you need to calculate the correlation coefficient between the number of pageviews
and the difference
  * using only one query, create a table with the columns: uid, avg_diff, pageviews
  * uid contains the uids that exist in the table test
  * avg_diff is the average delta between the first commit and the lab deadline per user
  * pageviews is the number of Newsfeed visits per user
  * do not take the lab 'project1' into account
  * store it to the dataframe views_diff
  * use the Pandas method corr() to calculate the correlation coefficient between
the number of pageviews and the difference

In [6]:
# One query producing, per uid in `test`: the average delta between the first
# commit and the lab deadline ('project1' excluded) and the number of rows that
# uid has in the `pageviews` table.
# The plain JOIN on user_views keeps only users that have at least one
# pageviews row.
# NOTE(review): avg_diff is left in seconds (assuming `deadlines` stores Unix
# timestamps — confirm); the unit does not affect the correlation coefficient.
query = """
    WITH user_views AS (
        SELECT uid, COUNT(*) AS pageviews
        FROM pageviews
        GROUP BY uid
    )
    SELECT
        t.uid,
        AVG(strftime('%s', t.first_commit_ts) - d.deadlines) AS avg_diff, --UNIX timestamp delta in seconds
        uv.pageviews
        -- (SELECT COUNT(*) FROM pageviews p WHERE p.uid = t.uid) AS pageviews - example of subquery if used instead of WITH
    FROM test t
    JOIN deadlines d ON d.labs = t.labname
    JOIN user_views uv ON t.uid = uv.uid    
    WHERE
        t.labname <> 'project1'
    GROUP BY t.uid
"""

# uid becomes the index, so corr() is computed over avg_diff and pageviews only.
views_diff = pd.read_sql(sql=query, con=connection_obj, index_col="uid")
views_diff.corr()

Unnamed: 0,avg_diff,pageviews
avg_diff,1.0,-0.279143
pageviews,-0.279143,1.0


* close the connection

In [7]:
# Release the SQLite connection; any query cell run after this point will fail.
connection_obj.close()