In [1]:
# Execute the shared setup notebook; the output below reports the database connection.
# NOTE(review): `import display_table` is handed to %run as extra arguments — presumably the helper uses them to expose display_table; confirm.
%run helper/setup_notebook.ipynb import display_table

Successfully connected to leetcode50 database.


In [2]:
# Show the rows of the Activity table (display_table is presumably provided by the setup notebook — confirm).
display_table('Activity')

+------------+------------+---------------+-----------+
| machine_id | process_id | activity_type | timestamp |
+------------+------------+---------------+-----------+
|     0      |     0      |     start     |   0.712   |
|     0      |     0      |      end      |    1.52   |
|     0      |     1      |     start     |    3.14   |
|     0      |     1      |      end      |    4.12   |
|     1      |     0      |     start     |    0.55   |
|     1      |     0      |      end      |    1.55   |
|     1      |     1      |     start     |    0.43   |
|     1      |     1      |      end      |    1.42   |
|     2      |     0      |     start     |    4.1    |
|     2      |     0      |      end      |   4.512   |
|     2      |     1      |     start     |    2.5    |
|     2      |     1      |      end      |    5.0    |
+------------+------------+---------------+-----------+


#### There is a factory website that has several machines each running the same number of processes. Write an SQL query to find the average time each machine takes to complete a process.

- *The time to complete a process is the `'end' timestamp` minus the `'start' timestamp`. The average time is calculated as the total time to complete every process on the machine divided by the number of processes that were run.*

- *The resulting table should have the `machine_id` along with the average time as `processing_time`, which should be rounded to 3 decimal places.*
```
+------------+-----------------+
| machine_id | processing_time |
+------------+-----------------+
| 0          | 0.894           |
| 1          | 0.995           |
| 2          | 1.456           |
+------------+-----------------+
Explanation: 
There are 3 machines running 2 processes each.
Machine 0's average time is ((1.520 - 0.712) + (4.120 - 3.140)) / 2 = 0.894
Machine 1's average time is ((1.550 - 0.550) + (1.420 - 0.430)) / 2 = 0.995
Machine 2's average time is ((4.512 - 4.100) + (5.000 - 2.500)) / 2 = 1.456
```

# Using JOIN

In [3]:
%%sql

-- Exploratory self-join: every row is paired with every row of the same
-- machine and process. Without a role filter each process yields 4 pairs
-- (start/start, start/end, end/start, end/end).
SELECT
    start.machine_id,
    start.timestamp,
    end.timestamp
FROM activity AS start
INNER JOIN activity AS end
    ON  end.machine_id = start.machine_id
    AND end.process_id = start.process_id

machine_id,timestamp,timestamp_1
0,1.52,0.712
0,0.712,0.712
0,1.52,1.52
0,0.712,1.52
0,4.12,3.14
0,3.14,3.14
0,4.12,4.12
0,3.14,4.12
1,1.55,0.55
1,0.55,0.55


In [4]:
%%sql

-- Keep exactly one (start row, end row) pair per process by matching on
-- activity_type. Filtering with end.timestamp > start.timestamp instead
-- would silently drop any process whose start and end timestamps are equal.
SELECT
    start.machine_id,
    start.timestamp,
    end.timestamp
FROM activity start
JOIN activity end
    ON start.machine_id = end.machine_id
    AND start.process_id = end.process_id
    AND start.activity_type = 'start'
    AND end.activity_type = 'end'

machine_id,timestamp,timestamp_1
0,0.712,1.52
0,3.14,4.12
1,0.55,1.55
1,0.43,1.42
2,4.1,4.512
2,2.5,5.0


In [5]:
%%sql
-- Average duration per machine: pair each process's start row with its end
-- row (matched on activity_type, so zero-length processes are still counted)
-- and average the differences.
SELECT
    start.machine_id,
    AVG(end.timestamp - start.timestamp)
FROM activity start
JOIN activity end
    ON start.machine_id = end.machine_id
    AND start.process_id = end.process_id
    AND start.activity_type = 'start'
    AND end.activity_type = 'end'
GROUP BY start.machine_id

machine_id,AVG(end.timestamp - start.timestamp)
0,0.8939998745918274
1,0.9949999451637268
2,1.4560000896453855


In [6]:
%%sql
-- Final JOIN answer: per-machine average of (end - start), rounded to
-- 3 decimals. Rows are paired on activity_type rather than on timestamp
-- order, so a process with equal start and end timestamps is not lost.
SELECT
    start.machine_id,
    ROUND(AVG(end.timestamp - start.timestamp), 3) AS processing_time
FROM activity start
JOIN activity end
    ON start.machine_id = end.machine_id
    AND start.process_id = end.process_id
    AND start.activity_type = 'start'
    AND end.activity_type = 'end'
GROUP BY start.machine_id

machine_id,processing_time
0,0.894
1,0.995
2,1.456


# Using CTE

In [7]:
%%sql

-- Per-machine mean of the 'end' timestamps.
SELECT
    machine_id,
    AVG(timestamp) AS avg_total
FROM activity
WHERE activity_type = 'end'
GROUP BY machine_id

machine_id,avg_total
0,2.819999933242798
1,1.48499995470047
2,4.75600004196167


In [8]:
%%sql

-- Per-machine mean of the 'start' timestamps.
SELECT
    machine_id,
    AVG(timestamp) AS avg_total
FROM activity
WHERE activity_type = 'start'
GROUP BY machine_id

machine_id,avg_total
0,1.9260000586509705
1,0.4900000095367431
2,3.299999952316284


In [9]:
%%sql

-- mean(end) - mean(start) per machine. This equals the mean duration only
-- when every process on a machine has exactly one start and one end row.
-- The two CTEs already hold one row per machine, so they are joined to each
-- other directly; joining back to activity and re-grouping is unnecessary.
WITH end_avg_table AS (
    SELECT
        machine_id,
        AVG(timestamp) AS end_avg
    FROM activity
    WHERE activity_type = 'end'
    GROUP BY machine_id
),
start_avg_table AS (
    SELECT
        machine_id,
        AVG(timestamp) AS start_avg
    FROM activity
    WHERE activity_type = 'start'
    GROUP BY machine_id
)
SELECT
    start.machine_id,
    ROUND(end.end_avg - start.start_avg, 3) AS processing_time
FROM start_avg_table start
JOIN end_avg_table end ON end.machine_id = start.machine_id

machine_id,processing_time
0,0.894
1,0.995
2,1.456


In [10]:
%%sql

-- (sum of end timestamps - sum of start timestamps) / number of processes.
-- The process count is taken inside the start CTE, so the final SELECT only
-- joins two one-row-per-machine results instead of re-joining every row of
-- activity and grouping again.
WITH end_sum_table AS (
    SELECT
        machine_id,
        SUM(timestamp) AS end_total
    FROM activity
    WHERE activity_type = 'end'
    GROUP BY machine_id
),
start_sum_table AS (
    SELECT
        machine_id,
        SUM(timestamp) AS start_total,
        COUNT(DISTINCT process_id) AS process_count
    FROM activity
    WHERE activity_type = 'start'
    GROUP BY machine_id
)
SELECT
    start.machine_id,
    ROUND((end.end_total - start.start_total) / start.process_count, 3) AS processing_time
FROM start_sum_table start
JOIN end_sum_table end ON end.machine_id = start.machine_id

machine_id,processing_time
0,0.894
1,0.995
2,1.456


# Using Pandas

In [11]:
import pandas as pd 

In [12]:
# Fetch the whole activity table through the %sql line magic and convert the result set to a DataFrame.
# NOTE(review): the text after %sql, including `# type: ignore`, is presumably sent as part of the statement
# and only works if the database treats `#` as a comment — confirm.
activity_query = %sql SELECT * FROM activity # type: ignore
activity_df = activity_query.DataFrame()
activity_df

Unnamed: 0,machine_id,process_id,activity_type,timestamp
0,0,0,start,0.712
1,0,0,end,1.52
2,0,1,start,3.14
3,0,1,end,4.12
4,1,0,start,0.55
5,1,0,end,1.55
6,1,1,start,0.43
7,1,1,end,1.42
8,2,0,start,4.1
9,2,0,end,4.512


## filter() 
### ***messy option***

In [13]:
# Rows that record a process start, selected with a plain boolean mask.
activity_df[activity_df['activity_type'].eq('start')]

Unnamed: 0,machine_id,process_id,activity_type,timestamp
0,0,0,start,0.712
2,0,1,start,3.14
4,1,0,start,0.55
6,1,1,start,0.43
8,2,0,start,4.1
10,2,1,start,2.5


In [14]:
# sum() is applied to every remaining column, so the string column
# activity_type is concatenated ('startstart') rather than skipped.
# mean() on the same frame would result in an error because of those strings.
(
    activity_df[activity_df['activity_type'] == 'start']
    .groupby('machine_id')
    .sum()
)

Unnamed: 0_level_0,process_id,activity_type,timestamp
machine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,startstart,3.852
1,1,startstart,0.98
2,1,startstart,6.6


In [15]:
# Keep only machine_id and timestamp so mean() sees numeric columns only;
# the row mask is still built from the full frame's activity_type column.
(
    activity_df
    .filter(items=['machine_id', 'timestamp'])
    .loc[activity_df['activity_type'] == 'start']
    .groupby('machine_id')
    .mean()
)

Unnamed: 0_level_0,timestamp
machine_id,Unnamed: 1_level_1
0,1.926
1,0.49
2,3.3


In [16]:
# Mean 'end' timestamp minus mean 'start' timestamp per machine.
# There are much cleaner ways to write this, but it works too.
(
    activity_df
    .filter(items=['machine_id', 'timestamp'])
    .loc[activity_df['activity_type'] == 'end']
    .groupby('machine_id')
    .mean()
) - (
    activity_df
    .filter(items=['machine_id', 'timestamp'])
    .loc[activity_df['activity_type'] == 'start']
    .groupby('machine_id')
    .mean()
)

Unnamed: 0_level_0,timestamp
machine_id,Unnamed: 1_level_1
0,0.894
1,0.995
2,1.456


## query()

### ***better option***

In [17]:
# groupby() alone only builds the grouping; nothing is aggregated yet,
# so the cell displays a DataFrameGroupBy object.
(
    activity_df
    .query("activity_type == 'start'")
    .groupby('machine_id')
)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x10be80f50>

In [18]:
# Mean 'start' timestamp per machine, as a one-column DataFrame.
start = (
    activity_df
    .query("activity_type == 'start'")
    .groupby('machine_id')
    .agg({'timestamp': 'mean'})
)
start

Unnamed: 0_level_0,timestamp
machine_id,Unnamed: 1_level_1
0,1.926
1,0.49
2,3.3


In [19]:
# Mean 'end' timestamp per machine, as a one-column DataFrame.
end = (
    activity_df
    .query("activity_type == 'end'")
    .groupby('machine_id')
    .agg({'timestamp': 'mean'})
)
end

Unnamed: 0_level_0,timestamp
machine_id,Unnamed: 1_level_1
0,2.82
1,1.485
2,4.756


In [20]:
# Both frames are indexed by machine_id, so the subtraction lines up per machine.
end.sub(start)

Unnamed: 0_level_0,timestamp
machine_id,Unnamed: 1_level_1
0,0.894
1,0.995
2,1.456


## pivot_table()

### ***best option***

Use the `pivot_table()` function to reshape the data:
- *Set the `machine_id` column as the index,* 
- *`activity_type` column as the columns,* 
- *`timestamp` column as the values* 
- *Specify the aggregation function `aggfunc='mean'` to calculate the mean timestamp for each combination of 'machine_id' and 'activity_type'.*

In [21]:
# One row per machine, one column per activity_type, each cell the mean timestamp.
pd.pivot_table(
    activity_df,
    values='timestamp',
    index='machine_id',
    columns='activity_type',
    aggfunc='mean',
)

activity_type,end,start
machine_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2.82,1.926
1,1.485,0.49
2,4.756,3.3


In [24]:
# Pivot to mean 'end' / 'start' timestamps per machine, then derive
# processing_time as their difference in the same chain.
pivot_df = (
    activity_df
    .pivot_table(
        index='machine_id',
        columns='activity_type',
        values='timestamp',
        aggfunc='mean',
    )
    .assign(processing_time=lambda means: means['end'] - means['start'])
)
pivot_df

activity_type,end,start,processing_time
machine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2.82,1.926,0.894
1,1.485,0.49,0.995
2,4.756,3.3,1.456


In [25]:
# Keep only the derived column; the list selector keeps the result a DataFrame.
pivot_df.loc[:, ['processing_time']]

activity_type,processing_time
machine_id,Unnamed: 1_level_1
0,0.894
1,0.995
2,1.456
