In [1]:
import pandas as pd
import numpy as np
from pandasql import  sqldf


def make_select(query: str):
   return sqldf(query)

In [None]:
data = {
    'player_id': [1, 1, 2, 3, 3],
    'device_id': [2, 2, 3, 1, 4],
    'event_date': ['2016-03-01', '2016-05-02', '2017-06-25', '2016-03-02', '2018-07-03'],
    'games_played': [5, 6, 1, 0, 5]
}

# Converting to a pandas DataFrame
activity_df = pd.DataFrame(data)

# Converting 'event_date' column to datetime
activity_df['event_date'] = pd.to_datetime(activity_df['event_date'])

query = """
SELECT DISTINCT player_id, device_id FROM 
(SELECT 
player_id,device_id, MIN(event_date) AS min_date_login
FROM activity_df
GROUP BY player_id) AS sub_query
"""

make_select(query)

Unnamed: 0,player_id,device_id
0,1,2
1,2,3
2,3,1


In [20]:
sales_data = {
    'sale_id': [1, 2, 7],
    'product_id': [100, 100, 200],
    'year': [2008, 2009, 2011],
    'quantity': [10, 12, 15],
    'price': [5000, 5000, 9000]
}

# Creating the Product DataFrame
product_data = {
    'product_id': [100, 200, 300],
    'product_name': ['Nokia', 'Apple', 'Samsung']
}

# Converting to pandas DataFrames
sales_df = pd.DataFrame(sales_data)
product_df = pd.DataFrame(product_data)


query = """  
SELECT
s.product_id, 
MIN(year) AS first_year_sold,
s.quantity,
s.price
FROM sales_df s INNER JOIN product_df p ON s.product_id = p.product_id
GROUP BY s.product_id
"""

make_select(query)

Unnamed: 0,product_id,first_year_sold,quantity,price
0,100,2008,10,5000
1,200,2011,15,9000


In [25]:
project_data = {
    'project_id': [1, 1, 1, 2, 2],
    'employee_id': [1, 2, 3, 1, 4]
}

# Creating the Employee DataFrame
employee_data = {
    'employee_id': [1, 2, 3, 4],
    'name': ['Khaled', 'Ali', 'John', 'Doe'],
    'experience_years': [3, 2, 1, 2]
}

# Converting to pandas DataFrames
project_df = pd.DataFrame(project_data)
employee_df = pd.DataFrame(employee_data)

query = """  
SELECT project_id FROM (SELECT
p.project_id, 
COUNT(e.employee_id) AS total_employees
FROM project_df p INNER JOIN employee_df e ON p.employee_id = e.employee_id
GROUP BY p.project_id
ORDER BY total_employees DESC
LIMIT 1) AS sub_query
"""

make_select(query)

Unnamed: 0,project_id
0,1


In [34]:
project_data = {
    'project_id': [1, 1, 1, 2, 2],
    'employee_id': [1, 2, 3, 1, 4]
}

# Creating the Employee DataFrame
employee_data = {
    'employee_id': [1, 2, 3, 4],
    'name': ['Khaled', 'Ali', 'John', 'Doe'],
    'experience_years': [3, 2, 3, 2]
}

# Converting to pandas DataFrames
project_df = pd.DataFrame(project_data)
employee_df = pd.DataFrame(employee_data)

query = """  
SELECT 
project_id, employee_id
FROM 
(SELECT
*,
DENSE_RANK() OVER(PARTITION BY project_id ORDER BY e.experience_years DESC) AS ranking
FROM project_df p
INNER JOIN employee_df e 
ON p.employee_id = e.employee_id) AS sub_query
WHERE ranking = 1

"""

make_select(query)

Unnamed: 0,project_id,employee_id
0,1,1
1,1,3
2,2,1


In [44]:
actions_data = {
    'user_id': [1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5],
    'post_id': [1, 1, 1, 4, 4, 4, 4, 3, 3, 2, 2, 5],
    'action_date': [
        '2019-07-01', '2019-07-01', '2019-07-01',
        '2019-07-04', '2019-07-04', '2019-07-04', '2019-07-04',
        '2019-07-02', '2019-07-02', '2019-07-04', '2019-07-04', '2019-07-04'
    ],
    'action': ['view', 'like', 'share', 'view', 'report', 'view', 'report', 'view', 'report', 'view', 'report', 'view'],
    'extra': [None, None, None, None, 'spam', None, 'spam', None, 'spam', None, 'racism', 'racism']
}

# Converting to pandas DataFrame
actions_df = pd.DataFrame(actions_data)

# Converting 'action_date' column to datetime
actions_df['action_date'] = pd.to_datetime(actions_df['action_date'])

query = """  
SELECT 
	extra,
    COUNT(post_id) AS report_count
FROM actions_df
WHERE julianday('2019-07-05') - julianday(action_date) = 1
GROUP BY extra

"""

make_select(query)

Unnamed: 0,extra,report_count
0,,3
1,racism,2
2,spam,2


In [78]:
data = [['Dog', 'Golden Retriever', 1, 5], ['Dog', 'German Shepherd', 2, 5], ['Dog', 'Mule', 200, 1], ['Cat', 'Shirazi', 5, 2], ['Cat', 'Siamese', 3, 3], ['Cat', 'Sphynx', 7, 4]]
queries = pd.DataFrame(data, columns=['query_name', 'result', 'position', 'rating']).astype({'query_name':'object', 'result':'object', 'position':'Int64', 'rating':'Int64'})


query = """  
SELECT 
  query_name,
  ROUND(SUM(rating *1.0 /position) /COUNT(query_name) , 2)  AS quality,
  ROUND(COUNT(DISTINCT CASE WHEN rating < 3  THEN query_name END)  *1.0 / COUNT(query_name) * 100, 2) AS poor_query_percentage
FROM queries
WHERE query_name IS NOT NULL
GROUP BY query_namE

"""


make_select(query)

Unnamed: 0,query_name,quality,poor_query_percentage
0,Cat,0.66,33.33
1,Dog,2.5,33.33


In [None]:
activity_data = {
    'user_id': [1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4],
    'session_id': [1, 1, 1, 4, 4, 4, 2, 2, 2, 5, 5, 5, 3, 3],
    'activity_date': [
        '2019-07-20', '2019-07-20', '2019-07-20',
        '2019-07-20', '2019-07-21', '2019-07-21',
        '2019-07-21', '2019-07-21', '2019-07-21',
        '2019-07-21', '2019-07-21', '2019-07-21',
        '2019-06-25', '2019-06-25'
    ],
    'activity_type': [
        'open_session', 'scroll_down', 'end_session',
        'open_session', 'send_message', 'end_session',
        'open_session', 'send_message', 'end_session',
        'open_session', 'scroll_down', 'end_session',
        'open_session', 'end_session'
    ]
}

# Converting to pandas DataFrame
activity_df = pd.DataFrame(activity_data)

# Converting 'activity_date' column to datetime
activity_df['activity_date'] = pd.to_datetime(activity_df['activity_date'])

query = """  
SELECT  * FROM activity_df
WHERE   2019-07-27
"""

make_select(query)



Unnamed: 0,user_id,session_id,activity_date,activity_type
0,1,1,2019-07-20 00:00:00.000000,open_session
1,1,1,2019-07-20 00:00:00.000000,scroll_down
2,1,1,2019-07-20 00:00:00.000000,end_session
3,2,4,2019-07-20 00:00:00.000000,open_session
4,2,4,2019-07-21 00:00:00.000000,send_message
5,2,4,2019-07-21 00:00:00.000000,end_session
6,3,2,2019-07-21 00:00:00.000000,open_session
7,3,2,2019-07-21 00:00:00.000000,send_message
8,3,2,2019-07-21 00:00:00.000000,end_session
9,3,5,2019-07-21 00:00:00.000000,open_session


In [87]:

delivery = pd.DataFrame(data = {
    'delivery_id': [1, 2, 3, 4, 5, 6],
    'customer_id': [1, 5, 1, 3, 4, 2],
    'order_date': ['2019-08-01', '2019-08-02', '2019-08-11', '2019-08-24', '2019-08-21', '2019-08-11'],
    'customer_pref_delivery_date': ['2019-08-02', '2019-08-02', '2019-08-11', '2019-08-26', '2019-08-22', '2019-08-13']
})


delivery['order_date'] = pd.to_datetime(delivery['order_date'])
delivery['customer_pref_delivery_date'] = pd.to_datetime(delivery['customer_pref_delivery_date'])


query = """
  SELECT
  ROUND(COUNT(CASE WHEN delivery_type = 'immediate' THEN 1 END) * 1.0 /COUNT(*) * 100, 2) AS immediate_percentage
  FROM ( SELECT 
     *,
     CASE WHEN order_date = customer_pref_delivery_date THEN 'immediate' 
     ELSE 'scheduled' 
     END AS delivery_type
   FROM delivery) AS sub_query
"""

make_select(query)

Unnamed: 0,immediate_percentage
0,33.33


In [96]:
countries_data = {
    'country_id': [2, 3, 7, 5, 8, 9],
    'country_name': ['USA', 'Australia', 'Peru', 'China', 'Morocco', 'Spain']
}

# Creating the Countries DataFrame
countries_df = pd.DataFrame(countries_data)

# Data for the Weather table
weather_data = {
    'country_id': [2, 2, 2, 3, 3, 3, 5, 5, 5, 7, 7, 7, 8, 8, 8, 9, 9],
    'weather_state': [15, 12, 12, -2, 0, 3, 16, 18, 21, 25, 22, 20, 25, 27, 31, 7, 3],
    'day': ['2019-11-01', '2019-10-28', '2019-10-27', '2019-11-10', '2019-11-11', '2019-11-12',
            '2019-11-07', '2019-11-09', '2019-11-23', '2019-11-28', '2019-12-01', '2019-12-02',
            '2019-11-05', '2019-11-15', '2019-11-25', '2019-10-23', '2019-12-23']
}

# Creating the Weather DataFrame and converting the 'day' column to datetime
weather_df = pd.DataFrame(weather_data)
weather_df['day'] = pd.to_datetime(weather_df['day'])

query = """  
   
   SELECT country_name,
    CASE WHEN average_state <= 15 THEN 'cold'
         WHEN average_state>= 25 THEN 'hot' ELSE 'warm'
    END AS weather_type
    FROM (SELECT
   country_name,
   AVG(weather_state) AS average_state
   FROM countries_df c INNER JOIN weather_df w ON c.country_id = w.country_id
   WHERE day >= '2019-11-01' AND day <= '2019-11-30'
   GROUP BY country_name) AS sub_query
"""

make_select(query)

Unnamed: 0,country_name,weather_type
0,Australia,cold
1,China,warm
2,Morocco,hot
3,Peru,hot
4,USA,cold


In [101]:
employee_data = {
    'employee_id': [1, 2, 3, 4, 5, 6],
    'team_id': [8, 8, 8, 7, 9, 9]
}

# Creating the Employee DataFrame
employee_df = pd.DataFrame(employee_data)


query = """ 
  SELECT 
  employee_id, 
  COUNT(employee_id) OVER(PARTITION BY team_id) AS team_size
  FROM employee_df
  ORDER BY employee_id
"""
make_select(query)

Unnamed: 0,employee_id,team_size
0,1,3
1,2,3
2,3,3
3,4,1
4,5,2
5,6,2


In [119]:
ad_data = {
    'ad_id': [1, 2, 3, 5, 1, 2, 3, 1, 2, 1],
    'user_id': [1, 2, 3, 5, 7, 7, 5, 4, 11, 2],
    'action': ['Clicked', 'Clicked', 'Viewed', 'Ignored', 'Ignored', 'Viewed', 'Clicked', 'Viewed', 'Viewed', 'Clicked']
}

# Creating the Ad Interactions DataFrame
ads= pd.DataFrame(ad_data)


query = """  
 SELECT  
 ad_id,
ROUND( NULLIF(COUNT(
  CASE WHEN action = 'Clicked' THEN 1 END
 ), 0) * 1.0  /
NULLIF((COUNT(
  CASE WHEN action = 'Clicked' THEN 1 END
 ) +  
 COUNT(
  CASE WHEN action = 'Viewed' THEN 1 END
 )) , 0)* 100, 2) AS ctr
 FROM ads
 GROUP BY ad_id
"""

make_select(query)

Unnamed: 0,ad_id,ctr
0,1,66.67
1,2,33.33
2,3,50.0
3,5,


In [128]:
departments_data = {
    'id': [1, 7, 13],
    'name': ['Electrical Engineering', 'Computer Engineering', 'Business Administration']
}

# Creating the Departments DataFrame
departments_df = pd.DataFrame(departments_data)

# Data for the Students table
students_data = {
    'id': [23, 1, 5, 2, 4, 3, 6, 8, 7, 11],
    'name': ['Alice', 'Bob', 'Jennifer', 'John', 'Jasmine', 'Steve', 'Luis', 'Jonathan', 'Daiana', 'Madelynn'],
    'department_id': [1, 7, 13, 14, 77, 74, 1, 7, 33, 1]
}

# Creating the Students DataFrame
students_df = pd.DataFrame(students_data)

query = """  
  SELECT s.id, s.name
  FROM students_df s 
  LEFT JOIN departments_df d ON s.department_id = d.id
  WHERE d.id IS NULL
"""

make_select(query)

Unnamed: 0,id,name
0,2,John
1,4,Jasmine
2,3,Steve
3,7,Daiana


In [133]:
npv_data = {
    'id': [1, 7, 13, 1, 2, 3, 11, 7],
    'year': [2018, 2020, 2019, 2019, 2008, 2009, 2020, 2019],
    'npv': [100, 30, 40, 113, 121, 12, 99, 0]
}

# Creating the NPV DataFrame
npv_df = pd.DataFrame(npv_data)

# Data for the Queries table
queries_data = {
    'id': [1, 2, 3, 7, 7, 7, 13],
    'year': [2019, 2008, 2009, 2018, 2019, 2020, 2019]
}

# Creating the Queries DataFrame
queries_df = pd.DataFrame(queries_data)


query = """  
SELECT q.id, q.year, n.npv FROM npv_df n INNER JOIN queries_df q  ON n.id = q.id AND n.year = q.year
"""

make_select(query)

Unnamed: 0,id,year,npv
0,7,2020,30
1,13,2019,40
2,1,2019,113
3,2,2008,121
4,3,2009,12
5,7,2019,0


In [139]:
tvprogram_data = {
    'program_date': ['2020-06-10 08:00', '2020-05-11 12:00', '2020-05-12 12:00', 
                     '2020-05-13 14:00', '2020-06-18 14:00', '2020-07-15 16:00'],
    'content_id': [1, 2, 3, 4, 4, 5],
    'channel': ['LC-Channel', 'LC-Channel', 'LC-Channel', 'Disney Ch', 'Disney Ch', 'Disney Ch']
}

# Creating the TVProgram DataFrame and converting 'program_date' to datetime
tvprogram_df = pd.DataFrame(tvprogram_data)
tvprogram_df['program_date'] = pd.to_datetime(tvprogram_df['program_date'])

# Data for the Content table
content_data = {
    'content_id': [1, 2, 3, 4, 5],
    'title': ['Leetcode Movie', 'Alg. for Kids', 'Database Sols', 'Aladdin', 'Cinderella'],
    'Kids_content': ['N', 'Y', 'N', 'Y', 'Y'],
    'content_type': ['Movies', 'Series', 'Series', 'Movies', 'Movies']
}

# Creating the Content DataFrame
content_df = pd.DataFrame(content_data)


query = """  
 SELECT 
 DISTINCT c.title
 FROM content_df c 
   INNER JOIN tvprogram_df t ON c.content_id = t.content_id
  WHERE t.program_date >= '2020-06-01' AND t.program_date <= '2020-06-30' AND c.kids_content = 'Y'
"""

make_select(query)

Unnamed: 0,title
0,Aladdin


In [155]:
customers_df = pd.DataFrame({
    'customer_id': [1, 2, 3],
    'name': ['Winston', 'Jonathan', 'Moustafa'],
    'country': ['USA', 'Peru', 'Egypt']
})

product_df = pd.DataFrame({
    'product_id': [10, 20, 30, 40],
    'description': ['LC Phone', 'LC T-Shirt', 'LC Book', 'LC Keychain'],
    'price': [300, 10, 45, 2]
})

orders_df = pd.DataFrame({
    'order_id': [1, 2, 3, 4, 5, 6, 7, 9],
    'customer_id': [1, 1, 1, 2, 2, 3, 3, 3],
    'product_id': [10, 20, 30, 10, 40, 20, 30, 30],
    'order_date': ['2020-06-10', '2020-07-01', '2020-07-08', '2020-06-15', '2020-07-01', 
                   '2020-06-24', '2020-06-25', '2020-05-08'],
    'quantity': [1, 1, 2, 2, 10, 2, 2, 3]
})

orders_df['order_date'] = pd.to_datetime(orders_df['order_date'])


query = """   
 SELECT customer_id, name FROM ( SELECT 
  o.product_id,c.customer_id, c.name, o.order_date, o.quantity, p.price,
  SUM(o.quantity* p.price) AS total_spent
    FROM customers_df c
   INNER JOIN orders_df o ON o.customer_id = c.customer_id
   INNER JOIN product_df p ON p.product_id = o.product_id
   WHERE o.order_date BETWEEN '2020-07-01' AND '2020-08-31' 
   GROUP BY c.customer_id,c.name) AS sub_query
 WHERE total_spent >= 100
"""


make_select(query)


Unnamed: 0,customer_id,name
0,1,Winston


In [168]:
orders_data = {
    'order_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'order_date': ['2020-09-15', '2020-09-17', '2020-10-06', '2020-10-20', 
                   '2020-11-10', '2020-11-21', '2020-12-01', '2020-12-03', 
                   '2021-01-07', '2021-01-15'],
    'customer_id': [1, 2, 3, 3, 1, 2, 4, 4, 3, 2],
    'invoice': [30, 90, 20, 21, 10, 15, 55, 77, 31, 20]
}


orders_df = pd.DataFrame(orders_data)
orders_df['order_date'] = pd.to_datetime(orders_df['order_date'])

query = """  
  SELECT 
  strftime('%Y-%m', order_date) AS year_month, 
  COUNT(DISTINCT order_id) AS order_count,
  COUNT(DISTINCT customer_id) AS customer_count
  FROM orders_df
  WHERE invoice > 20
  GROUP  BY year_month
"""

make_select(query)

Unnamed: 0,year_month,order_count,customer_count
0,2020-09,2,2
1,2020-10,1,1
2,2020-12,2,1
3,2021-01,1,1


In [180]:
warehouse_data = {
    'name': ['LCHouse1', 'LCHouse1', 'LCHouse1', 'LCHouse2', 'LCHouse2', 'LCHouse3'],
    'product_id': [1, 2, 3, 1, 2, 4],
    'units': [1, 10, 5, 2, 2, 1]
}

# Creating the Warehouse DataFrame
warehouse_df = pd.DataFrame(warehouse_data)

# Data for the Products table
products_data = {
    'product_id': [1, 2, 3, 4],
    'product_name': ['LC-TV', 'LC-KeyChain', 'LC-Phone', 'LC-T-Shirt'],
    'Width': [5, 5, 2, 4],
    'Length': [50, 5, 10, 10],
    'Height': [40, 5, 10, 20]
}

# Creating the Products DataFrame
products_df = pd.DataFrame(products_data)


query = """  
SELECT 
name,
SUM(volume) AS volume 
FROM (SELECT 
*,
w.units * p.Width * p.Length * p.height AS volume
FROM warehouse_df w 
INNER JOIN products_df p ON w.product_id = p.product_id) AS sub_query
GROUP BY name
"""

make_select(query)

Unnamed: 0,name,volume
0,LCHouse1,12250
1,LCHouse2,20250
2,LCHouse3,800


In [None]:
customer_df = pd.DataFrame({
    'customer_id': [101, 102, 103],
    'customer_name': ['Alice', 'Bob', 'Charlie']
})

orders_df = pd.DataFrame({
    'order_id': [1, 2, 3, 4, 5],
    'sale_date': ['2020-03-01', '2020-05-25', '2019-05-25', '2020-09-13', '2019-02-11'],
    'order_cost': [1500, 2400, 800, 1000, 700],
    'customer_id': [101, 102, 101, 103, 101],
    'seller_id': [1, 2, 3, 2, 2]
})

seller_df = pd.DataFrame({
    'seller_id': [1, 2, 3],
    'seller_name': ['Daniel', 'Elizabeth', 'Frank']
})

query = """  

   WITH made_orders AS (SELECT s.seller_name, s.seller_id, o.order_cost, o.sale_date FROM customer_df c 
   INNER JOIN orders_df o ON c.customer_id = o.customer_id 
   INNER JOIN seller_df s ON s.seller_id = o.seller_id
   WHERE sale_date BETWEEN '2020-01-01' AND '2020-12-31')

   SELECT s.seller_name FROM seller_df s LEFT JOIN made_orders m ON m.seller_id = s.seller_id  
   WHERE m.seller_id IS NULL

"""

make_select(query)

Unnamed: 0,seller_name
0,Frank


In [218]:
data = [[1, 'S8', 1000], [2, 'G4', 800], [3, 'iPhone', 1400]]
product = pd.DataFrame(data, columns=['product_id', 'product_name', 'unit_price']).astype({'product_id':'Int64', 'product_name':'object', 'unit_price':'Int64'})
data = [[1, 1, 1, '2019-01-21', 2, 2000], [1, 2, 2, '2019-02-17', 1, 800], [2, 2, 3, '2019-06-02', 1, 800], [3, 3, 4, '2019-05-13', 2, 2800]]
sales = pd.DataFrame(data, columns=['seller_id', 'product_id', 'buyer_id', 'sale_date', 'quantity', 'price']).astype({'seller_id':'Int64', 'product_id':'Int64', 'buyer_id':'Int64', 'sale_date':'datetime64[ns]', 'quantity':'Int64', 'price':'Int64'})


query = """  

WITH cte AS (SELECT 
p.product_id, p.product_name, s.sale_date
FROM product p INNER JOIN sales s ON p.product_id = s.product_id),

sold_products AS (SELECT * FROM cte
WHERE sale_date >= '2019-01-01' AND sale_date <= '2019-03-31'),

not_sold_products AS (SELECT * FROM cte
WHERE sale_date NOT BETWEEN '2019-01-01' AND '2019-03-31')


SELECT s.product_id, s.product_name FROM sold_products s LEFT JOIN not_sold_products sp ON s.product_id = sp.product_id WHERE sp.product_id IS NULL

"""

make_select(query)

Unnamed: 0,product_id,product_name
0,1,S8


In [None]:
playback_df = pd.DataFrame({
    'session_id': [1, 2, 3, 4, 5],
    'customer_id': [1, 1, 2, 2, 2],
    'start_time': [1, 15, 10, 17, 2],
    'end_time': [5, 23, 12, 28, 8]
})

ads_df = pd.DataFrame({
    'ad_id': [1, 2, 3],
    'customer_id': [1, 2, 2],
    'timestamp': [5, 17, 20]
})


query = """  

  SELECT 
     p.session_id, p.customer_id, p.start_time, p.end_time,a.ad_id, a.timestamp 
  FROM playback_df p 
    INNER JOIN ads_df a 
  ON p.customer_id = a.customer_id

"""

make_select(query)

Unnamed: 0,session_id,customer_id,start_time,end_time,ad_id,timestamp
0,1,1,1,5,1,5
1,2,1,15,23,1,5
2,3,2,10,12,2,17
3,3,2,10,12,3,20
4,4,2,17,28,2,17
5,4,2,17,28,3,20
6,5,2,2,8,2,17
7,5,2,2,8,3,20


In [226]:
customers_data = {
    'customer_id': [1, 1, 1, 2, 3, 3, 4],
    'year': [2018, 2021, 2020, 2021, 2018, 2016, 2021],
    'revenue': [50, 30, 70, -50, 10, 50, 20]
}
customers_df = pd.DataFrame(customers_data)

query = """  
SELECT customer_id FROM customers_df WHERE year = 2021 AND revenue > 0
"""

make_select(query)

Unnamed: 0,customer_id
0,1
1,4


In [267]:
signups_df = pd.DataFrame({
    'user_id': [3, 7, 2, 6],
    'time_stamp': [
        '2020-03-21 10:16:13',
        '2020-01-04 13:57:59',
        '2020-07-29 23:09:44',
        '2020-12-09 10:39:37'
    ]
})

confirmations_df = pd.DataFrame( {
    'user_id': [3, 3, 7, 7, 2, 2, 6, 6],
    'time_stamp': [
        '2021-01-06 03:30:46',
        '2021-01-06 03:37:45',
        '2021-06-12 11:57:29',
        '2021-06-13 11:57:30',
        '2021-01-22 00:00:00',
        '2021-01-23 00:00:00',
        '2021-10-23 14:14:14',
        '2021-10-24 14:14:13'
    ],
    'action': ['timeout', 'timeout', 'confirmed', 'confirmed', 'confirmed', 'timeout', 'confirmed', 'timeout']
})


query = """  
SELECT DISTINCT c1.user_id
FROM confirmations_df c1
JOIN confirmations_df c2 
    ON c1.user_id = c2.user_id
    AND c1.time_stamp < c2.time_stamp 
    AND c2.time_stamp <= datetime(c1.time_stamp, '+24 hours');
"""

make_select(query)

Unnamed: 0,user_id
0,3
1,2
2,6


In [274]:
candidates_df = pd.DataFrame({
    'candidate_id': [123, 234, 123, 123, 234, 234, 147, 147, 147, 147, 256, 102],
    'skill': [
        'Python', 'R', 'Tableau', 'PostgreSQL', 'PowerBI', 'SQL Server',
        'Python', 'Tableau', 'Java', 'PostgreSQL', 'Tableau', 'DataAnalysis'
    ]
})


query = """  
SELECT candidate_id
FROM candidates_df 
WHERE skill IN ('Python', 'Tableau', 'PostgreSQL')
GROUP BY candidate_id
HAVING COUNT(DISTINCT skill) = 3
"""

make_select(query)

Unnamed: 0,candidate_id
0,123
1,147


In [298]:
activity_df = pd.DataFrame({
    'player_id': [1, 1, 1, 3, 3],
    'device_id': [2, 2, 3, 1, 4],
    'event_date': ['2016-03-01', '2016-05-02', '2017-06-25', '2016-03-02', '2018-07-03'],
    'games_played': [5, 6, 1, 0, 5]
})


activity_df['event_date'] = pd.to_datetime(activity_df['event_date'])


query = """  
SELECT 
player_id, event_date,
SUM(games_played) OVER(PARTITION BY player_id ORDER BY event_date) AS games_played_so_far
FROM activity_df
ORDER BY player_id, event_date

"""

make_select(query)

Unnamed: 0,player_id,event_date,games_played_so_far
0,1,2016-03-01 00:00:00.000000,5
1,1,2016-05-02 00:00:00.000000,11
2,1,2017-06-25 00:00:00.000000,12
3,3,2016-03-02 00:00:00.000000,0
4,3,2018-07-03 00:00:00.000000,5


In [None]:
candidates_df = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'name': ['A', 'B', 'C', 'D', 'E']
})

# Create the Vote DataFrame
votes_df = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'candidateId': [2, 4, 3, 2, 5]
})

query = """  

  WITH cte AS ( SELECT 
 c.id, c.name, 
 COUNT(v.id) AS total_votes
 FROM candidates_df c INNER JOIN votes_df v ON c.id = v.candidateId
 GROUP BY c.id, c.name
 ORDER BY total_votes DESC)


 SELECT name FROM  cte WHERE total_votes = (SELECT MAX(total_votes) FROM cte)
 
"""

make_select(query)

Unnamed: 0,name
0,B


In [None]:
salaries = pd.DataFrame({
    'employee_id': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    'salary': [50000, 52000, 54000, 55000, 56000, 58000, 47000, 48000, 49000],
    'year': [2020, 2021, 2022, 2020, 2021, 2022, 2020, 2021, 2022]
})

query = """

  SELECT 
  *, 
  AVG(salary) 
     OVER(PARTITION BY employee_id ORDER BY year ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS avg_salary
  FROM salaries

"""

make_select(query)

Unnamed: 0,employee_id,salary,year,avg_salary
0,1,50000,2020,50000.0
1,1,52000,2021,51000.0
2,1,54000,2022,52000.0
3,2,55000,2020,55000.0
4,2,56000,2021,55500.0
5,2,58000,2022,56333.333333
6,3,47000,2020,47000.0
7,3,48000,2021,47500.0
8,3,49000,2022,48000.0


In [339]:
data = [[1, 2, '2016/06/03'], [1, 3, '2016/06/08'], [2, 3, '2016/06/08'], [3, 4, '2016/06/09']]
request_accepted = pd.DataFrame(data, columns=['requester_id', 'accepter_id', 'accept_date']).astype({'requester_id':'Int64', 'accepter_id':'Int64', 'accept_date':'datetime64[ns]'})

query  = """  
WITH cte AS (SELECT 
accepter_id, 
COUNT(requester_id) AS num
FROM request_accepted
GROUP BY accepter_id
ORDER BY num DESC ),

cte2  AS (SELECT 
requester_id, 
COUNT(accepter_id) AS num
FROM request_accepted
GROUP BY requester_id
ORDER BY num DESC ),

cte3 AS (SELECT * FROM cte
UNION ALL
SELECT * FROM cte2)

SELECT 
accepter_id  AS id,
SUM(num) AS num
FROM cte3
GROUP BY accepter_id
ORDER BY num DESC
LIMIT 1
"""

make_select(query)

Unnamed: 0,id,num
0,3,3


In [362]:
data = [[1, 'Jhon', '2019-01-01', 100], [2, 'Daniel', '2019-01-02', 110], [3, 'Jade', '2019-01-03', 120], [4, 'Khaled', '2019-01-04', 130], [5, 'Winston', '2019-01-05', 110], [6, 'Elvis', '2019-01-06', 140], [7, 'Anna', '2019-01-07', 150], [8, 'Maria', '2019-01-08', 80], [9, 'Jaze', '2019-01-09', 110], [1, 'Jhon', '2019-01-10', 130], [3, 'Jade', '2019-01-10', 150]]
customer = pd.DataFrame(data, columns=['customer_id', 'name', 'visited_on', 'amount']).astype({'customer_id':'Int64', 'name':'object', 'visited_on':'datetime64[ns]', 'amount':'Int64'})


query = """

WITH cte AS (SELECT 
visited_on,
SUM(total_amount) OVER(ORDER BY visited_on ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS amount,
AVG(total_amount) OVER(ORDER BY visited_on ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS avg_amount,
day_number
FROM (SELECT 
visited_on, 
SUM(amount) AS total_amount,
ROW_NUMBER() OVER (ORDER BY visited_on) AS day_number
FROM customer
GROUP BY visited_on
ORDER BY visited_on) AS sub_query)


SELECT 
visited_on, amount, ROUND(avg_amount,2) AS average_amount
FROM cte
WHERE day_number >= 7
"""

make_select(query)

Unnamed: 0,visited_on,amount,average_amount
0,2019-01-07 00:00:00.000000,860,122.86
1,2019-01-08 00:00:00.000000,840,120.0
2,2019-01-09 00:00:00.000000,840,120.0
3,2019-01-10 00:00:00.000000,1000,142.86


In [None]:
data = [[1, '2015-01-01', 10], [2, '2015-01-02', 25], [3, '2015-01-03', 20], [4, '2015-01-04', 30]]
weather = pd.DataFrame(data, columns=['id', 'recordDate', 'temperature']).astype({'id':'Int64', 'recordDate':'datetime64[ns]', 'temperature':'Int64'})

query = """  
SELECT * FROM (SELECT
*,
LAG(temperature, 1) OVER(ORDER BY recordDate) AS prev_temperature,
LAG(recordDate) OVER(ORDER BY recordDate) AS prev_date
FROM weather)
WHERE temperature > prev_temperature AND DATE_DIFF(recordDate, prev_date) = 1
"""

make_select(query)

Unnamed: 0,id,recordDate,temperature,prev_temperature,prev_date
0,2,2015-01-02 00:00:00.000000,25,10,2015-01-01 00:00:00.000000
1,4,2015-01-04 00:00:00.000000,30,20,2015-01-03 00:00:00.000000


In [None]:
data = [[1, 2, '2016-03-01', 5], [1, 2, '2016-03-02', 6], [2, 3, '2017-06-25', 1], [3, 1, '2016-03-02', 0], [3, 4, '2018-07-03', 5]]
activity = pd.DataFrame(data, columns=['player_id', 'device_id', 'event_date', 'games_played']).astype({'player_id':'Int64', 'device_id':'Int64', 'event_date':'datetime64[ns]', 'games_played':'Int64'})


query = """  


"""



make_select(query)

Unnamed: 0,player_id,device_id,event_date,games_played,nn,first_day_login
0,1,2,2016-03-01 00:00:00.000000,5,,2016-03-01 00:00:00.000000
1,1,2,2016-03-02 00:00:00.000000,6,2016-03-01 00:00:00.000000,2016-03-01 00:00:00.000000
2,2,3,2017-06-25 00:00:00.000000,1,,2017-06-25 00:00:00.000000
3,3,1,2016-03-02 00:00:00.000000,0,,2016-03-02 00:00:00.000000
4,3,4,2018-07-03 00:00:00.000000,5,2016-03-02 00:00:00.000000,2016-03-02 00:00:00.000000


In [48]:
data = [[1, 1, '2019-08-01', '2019-08-02'], [2, 2, '2019-08-02', '2019-08-02'], [3, 1, '2019-08-11', '2019-08-12'], [4, 3, '2019-08-24', '2019-08-24'], [5, 3, '2019-08-21', '2019-08-22'], [6, 2, '2019-08-11', '2019-08-13'], [7, 4, '2019-08-09', '2019-08-09']]
delivery = pd.DataFrame(data, columns=['delivery_id', 'customer_id', 'order_date', 'customer_pref_delivery_date']).astype({'delivery_id':'Int64', 'customer_id':'Int64', 'order_date':'datetime64[ns]', 'customer_pref_delivery_date':'datetime64[ns]'})
query = """  
WITH cte AS (SELECT 
*,
CASE
 WHEN order_date = customer_pref_delivery_date THEN 'immediate' 
 ELSE 'scheduled'
END AS delivery_type 
FROM delivery),

orders AS (SELECT 
customer_id,
MIN(order_date) AS firt_order
FROM cte
GROUP BY customer_id),

cte2 AS (SELECT
o.customer_id, o.firt_order, c.delivery_type
FROM orders o  INNER JOIN cte c ON o.customer_id = c.customer_id AND o.firt_order = c.order_date)


SELECT
ROUND(COUNT(CASE WHEN delivery_type = 'immediate' THEN 1 END) * 1.0 / COUNT(delivery_type) * 100, 2) AS immediate_percentage
FROM cte2
"""



make_select(query)

Unnamed: 0,immediate_percentage
0,50.0


In [60]:
project_df = pd.DataFrame({
    'project_id': [1, 1, 1, 2, 2],
    'employee_id': [1, 2, 3, 1, 4]
}
)

employee_df = pd.DataFrame({
    'employee_id': [1, 2, 3, 4],
    'name': ['Khaled', 'Ali', 'John', 'Doe'],
    'experience_years': [3, 2, 3, 2]
}
)

query = """  
WITH cte AS (SELECT 
p.project_id, e.employee_id,e.experience_years,
DENSE_RANK() OVER(PARTITION BY project_id ORDER BY experience_years DESC) AS ranking
FROM project_df p INNER JOIN employee_df e ON p.employee_id = e.employee_id )

SELECT project_id, employee_id FROM cte WHERE ranking = 1
"""

make_select(query)

Unnamed: 0,project_id,employee_id
0,1,1
1,1,3
2,2,1


In [None]:
books_df = pd.DataFrame({
    'book_id': [1, 2, 3, 4, 5],
    'name': ["Kalila And Demna", "28 Letters", "The Hobbit", "13 Reasons Why", "The Hunger Games"],
    'available_from': ['2010-01-01', '2012-05-12', '2019-06-10', '2019-06-01', '2008-09-21']
}
)

orders_df = pd.DataFrame({
    'order_id': [1, 2, 3, 4, 5, 6, 7],
    'book_id': [1, 1, 3, 4, 4, 5, 5],
    'quantity': [2, 1, 8, 6, 5, 9, 8],
    'dispatch_date': ['2018-07-26', '2018-11-05', '2019-06-11', '2019-06-05', '2019-06-20', '2009-02-02', '2010-04-13']
})


books_df['available_from'] = pd.to_datetime(books_df['available_from'])
orders_df['dispatch_date'] = pd.to_datetime(orders_df['dispatch_date'])

# 2019-06-23

query = """  

WITH cte AS ( SELECT 
 b.book_id, b.name,
 SUM(o.quantity) AS total_sold
 FROM books_df b INNER JOIN orders_df o ON b.book_id = o.book_id
 WHERE strftime('%Y', o.dispatch_date) <= '2018'
 GROUP BY  b.book_id, b.name
 HAVING SUM(o.quantity) < 10)


 SELECT 
 *
 FROM books_df b INNER JOIN orders_df o ON b.book_id = o.book_id 
 WHERE b.available_from < '2019-06-23'


"""

make_select(query)

Unnamed: 0,book_id,name,available_from,order_id,book_id.1,quantity,dispatch_date
0,1,Kalila And Demna,2010-01-01 00:00:00.000000,1,1,2,2018-07-26 00:00:00.000000
1,1,Kalila And Demna,2010-01-01 00:00:00.000000,2,1,1,2018-11-05 00:00:00.000000
2,3,The Hobbit,2019-06-10 00:00:00.000000,3,3,8,2019-06-11 00:00:00.000000
3,4,13 Reasons Why,2019-06-01 00:00:00.000000,4,4,6,2019-06-05 00:00:00.000000
4,4,13 Reasons Why,2019-06-01 00:00:00.000000,5,4,5,2019-06-20 00:00:00.000000
5,5,The Hunger Games,2008-09-21 00:00:00.000000,6,5,9,2009-02-02 00:00:00.000000
6,5,The Hunger Games,2008-09-21 00:00:00.000000,7,5,8,2010-04-13 00:00:00.000000


In [107]:
activities_df = pd.DataFrame({
    'activity_id': [7274, 2425, 1413, 2536, 8564, 5235, 4251, 1435],
    'user_id': [123, 123, 456, 456, 456, 789, 123, 789],
    'activity_type': ['open', 'send', 'send', 'open', 'send', 'send', 'open', 'open'],
    'time_spent': [4.50, 3.50, 5.67, 3.00, 8.24, 6.24, 1.25, 5.25]
})

age_df = pd.DataFrame({
    'user_id': [123, 789, 456],
    'age_bucket': ['31-35', '21-25', '26-30']
})

query = """  
WITH cte AS (
SELECT 
a.activity_id, a.user_id, a.activity_type, a.time_spent, ad.age_bucket
FROM activities_df a 
INNER JOIN age_df ad ON a.user_id = ad.user_id
),

result AS (SELECT 
*,
ROUND(SUM( CASE WHEN activity_type='send' THEN time_spent END ) OVER(PARTITION BY age_bucket) * 1.0 /
SUM(time_spent) OVER(PARTITION BY age_bucket) * 100, 2) AS send_perc,
ROUND(SUM( CASE WHEN activity_type='open' THEN time_spent END ) OVER(PARTITION BY age_bucket) * 1.0 /
SUM(time_spent) OVER(PARTITION BY age_bucket) * 100, 2) AS open_perc
FROM cte)

SELECT 
age_bucket, send_perc, open_perc
FROM result
GROUP BY age_bucket, send_perc, open_perc
ORDER BY age_bucket DESC
"""

make_select(query)

Unnamed: 0,age_bucket,send_perc,open_perc
0,31-35,37.84,62.16
1,26-30,82.26,17.74
2,21-25,54.31,45.69


In [122]:
purchases_df = pd.DataFrame({
    'user_id': [11, 15, 17, 12, 8, 1, 10, 13],
    'purchase_date': ['2023-11-07', '2023-11-30', '2023-11-14', '2023-11-24', 
                      '2023-11-03', '2023-11-16', '2023-11-12', '2023-11-24'],
    'amount_spend': [1126, 7473, 2414, 9692, 5117, 5241, 8266, 12000]
}
)

purchases_df['purchase_date'] = pd.to_datetime(purchases_df['purchase_date'])

query = """  

WITH cte AS (SELECT 
*, 
strftime('%w', purchase_date) AS week,
(CAST(strftime('%d', purchase_date) AS INTEGER) + strftime('%w', DATE(purchase_date, '-1 days', 'start of month')) - 1) / 7 + 1 AS week_of_month
FROM purchases_df)


SELECT
week_of_month, 
week,
purchase_date,
SUM(amount_spend) AS total_amount
FROM cte
WHERE week = '5'
GROUP BY week, purchase_date, week_of_month 

"""

make_select(query)

Unnamed: 0,week_of_month,week,purchase_date,total_amount
0,1,5,2023-11-03 00:00:00.000000,5117
1,4,5,2023-11-24 00:00:00.000000,21692


In [24]:
data = [
    {"person_id": 6, "person_name": "luffy", "weight": 5, "turn": 4},
    {"person_id": 14, "person_name": "ace", "weight": 16, "turn": 10},
    {"person_id": 15, "person_name": "sabo", "weight": 9, "turn": 14},
    {"person_id": 7, "person_name": "zoro", "weight": 11, "turn": 3},
    {"person_id": 9, "person_name": "sanji", "weight": 10, "turn": 7},
    {"person_id": 4, "person_name": "nami", "weight": 3, "turn": 11},
    {"person_id": 3, "person_name": "ussop", "weight": 16, "turn": 15},
    {"person_id": 5, "person_name": "chopper", "weight": 2, "turn": 2},
    {"person_id": 12, "person_name": "brooke", "weight": 11, "turn": 5},
    {"person_id": 2, "person_name": "robin", "weight": 19, "turn": 12},
    {"person_id": 13, "person_name": "franky", "weight": 17, "turn": 1},
    {"person_id": 11, "person_name": "shanks", "weight": 15, "turn": 6},
    {"person_id": 1, "person_name": "kaido", "weight": 22, "turn": 8},
    {"person_id": 8, "person_name": "edward", "weight": 2, "turn": 9},
    {"person_id": 16, "person_name": "linlin", "weight": 6, "turn": 13},
    {"person_id": 17, "person_name": "teach", "weight": 7, "turn": 16},
    {"person_id": 10, "person_name": "dragon", "weight": 6, "turn": 17},
]

# Create the DataFrame
queue_df = pd.DataFrame(data)


query = """  
WITH cte AS (SELECT 
*,
SUM(weight) OVER(ORDER BY turn) AS cum_sum
FROM queue_df),

cte2 AS (SELECT 
*
FROM cte
WHERE cum_sum <= 1000)

SELECT person_name FROM cte2 ORDER BY turn DESC LIMIT 1
"""

make_select(query)

Unnamed: 0,person_name
0,dragon


In [75]:
data = [['1', '1', '10', '1', 'completed', '2013-10-01'], ['2', '2', '11', '1', 'cancelled_by_driver', '2013-10-01'], ['3', '3', '12', '6', 'completed', '2013-10-01'], ['4', '4', '13', '6', 'cancelled_by_client', '2013-10-01'], ['5', '1', '10', '1', 'completed', '2013-10-02'], ['6', '2', '11', '6', 'completed', '2013-10-02'], ['7', '3', '12', '6', 'completed', '2013-10-02'], ['8', '2', '12', '12', 'completed', '2013-10-03'], ['9', '3', '10', '12', 'completed', '2013-10-03'], ['10', '4', '13', '12', 'cancelled_by_driver', '2013-10-03']]
trips = pd.DataFrame(data, columns=['id', 'client_id', 'driver_id', 'city_id', 'status', 'request_at']).astype({'id':'Int64', 'client_id':'Int64', 'driver_id':'Int64', 'city_id':'Int64', 'status':'object', 'request_at':'object'})

data = [['1', 'No', 'client'], ['2', 'Yes', 'client'], ['3', 'No', 'client'], ['4', 'No', 'client'], ['10', 'No', 'driver'], ['11', 'No', 'driver'], ['12', 'No', 'driver'], ['13', 'No', 'driver']]
users = pd.DataFrame(data, columns=['users_id', 'banned', 'role']).astype({'users_id':'Int64', 'banned':'object', 'role':'object'})


query = """  
WITH cte AS (
    SELECT 
        t.id, 
        t.client_id, 
        t.driver_id, 
        t.city_id, 
        t.status, 
        t.request_at, 
        uc.banned AS client_banned, 
        ud.banned AS driver_banned
    FROM trips t
    INNER JOIN users uc ON t.client_id = uc.users_id  -- Join for client information
    INNER JOIN users ud ON t.driver_id = ud.users_id  -- Join for driver information
    WHERE t.request_at BETWEEN '2013-10-01' AND '2013-10-03'
),


cte2 AS (SELECT * FROM cte WHERE client_banned = 'No' AND driver_banned = 'No'),

cte3 AS (SELECT 
*, 
COUNT() OVER(PARTITION BY request_at ORDER BY request_at) AS gg 
FROM cte2)


SELECT 
request_at AS "Day" , 
COUNT(CASE WHEN status IN ('cancelled_by_driver', 'cancelled_by_client') THEN 1 END) * 1.0 /gg AS "Cancellation Rate"
FROM cte3
GROUP BY request_at, gg


"""

make_select(query)

Unnamed: 0,Day,Cancellation Rate
0,2013-10-01,0.333333
1,2013-10-02,0.0
2,2013-10-03,0.5


In [89]:
data = [[1, '2017-01-01', 10], [2, '2017-01-02', 109], [3, '2017-01-03', 150], [4, '2017-01-04', 99], [5, '2017-01-05', 145], [6, '2017-01-06', 1455], [7, '2017-01-07', 199], [8, '2017-01-09', 188]]
stadium = pd.DataFrame(data, columns=['id', 'visit_date', 'people']).astype({'id':'Int64', 'visit_date':'datetime64[ns]', 'people':'Int64'})


query = """  
WITH cte AS (
SELECT *
FROM stadium WHERE people >= 100)

SELECT  
*,
LEAD(id) OVER(ORDER BY id) AS conn
FROM cte

"""

make_select(query)


Unnamed: 0,id,visit_date,people,conn
0,2,2017-01-02 00:00:00.000000,109,3.0
1,3,2017-01-03 00:00:00.000000,150,5.0
2,5,2017-01-05 00:00:00.000000,145,6.0
3,6,2017-01-06 00:00:00.000000,1455,7.0
4,7,2017-01-07 00:00:00.000000,199,8.0
5,8,2017-01-09 00:00:00.000000,188,


In [94]:
from datetime import datetime

data = [[1, 20, '2019-08-14'], [2, 50, '2019-08-14'], [1, 30, '2019-08-15'], [1, 35, '2019-08-16'], [2, 65, '2019-08-17'], [3, 20, '2019-08-18']]
products = pd.DataFrame(data, columns=['product_id', 'new_price', 'change_date']).astype({'product_id':'Int64', 'new_price':'Int64', 'change_date':'datetime64[ns]'})


products.groupby(["product_id", "change_date"])["new_price"].first().reset_index()

Unnamed: 0,product_id,change_date,new_price
0,1,2019-08-14,20
1,1,2019-08-15,30
2,1,2019-08-16,35
3,2,2019-08-14,50
4,2,2019-08-17,65
5,3,2019-08-18,20


In [136]:
calls = pd.DataFrame({
    "caller_id": [8, 4, 5, 8, 11, 8],
    "recipient_id": [4, 8, 1, 3, 3, 11],
    "call_time": [
        "2021-08-24 22:46:07",
        "2021-08-24 22:57:13",
        "2021-08-11 21:28:44",
        "2021-08-17 22:04:15",
        "2021-08-17 13:07:00",
        "2021-08-17 14:22:22",
    ],
    "city": ["Houston", "Houston", "Houston", "Houston", "New York", "New York"]
})


calls["call_time"] = pd.to_datetime(calls["call_time"])

query = """  
WITH cte AS (SELECT 
*,
strftime('%H', call_time) AS hours
FROM calls),

cte2 AS (SELECT 
*,
COUNT(*) OVER(PARTITION BY hours) AS calls
FROM cte),

cte3 AS (SELECT
*,
DENSE_RANK() OVER(PARTITION BY city ORDER BY calls DESC) AS ranking
FROM cte2)


SELECT 
city, hours, calls
FROM cte3 
WHERE ranking = 1
GROUP BY city, hours, calls

"""
make_select(query)

Unnamed: 0,city,hours,calls
0,Houston,22,3
1,New York,13,1
2,New York,14,1


In [139]:
data = [[1, 8000, 'Jan'], [2, 9000, 'Jan'], [3, 10000, 'Feb'], [1, 7000, 'Feb'], [1, 6000, 'Mar']]
department = pd.DataFrame(data, columns=['id', 'revenue', 'month']).astype({'id':'Int64', 'revenue':'Int64', 'month':'object'})


department.melt(id_vars="id")

Unnamed: 0,id,variable,value
0,1,revenue,8000
1,2,revenue,9000
2,3,revenue,10000
3,1,revenue,7000
4,1,revenue,6000
5,1,month,Jan
6,2,month,Jan
7,3,month,Feb
8,1,month,Feb
9,1,month,Mar


In [147]:
data = [[900001, 'Alice'], [900002, 'Bob'], [900003, 'Charlie']]
users = pd.DataFrame(data, columns=['account', 'name']).astype({'account':'Int64', 'name':'object'})
data = [[1, 900001, 7000, '2020-08-01'], [2, 900001, 7000, '2020-09-01'], [3, 900001, -3000, '2020-09-02'], [4, 900002, 1000, '2020-09-12'], [5, 900003, 6000, '2020-08-07'], [6, 900003, 6000, '2020-09-07'], [7, 900003, -4000, '2020-09-11']]
transactions = pd.DataFrame(data, columns=['trans_id', 'account', 'amount', 'transacted_on']).astype({'trans_id':'Int64', 'account':'Int64', 'amount':'Int64', 'transacted_on':'datetime64[ns]'})


query = """   

SELECT 
u.name, 
SUM(t.amount) AS balance
FROM users u INNER JOIN transactions t ON u.account = t.account
GROUP BY u.name
HAVING SUM(t.amount) > 10000
"""

make_select(query)

Unnamed: 0,name,balance
0,Alice,11000


In [177]:
data = [[1, 8000, 'Jan'], [2, 9000, 'Jan'], [3, 10000, 'Feb'], [1, 7000, 'Feb'], [1, 6000, 'Mar']]
department = pd.DataFrame(data, columns=['id', 'revenue', 'month']).astype({'id':'Int64', 'revenue':'Int64', 'month':'object'})


query = """  
SELECT 
id,
SUM(CASE WHEN month = 'Jan' THEN revenue END) AS Jan_Revenue,
SUM(CASE WHEN month = 'Feb' THEN revenue END) AS Feb_Revenue,
SUM(CASE WHEN month = 'Mar' THEN revenue END) AS Mar_Revenue,
SUM(CASE WHEN month = 'Apr' THEN revenue END) AS Apr_Revenue,
SUM(CASE WHEN month = 'May' THEN revenue END) AS May_Revenue,
SUM(CASE WHEN month = 'Jun' THEN revenue END) AS Jun_Revenue,
SUM(CASE WHEN month = 'Jul' THEN revenue END) AS Jul_Revenue,
SUM(CASE WHEN month = 'Aug' THEN revenue END) AS Aug_Revenue,
SUM(CASE WHEN month = 'Sep' THEN revenue END) AS Sep_Revenue,
SUM(CASE WHEN month = 'Oct' THEN revenue END) AS Oct_Revenue,
SUM(CASE WHEN month = 'Nov' THEN revenue END) AS Nov_Revenue,
SUM(CASE WHEN month = 'Dec' THEN revenue END) AS Dec_Revenue
FROM department
GROUP BY id
"""

make_select(query)

Unnamed: 0,id,Jan_Revenue,Feb_Revenue,Mar_Revenue,Apr_Revenue,May_Revenue,Jun_Revenue,Jul_Revenue,Aug_Revenue,Sep_Revenue,Oct_Revenue,Nov_Revenue,Dec_Revenue
0,1,8000.0,7000.0,6000.0,,,,,,,,,
1,2,9000.0,,,,,,,,,,,
2,3,,10000.0,,,,,,,,,,


In [198]:
data = [[1, 2, '2016-03-01', 5], [1, 2, '2016-03-02', 6], [2, 3, '2017-06-25', 1], [3, 1, '2016-03-02', 0], [3, 4, '2018-07-03', 5]]
activity = pd.DataFrame(data, columns=['player_id', 'device_id', 'event_date', 'games_played']).astype({'player_id':'Int64', 'device_id':'Int64', 'event_date':'datetime64[ns]', 'games_played':'Int64'})

query = """  
WITH cte AS (SELECT 
*,
MIN(event_date) OVER(PARTITION BY player_id) AS firstday_login
FROM activity)

SELECT 
ROUND(COUNT(CASE WHEN julianday(event_date) - julianday(firstday_login) = 1 THEN 1 END) * 1.0 / (SELECT COUNT(DISTINCT player_id) FROM cte), 2) AS fraction
FROM cte
"""

make_select(query)

Unnamed: 0,fraction
0,0.33


In [16]:

users = pd.DataFrame({
    "seller_id": [1, 2, 3],
    "join_date": ["2019-01-01", "2019-02-09", "2019-01-19"],
    "favorite_brand": ["Lenovo", "Samsung", "LG"]
})
items = pd.DataFrame({
    "item_id": [1, 2, 3, 4],
    "item_brand": ["Samsung", "Lenovo", "LG", "HP"]
})
orders = pd.DataFrame({
    "order_id": [1, 2, 3, 4, 5],
    "order_date": ["2019-08-01", "2019-08-02", "2019-08-03", "2019-08-04", "2019-08-04"],
    "item_id": [4, 2, 3, 1, 4],
    "seller_id": [2, 3, 3, 2, 2]
})


users["join_date"] = pd.to_datetime(users["join_date"])
orders["order_date"] = pd.to_datetime(orders["order_date"])


query = """  
WITH cte AS (SELECT
o.*,
u.favorite_brand,
i.item_brand 
FROM orders o 
INNER JOIN users u ON o.seller_id = u.seller_id 
INNER JOIN items i ON o.item_id = i.item_id),

cte2 AS (SELECT 
*,
COUNT(item_brand) OVER(PARTITION BY seller_id) AS total_sold
FROM cte)

SELECT 
seller_id, 
total_sold-COUNT(*) AS num_items
FROM cte2
WHERE favorite_brand != item_brand
GROUP BY seller_id
"""

make_select(query)


Unnamed: 0,seller_id,num_items
0,2,1
1,3,1


In [6]:
scores = pd.DataFrame({
    "student_id": [309, 321, 338, 423, 896, 235],
    "student_name": ["Owen", "Claire", "Julian", "Peyton", "David", "Camila"],
    "assignment1": [88, 98, 100, 60, 32, 31],
    "assignment2": [47, 95, 64, 44, 37, 53],
    "assignment3": [87, 37, 43, 47, 50, 69]
})


query = """  
SELECT MAX(total_score) - MIN(total_score) AS difference FROM (SELECT 
*,
assignment1 + assignment2 + assignment3 AS total_score
FROM scores)
"""

make_select(query)

Unnamed: 0,difference
0,111


In [24]:
transactions = pd.DataFrame({
    "transaction_id": [8, 9, 1, 5, 6],
    "day": ["2021-4-3 15:57:28", "2021-4-28 08:47:25", "2021-4-29 13:28:30", "2021-4-28 16:39:59", "2021-4-29 23:39:28"],
    "amount": [57, 21, 58, 40, 58]
}
)


transactions['day'] = pd.to_datetime(transactions['day'])

query = """  

WITH cte AS (SELECT
*
FROM (SELECT 
*,
RANK() OVER(PARTITION BY DATE(day) ORDER BY amount DESC) AS ranking
FROM transactions) AS sub_query)

SELECT 
transaction_id
FROM cte
WHERE ranking = 1
ORDER BY transaction_id
"""

make_select(query)

Unnamed: 0,transaction_id
0,1
1,5
2,6
3,8


In [36]:
transactions_df = pd.DataFrame({
    "account_id": [1, 1, 1, 2, 2],
    "day": ["2021-11-07", "2021-11-09", "2021-11-11", "2021-12-07", "2021-12-12"],
    "type": ["Deposit", "Withdraw", "Deposit", "Deposit", "Withdraw"],
    "amount": [2000, 1000, 3000, 7000, 7000]
})

transactions_df['day'] = pd.to_datetime(transactions_df['day'])

query = """  

SELECT 
account_id,
day,
type,
SUM(
CASE WHEN type = 'Deposit' THEN amount ELSE -amount END
) OVER(PARTITION BY account_id ORDER BY day) AS balance
FROM transactions_df

"""

make_select(query)

Unnamed: 0,account_id,day,type,balance
0,1,2021-11-07 00:00:00.000000,Deposit,2000
1,1,2021-11-09 00:00:00.000000,Withdraw,1000
2,1,2021-11-11 00:00:00.000000,Deposit,4000
3,2,2021-12-07 00:00:00.000000,Deposit,7000
4,2,2021-12-12 00:00:00.000000,Withdraw,0


In [46]:
tasks_df = pd.DataFrame({
    "task_id": [1, 2, 3, 4, 5, 6],
    "assignee_id": [1, 6, 6, 3, 5, 7],
    "submit_date": ["2022-06-13", "2022-06-14", "2022-06-15", "2022-06-18", "2022-06-19", "2022-06-19"]
}
)
tasks_df['submit_date'] = pd.to_datetime(tasks_df['submit_date'])

query = """  

SELECT 
COUNT(CASE WHEN  week IN ('0','6') THEN assignee_id END) AS weekday_cnt ,
COUNT(CASE WHEN  week BETWEEN '1' AND '5' THEN assignee_id END) AS weekend_cnt 
FROM (SELECT 
*,
strftime('%w', submit_date) AS week
FROM tasks_df) AS sub_query
"""

make_select(query)

Unnamed: 0,weekday_cnt,weekend_cnt
0,3,3


In [62]:
calls = pd.DataFrame({
    "from_id": [1, 2, 1, 3, 3, 3, 4],
    "to_id": [2, 1, 3, 4, 4, 4, 3],
    "duration": [59, 11, 20, 100, 200, 200, 499]
})

query = """  
SELECT 
CASE WHEN from_id < to_id THEN from_id ELSE to_id END AS person1,
CASE WHEN from_id < to_id THEN to_id ELSE from_id END AS person1,
COUNT(*) AS total_calls,
SUM(duration) AS total_duration
FROM calls
GROUP BY 1,2
"""

make_select(query)

Unnamed: 0,person1,person1.1,total_calls,total_duration
0,1,2,2,70
1,1,3,1,20
2,3,4,4,999


In [92]:
orders_details_df = pd.DataFrame({
    "order_id": [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 3, 2],
    "product_id": [1, 2, 3, 1, 4, 5, 3, 4, 5, 6, 7, 8, 9, 9],
    "quantity": [12, 10, 15, 8, 4, 6, 5, 18, 2, 8, 9, 9, 20, 4]
})

query = """

WITH cte AS (SELECT 
*,
AVG(quantity) OVER(PARTITION BY order_id) AS avg_price,
MAX(quantity) OVER(PARTITION BY order_id) AS max_price
FROM orders_details_df
ORDER BY avg_price DESC)


SELECT order_id FROM cte
WHERE max_price > (SELECT MAX(avg_price) FROM cte)
GROUP BY order_id


"""

make_select(query)

Unnamed: 0,order_id
0,1
1,3


In [157]:
members_df = pd.DataFrame({
    "member_id": [9, 11, 3, 8, 1],
    "name": ["Alice", "Bob", "Winston", "Hercy", "Narihan"]
})
visits_df = pd.DataFrame({
    "visit_id": [22, 16, 18, 19, 12, 17, 21],
    "member_id": [11, 11, 9, 3, 11, 8, 9],
    "visit_date": ["2021-10-28", "2021-01-12", "2021-12-10", "2021-10-19", "2021-03-01", "2021-05-07", "2021-05-12"]
})
purchases_df = pd.DataFrame({
    "visit_id": [12, 18, 17],
    "charged_amount": [2000, 9000, 7000]
})

# cte2 AS (SELECT 
# * ,
# ROUND(COUNT(CASE WHEN visit_id = vs_id THEN 1 END) OVER(PARTITION BY member_id) * 1.0 
# /COUNT(member_id) OVER(PARTITION BY member_id) * 100,2) AS score 
# FROM cte)

# SELECT 
# *,
# CASE 
#     WHEN score >= 80 THEN 'Diamond'
#     WHEN score >= 50 THEN 'Gold'
#     WHEN score >=1 AND score <= 50 THEN 'Silver'
#     WHEN score = 0 THEN 'Bronze'
# END AS category
# FROM (
# SELECT 
# member_id, name, 
# score
# FROM cte2 
# GROUP BY member_id, name, score
# )

query = """  

WITH cte AS (SELECT 
m.member_id, m.name,v.visit_date, p.visit_id,v.visit_id AS vs_id, p.charged_amount
FROM visits_df v 
FULL OUTER JOIN purchases_df p ON v.visit_id = p.visit_id 
FULL OUTER JOIN members_df m ON v.member_id = m.member_id)

SELECT 
member_id, name,
CASE 
    WHEN visit_date IS NULL THEN 'Bronze'
    WHEN COUNT(charged_amount) *1.0 / COUNT(vs_id) * 100 >= 80 THEN 'Diamond'
    WHEN COUNT(charged_amount) *1.0 / COUNT(vs_id) * 100 >= 50 THEN 'Gold'
    ELSE 'Silver'
END AS category
FROM cte
GROUP BY member_id, name
"""

make_select(query)

Unnamed: 0,member_id,name,category
0,1,Narihan,Bronze
1,3,Winston,Silver
2,8,Hercy,Diamond
3,9,Alice,Gold
4,11,Bob,Silver


In [182]:
enrollments_df = pd.DataFrame({
    'student_id': [2, 2, 1, 1, 3, 3, 3],
    'course_id': [2, 3, 1, 2, 1, 2, 3],
    'grade': [95, 95, 90, 99, 80, 75, 82]
})

query = """  

SELECT 
student_id, course_id, grade
FROM (SELECT * ,
RANK() OVER(PARTITION BY student_id ORDER BY grade DESC, course_id) AS ranking
FROM enrollments_df
ORDER BY course_id
) AS sub
WHERE ranking = 1


"""

make_select(query)

Unnamed: 0,student_id,course_id,grade
0,1,2,99
1,2,2,95
2,3,3,82


In [204]:
contacts = pd.DataFrame({
    "id": [1, 2, 3, 4, 5],
    "first_name": ["John", "Jane", "Alice", "Michael", "Emily"],
    "last_name": ["Doe", "Smith", "Johnson", "Brown", "Davis"]
})

calls = pd.DataFrame({
    "contact_id": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
    "type": ["incoming", "outgoing", "incoming", "outgoing", "incoming", "outgoing", "incoming", "outgoing", "incoming", "outgoing"],
    "duration": [120, 180, 300, 240, 150, 360, 420, 200, 180, 280]
})


query = """  

WITH cte AS (SELECT 
*,
RANK() OVER(PARTITION BY type ORDER BY duration DESC) AS ranking
FROM (SELECT 
first_name,  type,
MAX(duration) / 60 AS duration
FROM contacts c INNER JOIN calls cl ON c.id = cl.contact_id
GROUP BY  first_name, type
ORDER BY type) AS sub)


SELECT first_name,  type ,
strftime('%H:%M:%S', duration * 60, 'unixepoch') AS duration
FROM cte
WHERE ranking <=3
"""

make_select(query)

Unnamed: 0,first_name,type,duration
0,Michael,incoming,00:07:00
1,Jane,incoming,00:05:00
2,Emily,incoming,00:03:00
3,Alice,outgoing,00:06:00
4,Emily,outgoing,00:04:00
5,Jane,outgoing,00:04:00


In [237]:
df = pd.DataFrame({
    "account_id": [1, 1, 2, 2, 3, 3, 4, 4],
    "ip_address": [1, 2, 6, 7, 9, 13, 10, 11],
    "login": [
        "2021-02-01 09:00:00",
        "2021-02-01 08:00:00",
        "2021-02-01 20:30:00",
        "2021-02-02 20:30:00",
        "2021-02-01 16:00:00",
        "2021-02-01 17:00:00",
        "2021-02-01 16:00:00",
        "2021-02-01 17:00:00",
    ],
    "logout": [
        "2021-02-01 09:30:00",
        "2021-02-01 11:30:00",
        "2021-02-01 22:00:00",
        "2021-02-02 22:00:00",
        "2021-02-01 16:59:59",
        "2021-02-01 17:59:59",
        "2021-02-01 17:00:00",
        "2021-02-01 17:59:59",
    ],
})
df["login"] = pd.to_datetime(df["login"])
df["logout"] = pd.to_datetime(df["logout"])





In [None]:
servers = pd.DataFrame({
    'server_id': [3, 3, 3, 3, 1, 1, 4, 4, 4, 4, 1, 1, 3, 3, 4, 4, 4, 4, 5, 5],
    'status_time': [
        '2023-11-04 16:29:47', '2023-11-05 01:49:47', '2023-11-25 01:37:08', '2023-11-25 03:50:08',
        '2023-11-13 03:05:31', '2023-11-13 11:10:31', '2023-11-29 15:11:17', '2023-11-29 15:42:17',
        '2023-11-20 00:31:44', '2023-11-20 07:03:44', '2023-11-20 00:27:11', '2023-11-20 01:41:11',
        '2023-11-04 23:16:48', '2023-11-05 01:15:48', '2023-11-30 15:09:18', '2023-11-30 20:48:18',
        '2023-11-25 21:09:06', '2023-11-26 04:58:06', '2023-11-16 19:42:22', '2023-11-16 21:08:22'
    ],
    'session_status': [
        'start', 'stop', 'start', 'stop', 'start', 'stop', 'start', 'stop', 'start', 'stop',
        'start', 'stop', 'start', 'stop', 'start', 'stop', 'start', 'stop', 'start', 'stop'
    ]
})

servers['status_time'] = pd.to_datetime(servers['status_time'])

query = """  

WITH cte AS (SELECT 
*,
ROUND((julianday(status_time) - julianday(prev_day) ) * 24, 2) AS hours_worked
FROM (SELECT 
*,
LAG(CASE WHEN session_status = 'start' THEN status_time END) OVER(PARTITION BY server_id) AS prev_day
FROM servers) AS sub)

SELECT FLOOR(SUM(hours_worked) / 24) AS total_uptime_days FROM cte

"""

make_select(query)

Unnamed: 0,total_uptime_days
0,1


In [323]:
team_points = pd.DataFrame({
    'team_id': [3, 1, 2, 4],
    'name': ['Algeria', 'Senegal', 'New Zealand', 'Croatia'],
    'points': [1431, 2132, 1402, 1817]
})

points_change = pd.DataFrame({
    'team_id': [3, 2, 4, 1],
    'points_change': [399, 0, 13, -22]
})


query = """  

WITH cte AS (SELECT 
t.team_id, t.name, t.points, p.points_change
FROM team_points t 
INNER JOIN points_change p ON t.team_id = p.team_id),

ranking_before_change AS (SELECT 
team_id, name, points,
RANK() OVER(ORDER BY points DESC) AS ranking
FROM cte),


ranking_after_change AS (SELECT 
team_id, name,points + points_change AS after_points,
ROW_NUMBER() OVER(ORDER BY points + points_change DESC) AS ranking_after
FROM cte)

SELECT 
b.team_id, b.name, b.points,
ranking  - ranking_after AS rank_diff
FROM ranking_before_change b
INNER JOIN ranking_after_change a ON b.team_id = a.team_id

"""
make_select(query)

Unnamed: 0,team_id,name,points,rank_diff
0,1,Senegal,2132,0
1,4,Croatia,1817,-1
2,3,Algeria,1431,1
3,2,New Zealand,1402,0


In [335]:
ads = pd.DataFrame({
    'ad_id': [1, 2, 3, 5, 1, 2, 3, 1, 2, 1],
    'user_id': [1, 2, 3, 5, 7, 7, 5, 4, 11, 2],
    'action': ['Clicked', 'Clicked', 'Viewed', 'Ignored', 'Ignored', 'Viewed', 'Clicked', 'Viewed', 'Viewed', 'Clicked']
})


query = """  

SELECT 
ad_id, 
    CASE 
        WHEN COALESCE((clicked_total + viewed_total), 0) = 0 THEN 0
        ELSE ROUND(COALESCE(clicked_total, 0) * 1.0 / COALESCE((clicked_total + viewed_total), 0) * 100, 2)
    END AS ctr
FROM  (SELECT 
ad_id,
COUNT(CASE WHEN action= 'Clicked' THEN user_id END) AS clicked_total,
COUNT(CASE WHEN action= 'Viewed' THEN user_id END) AS viewed_total
FROM ads
GROUP BY ad_id) AS sub_query
GROUP BY ad_id
"""

make_select(query)

Unnamed: 0,ad_id,ctr
0,1,66.67
1,2,33.33
2,3,50.0
3,5,0.0


In [341]:
students = pd.DataFrame({
    "student_id": [1, 2, 3],
    "student_name": ["Jack", "Jane", "Mark"],
    "gender": ["M", "F", "M"],
    "dept_id": [1, 1, 2]
})

departments = pd.DataFrame({
    "dept_id": [1, 2, 3],
    "dept_name": ["Engineering", "Science", "Law"]
})

query = """  

SELECT 
d.dept_name, 
COUNT(s.student_id) AS student_number
FROM students s 
RIGHT JOIN departments d ON s.dept_id = d.dept_id
GROUP BY d.dept_name
ORDER BY student_number DESC

"""

make_select(query)

Unnamed: 0,dept_name,student_number
0,Engineering,2
1,Science,1
2,Law,0


In [25]:

activities_df = pd.DataFrame({
    "activity_id": [7274, 2425, 1413, 2536, 8564, 5235, 4251, 1435],
    "user_id": [123, 123, 456, 456, 456, 789, 123, 789],
    "activity_type": ["open", "send", "send", "open", "send", "send", "open", "open"],
    "time_spent": [4.50, 3.50, 5.67, 3.00, 8.24, 6.24, 1.25, 5.25]
})

age_df = pd.DataFrame({
    "user_id": [123, 789, 456],
    "age_bucket": ["31-35", "21-25", "26-30"]
})


query = """

WITH cte AS (SELECT 
a.activity_id, a.user_id, a.activity_type, a.time_spent, g.age_bucket 
FROM activities_df a 
INNER JOIN age_df g 
ON a.user_id = g.user_id)


SELECT 
age_bucket,
ROUND(total_spent / (total_spent + total_open) * 100, 2) AS send_perc,
ROUND(total_open / (total_spent + total_open) * 100, 2)  AS open_perc 
FROM
(SELECT 
age_bucket,
SUM(CASE WHEN activity_type = 'send' THEN time_spent END) *1.0 AS total_spent,
SUM(CASE WHEN activity_type = 'open' THEN time_spent END) *1.0  AS total_open
FROM cte
GROUP BY age_bucket)
ORDER BY age_bucket DESC
"""


sqldf(query)



Unnamed: 0,age_bucket,send_perc,open_perc
0,31-35,37.84,62.16
1,26-30,82.26,17.74
2,21-25,54.31,45.69


In [None]:
team_stats = pd.DataFrame({
    "team_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "team_name": [
        "Chelsea", "Nottingham Forest", "Liverpool", "Aston Villa", 
        "Fulham", "Burnley", "Newcastle United", "Sheffield United", 
        "Luton Town", "Everton"
    ],
    "matches_played": [22, 27, 17, 20, 31, 26, 33, 20, 5, 14],
    "wins": [13, 6, 1, 1, 18, 6, 11, 18, 4, 2],
    "draws": [2, 6, 8, 6, 1, 9, 10, 2, 0, 6],
    "losses": [7, 15, 8, 13, 12, 11, 12, 0, 1, 6]
})

query = """  


WITH cte AS (SELECT 
*,
RANK() OVER(ORDER BY total_score DESC) AS ranking
FROM (SELECT 
team_name,
wins * 3 + draws * 1 + losses * 0 AS total_score
FROM team_stats 
GROUP BY team_name))


SELECT 
*,
PERCENTILE_DISC(0.33) WITHIN GROUP (ORDER BY total_score DESC) AS p33
FROM cte
GROUP BY team_name

"""

make_select(query)

In [12]:
customers = pd.DataFrame({
    "customer_id": [1, 4, 5],
    "customer_name": ["Alice", "Bob", "Charlie"]
})


customer_max = customers["customer_id"].max()
customer_min = customers["customer_id"].min()
customes_ids = customers["customer_id"].unique().tolist()

customer_range = list(range(customer_min, customer_max+1))



res = []

for id in customer_range:
    if id not in customes_ids:
        res.append(id)


pd.DataFrame({
    "ids": res
})


Unnamed: 0,ids
0,2
1,3


In [None]:
friends = pd.DataFrame({
    "id": [1, 2, 3, 4, 5, 6],
    "name": ["Jonathan D.", "Jade W.", "Victor J.", "Elvis Q.", "Daniel A.", "Bob B."],
    "activity": ["Eating", "Singing", "Singing", "Eating", "Eating", "Horse Riding"]
})


activities = pd.DataFrame({
    "id": [1, 2, 3],
    "name": ["Eating", "Singing", "Horse Riding"]
})



query = """   


WITH cte2 AS (SELECT 
*,
COUNT(friend_name) OVER(PARTITION BY activity_type) AS total_friends
FROM (SELECT 
f.id, f.name AS friend_name, f.activity AS activity_type, a.id AS activity_id
FROM friends f INNER JOIN activities a ON f.activity = a.name
)
)


SELECT 
activity_type AS ativity
FROM cte2
WHERE total_friends != (SELECT MIN(total_friends) FROM cte2) AND total_friends != (SELECT MAX(total_friends) FROM cte2)
GROUP BY activity_type



"""


make_select(query)

Unnamed: 0,ativity
0,Singing


In [34]:
project = pd.DataFrame({
    "project_id": [1, 1, 1, 2, 2],
    "employee_id": [1, 2, 3, 1, 4]
})


employee = pd.DataFrame({
    "employee_id": [1, 2, 3, 4],
    "name": ["Khaled", "Ali", "John", "Doe"],
    "experience_years": [3, 2, 1, 2]
})


query = """  

WITH cte AS (SELECT 
p.project_id, 
COUNT(DISTINCT e.employee_id) AS total_employees
FROM project p INNER JOIN employee e ON p.employee_id = e.employee_id
GROUP BY p.project_id)


SELECT project_id FROM cte WHERE total_employees = (SELECT MAX(total_employees) FROM cte)

"""

make_select(query)

Unnamed: 0,project_id
0,1


In [54]:
employees = pd.DataFrame({
    "employee_id": [2, 3, 7, 8, 9],
    "name": ["Meir", "Michael", "Addilyn", "Juan", "Kannon"],
    "salary": [3000, 3000, 7400, 6100, 7400]
})


query = """  

WITH cte AS (SELECT 
salary, COUNT(*) AS total_count
FROM employees
GROUP BY salary)



SELECT 
e.employee_id, e.name, e.salary,
DENSE_RANK() OVER(ORDER BY c.salary) AS team_id
FROM cte c INNER JOIN employees e ON c.salary = e.salary
WHERE c.total_count >= 2
"""

make_select(query)

Unnamed: 0,employee_id,name,salary,team_id
0,2,Meir,3000,1
1,3,Michael,3000,1
2,7,Addilyn,7400,2
3,9,Kannon,7400,2


In [66]:
candidate = pd.DataFrame({
    "id": [1, 2, 3, 4, 5],
    "name": ["A", "B", "C", "D", "E"]
})


vote = pd.DataFrame({
    "id": [1, 2, 3, 4, 5],
    "candidateId": [2, 4, 3, 2, 5]
})
query = """  

WITH cte AS (SELECT c.name, c.id,
COUNT(v.id) AS total_votes
FROM candidate c INNER JOIN vote v ON c.id = v.candidateId
GROUP BY c.name, c.id)


SELECT name FROM cte WHERE total_votes = (SELECT MAX(total_votes) FROM cte)

"""

make_select(query)

Unnamed: 0,name
0,B


In [73]:
data = [[309, 'Owen', 88, 47, 87], [321, 'Claire', 98, 95, 37], [338, 'Julian', 100, 64, 43], [423, 'Peyton', 60, 44, 47], [896, 'David', 32, 37, 50], [235, 'Camila', 31, 53, 69]]
scores = pd.DataFrame(data, columns=['student_id', 'student_name', 'assignment1', 'assignment2', 'assignment3']).astype({'student_id':'Int64', 'student_name':'object', 'assignment1':'Int64', 'assignment2':'Int64', 'assignment3':'Int64'})


query = """ 

SELECT 
MAX(total_score) - MIN(total_score) AS difference_in_score
FROM
(SELECT 
student_id,
assignment1 + assignment2 + assignment3 AS total_score
FROM scores) AS sub_query

"""

make_select(query)

Unnamed: 0,difference_in_score
0,111


In [77]:
data = [[1, 'Electrical Engineering'], [7, 'Computer Engineering'], [13, 'Bussiness Administration']]
departments = pd.DataFrame(data, columns=['id', 'name']).astype({'id':'Int64', 'name':'object'})
data = [[23, 'Alice', 1], [1, 'Bob', 7], [5, 'Jennifer', 13], [2, 'John', 14], [4, 'Jasmine', 77], [3, 'Steve', 74], [6, 'Luis', 1], [8, 'Jonathan', 7], [7, 'Daiana', 33], [11, 'Madelynn', 1]]
students = pd.DataFrame(data, columns=['id', 'name', 'department_id']).astype({'id':'Int64', 'name':'object', 'department_id':'Int64'})


query = """  

SELECT s.id, s.name FROM students s FULL OUTER JOIN departments d ON s.department_id = d.id WHERE d.id IS NULL

"""

make_select(query)

Unnamed: 0,id,name
0,2,John
1,4,Jasmine
2,3,Steve
3,7,Daiana
