In [1]:
import pandas as pd
import numpy as np
import sqlite3

In [2]:
sqlite3.sqlite_version

'3.37.2'

In [3]:
df = pd.read_csv('german_credit_augmented.csv')

In [4]:
df.head()

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,52,male,2,own,quite rich,,936,9,education,0,2008-04-27 08:23:07,200
3,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
4,28,male,2,own,little,,776,12,radio/TV,0,2007-07-21 13:22:14,358


In [5]:
con = sqlite3.connect('db')

In [6]:
df['contract_dt'] = pd.to_datetime(df['contract_dt'],format='%Y-%m-%d %H:%M:%S')

In [7]:
df.dtypes

age                          int64
sex                         object
job                          int64
housing                     object
saving_accounts             object
checking_account            object
credit_amount                int64
duration                     int64
purpose                     object
default                      int64
contract_dt         datetime64[ns]
client_id                    int64
dtype: object

In [8]:
df.to_sql('german_credit',con,index=False,if_exists='replace')

1000

In [9]:
def select(sql):
    return pd.read_sql(sql,con)

## SELECT

In [10]:
sql = ''' 
SELECT * 
FROM german_credit AS gc
''' 

In [11]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,52,male,2,own,quite rich,,936,9,education,0,2008-04-27 08:23:07,200
3,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
4,28,male,2,own,little,,776,12,radio/TV,0,2007-07-21 13:22:14,358
...,...,...,...,...,...,...,...,...,...,...,...,...
995,65,male,2,free,little,little,2600,18,radio/TV,1,2007-12-16 20:17:19,624
996,30,male,3,own,little,moderate,4455,36,business,1,2007-07-12 14:08:58,181
997,33,male,2,own,little,moderate,6403,24,radio/TV,0,2008-04-08 03:24:26,730
998,29,female,2,own,,,5003,21,car,1,2007-11-29 15:51:45,557


In [18]:
sql = ''' 
SELECT
    gc.age, gc.housing
FROM german_credit AS gc
''' 

In [19]:
select(sql)

Unnamed: 0,age,housing
0,33,own
1,43,own
2,52,own
3,35,own
4,28,own
...,...,...
995,65,free
996,30,own
997,33,own
998,29,own


In [16]:
sql = ''' 
SELECT 
    gc.age, gc.age * 2 AS age_mult2, gc.housing
FROM german_credit AS gc
'''

In [17]:
select(sql)

Unnamed: 0,age,age_mult2,housing
0,33,66,own
1,43,86,own
2,52,104,own
3,35,70,own
4,28,56,own
...,...,...,...
995,65,130,free
996,30,60,own
997,33,66,own
998,29,58,own


In [20]:
sql = ''' 
SELECT 
    gc.age, 
    gc.age * 2 AS age_mult2,
    gc.housing,
    gc.age + gc.credit_amount AS age_plus_amount,
    gc.age * 1.0 / gc.credit_amount AS age_div_amount
    
FROM german_credit AS gc
''' 

In [21]:
select(sql)

Unnamed: 0,age,age_mult2,housing,age_plus_amount,age_div_amount
0,33,66,own,3107,0.010735
1,43,86,own,1387,0.031994
2,52,104,own,988,0.055556
3,35,70,own,1428,0.025126
4,28,56,own,804,0.036082
...,...,...,...,...,...
995,65,130,free,2665,0.025000
996,30,60,own,4485,0.006734
997,33,66,own,6436,0.005154
998,29,58,own,5032,0.005797


In [22]:
sql = '''
SELECT *
FROM german_credit AS gc limit 5
'''

In [23]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,52,male,2,own,quite rich,,936,9,education,0,2008-04-27 08:23:07,200
3,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
4,28,male,2,own,little,,776,12,radio/TV,0,2007-07-21 13:22:14,358


## Where

In [24]:
sql = '''
SELECT count(1) 
FROM german_credit AS gc 
WHERE gc.contract_dt BETWEEN '2007-01-01' AND '2007-12-31'
'''

In [25]:
select(sql)

Unnamed: 0,count(1)
0,573


In [26]:
sql = '''
SELECT * 
FROM german_credit AS gc 
WHERE gc.contract_dt BETWEEN '2007-01-01' AND '2007-12-31'
'''

In [27]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
1,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
2,28,male,2,own,little,,776,12,radio/TV,0,2007-07-21 13:22:14,358
3,29,female,2,own,little,moderate,959,9,furniture/equipment,1,2007-05-21 14:32:00,351
4,26,male,2,own,little,little,4370,42,radio/TV,1,2007-11-29 00:20:44,639
...,...,...,...,...,...,...,...,...,...,...,...,...
568,45,male,1,own,quite rich,,1750,6,radio/TV,0,2007-05-25 13:59:51,551
569,65,male,2,free,little,little,2600,18,radio/TV,1,2007-12-16 20:17:19,624
570,30,male,3,own,little,moderate,4455,36,business,1,2007-07-12 14:08:58,181
571,29,female,2,own,,,5003,21,car,1,2007-11-29 15:51:45,557


In [28]:
sql = '''
SELECT * 
FROM german_credit AS gc 
WHERE gc.contract_dt BETWEEN '2007-01-01' AND '2007-12-31'
ORDER BY gc.contract_dt
'''

In [29]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,20,male,2,own,,little,2996,24,furniture/equipment,1,2007-05-01 07:17:29,495
1,49,female,2,own,little,moderate,1092,12,radio/TV,0,2007-05-01 08:41:46,288
2,30,female,3,own,little,moderate,4795,36,radio/TV,0,2007-05-01 12:42:45,141
3,39,female,1,own,,moderate,932,6,education,0,2007-05-01 20:28:44,215
4,31,male,2,own,little,,2775,18,car,1,2007-05-01 23:14:02,598
...,...,...,...,...,...,...,...,...,...,...,...,...
568,30,male,2,own,little,moderate,639,12,repairs,1,2007-12-28 17:09:43,127
569,36,male,3,rent,,moderate,7057,20,car,0,2007-12-29 16:10:08,99
570,23,female,2,rent,little,little,2406,30,furniture/equipment,1,2007-12-29 20:14:30,359
571,35,male,1,own,little,,2684,24,radio/TV,0,2007-12-30 00:40:33,256


In [30]:
sql = '''
SELECT * 
FROM german_credit AS gc 
WHERE gc.contract_dt BETWEEN '2007-01-01' AND '2007-12-31'
ORDER BY gc.contract_dt DESC
'''

In [31]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,31,female,1,own,little,,1736,12,furniture/equipment,0,2007-12-30 13:29:15,995
1,35,male,1,own,little,,2684,24,radio/TV,0,2007-12-30 00:40:33,256
2,23,female,2,rent,little,little,2406,30,furniture/equipment,1,2007-12-29 20:14:30,359
3,36,male,3,rent,,moderate,7057,20,car,0,2007-12-29 16:10:08,99
4,30,male,2,own,little,moderate,639,12,repairs,1,2007-12-28 17:09:43,127
...,...,...,...,...,...,...,...,...,...,...,...,...
568,31,male,2,own,little,,2775,18,car,1,2007-05-01 23:14:02,598
569,39,female,1,own,,moderate,932,6,education,0,2007-05-01 20:28:44,215
570,30,female,3,own,little,moderate,4795,36,radio/TV,0,2007-05-01 12:42:45,141
571,49,female,2,own,little,moderate,1092,12,radio/TV,0,2007-05-01 08:41:46,288


In [32]:
sql = '''
SELECT * 
FROM german_credit AS gc 
WHERE gc.contract_dt BETWEEN '2007-01-01' AND '2007-12-31'
AND gc.purpose = 'car'
ORDER BY gc.contract_dt DESC, credit_amount
'''

In [33]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,36,male,3,rent,,moderate,7057,20,car,0,2007-12-29 16:10:08,99
1,25,male,2,rent,moderate,moderate,1264,15,car,1,2007-12-28 08:38:58,979
2,48,male,2,own,little,,2134,9,car,0,2007-12-24 16:28:30,20
3,43,male,2,rent,little,little,4843,12,car,1,2007-12-22 16:17:29,668
4,37,male,2,free,,moderate,12389,36,car,1,2007-12-21 00:06:27,563
...,...,...,...,...,...,...,...,...,...,...,...,...
177,75,male,3,free,little,little,6615,24,car,0,2007-05-06 09:37:45,330
178,55,male,2,own,rich,,1413,12,car,0,2007-05-05 06:35:11,209
179,47,male,3,own,little,moderate,1209,6,car,1,2007-05-03 10:29:01,485
180,36,male,2,own,little,moderate,884,18,car,1,2007-05-02 06:22:11,184


In [34]:
sql = '''
SELECT count(*) 
FROM german_credit AS gc 
WHERE gc.contract_dt BETWEEN '2007-01-01' AND '2007-12-31'
AND gc.purpose = 'car'
ORDER BY gc.contract_dt DESC, credit_amount
'''

In [35]:
select(sql)

Unnamed: 0,count(*)
0,182


In [36]:
sql = '''
SELECT * 
FROM german_credit AS gc 
WHERE gc.contract_dt BETWEEN '2007-01-01' AND '2007-12-31'
AND gc.purpose IN ('car', 'repairs')
ORDER BY gc.contract_dt DESC, credit_amount
'''

In [37]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,36,male,3,rent,,moderate,7057,20,car,0,2007-12-29 16:10:08,99
1,30,male,2,own,little,moderate,639,12,repairs,1,2007-12-28 17:09:43,127
2,25,male,2,rent,moderate,moderate,1264,15,car,1,2007-12-28 08:38:58,979
3,48,male,2,own,little,,2134,9,car,0,2007-12-24 16:28:30,20
4,67,female,2,own,little,moderate,3872,18,repairs,0,2007-12-24 12:25:28,779
...,...,...,...,...,...,...,...,...,...,...,...,...
192,55,male,2,own,rich,,1413,12,car,0,2007-05-05 06:35:11,209
193,55,female,0,free,little,little,1190,18,repairs,1,2007-05-05 00:14:17,429
194,47,male,3,own,little,moderate,1209,6,car,1,2007-05-03 10:29:01,485
195,36,male,2,own,little,moderate,884,18,car,1,2007-05-02 06:22:11,184


## LIKE

In [36]:
t = pd.DataFrame({'purpose':['машина','на машину','на покупку машины','автомобиль','на возвращение 2007'],
              'amount':[1000,400,600,700,1500]})

In [37]:
t

Unnamed: 0,purpose,amount
0,машина,1000
1,на машину,400
2,на покупку машины,600
3,автомобиль,700
4,на возвращение 2007,1500


In [38]:
t.to_sql('purpose', con, index=False, if_exists='replace')

5

In [39]:
sql = '''SELECT * 
FROM purpose AS t
'''

In [40]:
select(sql)

Unnamed: 0,purpose,amount
0,машина,1000
1,на машину,400
2,на покупку машины,600
3,автомобиль,700
4,на возвращение 2007,1500


In [41]:
sql = '''
SELECT * from purpose t
WHERE (t.purpose like '%машин%'
OR t.purpose LIKE '%авто%') AND t.amount > 500
'''

In [42]:
select(sql)

Unnamed: 0,purpose,amount
0,машина,1000
1,на покупку машины,600
2,автомобиль,700


In [43]:
sql = '''
SELECT count(*)
FROM german_credit gc
'''

In [44]:
select(sql)

Unnamed: 0,count(*)
0,1000


In [45]:
sql = '''
SELECT count(*) FROM german_credit gc
WHERE gc.credit_amount > 1000
'''

In [46]:
select(sql)

Unnamed: 0,count(*)
0,884


In [47]:
884/1000

0.884

In [38]:
sql = '''
SELECT gc.credit_amount,
CASE WHEN gc.credit_amount > 1000 THEN 1 ELSE 0 END AS greater_1000_flag,

IIF(gc.credit_amount > 1000,1,0) AS greater_1000_flag2

FROM german_credit AS gc
'''

In [39]:
select(sql)

Unnamed: 0,credit_amount,greater_1000_flag,greater_1000_flag2
0,3074,1,1
1,1344,1,1
2,936,0,0
3,1393,1,1
4,776,0,0
...,...,...,...
995,2600,1,1
996,4455,1,1
997,6403,1,1
998,5003,1,1


In [40]:
sql = '''select 
avg(case when t.credit_amount > 1000 then 1 else 0 end) as greater_1000_frac
 from german_credit t'''
     

In [41]:
select(sql)

Unnamed: 0,greater_1000_frac
0,0.884


# Create Table

In [42]:
sql = '''
SELECT * 
FROM german_credit AS gc
WHERE gc.credit_amount > 1000
'''    

In [43]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
3,27,female,2,own,little,moderate,1295,18,furniture/equipment,0,2008-06-18 04:10:05,86
4,26,male,2,own,little,little,4370,42,radio/TV,1,2007-11-29 00:20:44,639
...,...,...,...,...,...,...,...,...,...,...,...,...
879,65,male,2,free,little,little,2600,18,radio/TV,1,2007-12-16 20:17:19,624
880,30,male,3,own,little,moderate,4455,36,business,1,2007-07-12 14:08:58,181
881,33,male,2,own,little,moderate,6403,24,radio/TV,0,2008-04-08 03:24:26,730
882,29,female,2,own,,,5003,21,car,1,2007-11-29 15:51:45,557


In [44]:
cur = con.cursor() 

In [45]:
sql = '''
DROP TABLE IF EXISTS greater_1000_credit;

CREATE TABLE greater_1000_credit AS
  SELECT *
  FROM   german_credit AS gc
  WHERE  gc.credit_amount > 1000  
'''
     


In [46]:
cur.executescript(sql)

<sqlite3.Cursor at 0x7f0de847dcc0>

In [47]:
sql = '''
SELECT *
FROM   greater_1000_credit gc  
'''

In [48]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
3,27,female,2,own,little,moderate,1295,18,furniture/equipment,0,2008-06-18 04:10:05,86
4,26,male,2,own,little,little,4370,42,radio/TV,1,2007-11-29 00:20:44,639
...,...,...,...,...,...,...,...,...,...,...,...,...
879,65,male,2,free,little,little,2600,18,radio/TV,1,2007-12-16 20:17:19,624
880,30,male,3,own,little,moderate,4455,36,business,1,2007-07-12 14:08:58,181
881,33,male,2,own,little,moderate,6403,24,radio/TV,0,2008-04-08 03:24:26,730
882,29,female,2,own,,,5003,21,car,1,2007-11-29 15:51:45,557


# Union all

In [49]:
jan = pd.DataFrame({'month':['jan','jan'],'revenue':[1,2]})
feb = pd.DataFrame({'month':['feb','feb'],'revenue':[1,2]})  

In [50]:
jan.to_sql('jan',con,index=False,if_exists='replace')
feb.to_sql('feb',con,index=False,if_exists='replace')

2

In [51]:
sql = '''
SELECT * 
FROM jan gc
UNION ALL
SELECT * FROM feb gc'''     

In [52]:
select(sql)

Unnamed: 0,month,revenue
0,jan,1
1,jan,2
2,feb,1
3,feb,2


In [53]:
sql = '''
SELECT gc.revenue FROM jan gc
UNION
SELECT gc.revenue from feb gc
'''

In [54]:
select(sql)

Unnamed: 0,revenue
0,1
1,2


In [55]:
sql = '''
SELECT gc.revenue,
'jan' AS month
FROM jan gc
UNION ALL
SELECT gc.revenue,
'mar' as month
 FROM feb gc
'''

In [56]:
select(sql)

Unnamed: 0,revenue,month
0,1,jan
1,2,jan
2,1,mar
3,2,mar


# Специальные функции


In [57]:
sql = '''
SELECT gc.sex, 
    substr(gc.sex,1,1)
FROM german_credit AS gc
'''

In [58]:
select(sql)

Unnamed: 0,sex,"substr(gc.sex,1,1)"
0,male,m
1,male,m
2,male,m
3,female,f
4,male,m
...,...,...
995,male,m
996,male,m
997,male,m
998,female,f


# Group by

##  Сводная таблица


In [92]:
sql = '''
SELECT gc.sex,
    count(*) AS count,
    avg(gc.credit_amount) AS credit_amount_avg

FROM german_credit AS gc
GROUP BY gc.sex
'''

In [93]:
select(sql)

Unnamed: 0,sex,count,credit_amount_avg
0,female,310,2877.774194
1,male,690,3448.04058


In [61]:
df.groupby('sex')['credit_amount'].agg(['count','mean'])

Unnamed: 0_level_0,count,mean
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,310,2877.774194
male,690,3448.04058


In [62]:
sql = '''select count(distinct t.housing), count(t.housing) from german_credit t'''

In [63]:
select(sql)

Unnamed: 0,count(distinct t.housing),count(t.housing)
0,3,1000


In [64]:
sql = '''
SELECT gc.housing,

count(*) AS cnt,
avg(gc.credit_amount) AS credit_amount_avg

FROM german_credit AS gc
GROUP BY gc.housing
'''
     


In [65]:
select(sql)

Unnamed: 0,housing,cnt,credit_amount_avg
0,free,108,4906.212963
1,own,713,3060.939691
2,rent,179,3122.553073


## Пропуски

In [66]:
sql = '''
SELECT count(gc.checking_account),
count(0)
FROM german_credit AS gc
'''

In [67]:
select(sql)

Unnamed: 0,count(gc.checking_account),count(0)
0,606,1000


In [68]:
sql = '''
SELECT gc.checking_account,

count(*) as cnt,
avg(gc.credit_amount) AS credit_amount_avg

FROM german_credit gc
GROUP by gc.checking_account
'''

In [69]:
select(sql)

Unnamed: 0,checking_account,cnt,credit_amount_avg
0,,394,3133.101523
1,little,274,3175.218978
2,moderate,269,3827.561338
3,rich,63,2177.650794


In [70]:
df.groupby('checking_account',dropna=False)['credit_amount'].count()

checking_account
little      274
moderate    269
rich         63
NaN         394
Name: credit_amount, dtype: int64

In [71]:
sql = '''
SELECT sum(CASE 
                WHEN gc.checking_account IS NULL THEN 1
                ELSE 0
                END) AS is_null,

        count(CASE
            WHEN gc.checking_account IS NULL THEN 1
            ELSE NULL
        END) AS is_null2

FROM german_credit AS gc
'''

In [72]:
select(sql)

Unnamed: 0,is_null,is_null2
0,394,394


In [73]:
t = pd.DataFrame({'col1':[1,np.nan,2]})

In [74]:
t

Unnamed: 0,col1
0,1.0
1,
2,2.0


In [75]:
t.to_sql('null_test',con,index=False,if_exists='replace')

3

In [76]:
t

Unnamed: 0,col1
0,1.0
1,
2,2.0


In [77]:
(1+2)/3

1.0

In [78]:
(1 + 0 + 2) / 3

1.0

In [79]:
sql = '''
SELECT avg(t.col1)
FROM null_test AS t
'''

In [80]:
select(sql)

Unnamed: 0,avg(t.col1)
0,1.5


In [88]:
sql = '''
SELECT gc.checking_account,
    coalesce(gc.checking_account,'no_info')

FROM german_credit AS gc
'''

In [89]:
select(sql)

Unnamed: 0,checking_account,"coalesce(gc.checking_account,'no_info')"
0,,no_info
1,little,little
2,,no_info
3,,no_info
4,,no_info
...,...,...
995,little,little
996,moderate,moderate
997,moderate,moderate
998,,no_info


In [171]:
sql = '''
SELECT avg(coalesce(t.col1,0))
FROM null_test AS t
'''

In [172]:
select(sql)

Unnamed: 0,"avg(coalesce(t.col1,0))"
0,1.0


In [173]:
t = pd.DataFrame({'col1':[1,np.nan,2],
                  'col2':[np.nan,np.nan,1],
                  'col3':[1,2,3]})

In [174]:
t

Unnamed: 0,col1,col2,col3
0,1.0,,1
1,,,2
2,2.0,1.0,3


In [175]:
t.to_sql('null_test',con,index=False,if_exists='replace')

3

In [177]:
sql = '''
SELECT t.*,
    coalesce(t.col1,t.col2,t.col3)
FROM null_test t
'''

In [178]:
select(sql)

Unnamed: 0,col1,col2,col3,"coalesce(t.col1,t.col2,t.col3)"
0,1.0,,1,1.0
1,,,2,2.0
2,2.0,1.0,3,2.0


##  Дубликаты


In [179]:
t = pd.DataFrame({'id':[1,1,2],'name':['a','a','b']})

In [180]:
t

Unnamed: 0,id,name
0,1,a
1,1,a
2,2,b


In [181]:
t.to_sql('dupl_test',con,index=False,if_exists='replace')

3

In [182]:
sql = '''
SELECT * 
FROM dupl_test AS t
'''

In [183]:
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,a
2,2,b


In [184]:
sql = '''
SELECT t.id, t.name, 
    count(1) AS cnt
FROM dupl_test t
GROUP BY t.id, t.name
''' 

In [185]:
select(sql)

Unnamed: 0,id,name,cnt
0,1,a,2
1,2,b,1


In [186]:
sql = '''
SELECT t.id, t.name,
    count(1) AS cnt from dupl_test t
GROUP BY t.id, t.name
HAVING count(1) > 1
'''
     


In [187]:
select(sql)

Unnamed: 0,id,name,cnt
0,1,a,2


In [188]:
t = pd.DataFrame({'id':[1,1,2,2,3],
                  'name':['a','b','c','d','e']})

In [189]:
t

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d
4,3,e


In [190]:
t.to_sql('dupl_test',con,index=False,if_exists='replace')

5

In [191]:
sql = '''
SELECT *
FROM dupl_test t
'''

In [192]:
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d
4,3,e


In [193]:
sql = '''
SELECT t.id,
count(1) AS cnt from dupl_test t
GROUP BY t.id
HAVING count(1) > 1
'''
     


In [194]:
select(sql)

Unnamed: 0,id,cnt
0,1,2
1,2,2


In [195]:
sql = '''
SELECT *
FROM dupl_test t
WHERE t.id IN (1,2)
'''

In [196]:
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d


In [197]:
sql = '''
SELECT t.id as cnt from dupl_test t
GROUP BY t.id
HAVING count(1) > 1
'''

In [198]:
select(sql)

Unnamed: 0,cnt
0,1
1,2


In [202]:
sql = '''
SELECT *
FROM dupl_test t
WHERE t.id IN (SELECT t.id AS cnt from dupl_test t
GROUP BY t.id
HAVING count(1) > 1)
'''

In [203]:
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d


# Агрегация


In [206]:
sql = '''
SELECT date(gc.contract_dt,'start of month') AS month,
    count(1) AS credit_cnt,
    count(distinct gc.client_id) AS client_id_unique,
    sum(gc.credit_amount) AS credit_amount_sum,
    avg(gc.credit_amount) AS credit_amount_avg

FROM german_credit AS gc
GROUP BY date(gc.contract_dt,'start of month')
ORDER BY date(gc.contract_dt,'start of month')
'''

In [207]:
select(sql)

Unnamed: 0,month,credit_cnt,client_id_unique,credit_amount_sum,credit_amount_avg
0,2007-05-01,81,81,207663,2563.740741
1,2007-06-01,74,74,239594,3237.756757
2,2007-07-01,71,71,224333,3159.619718
3,2007-08-01,57,57,178569,3132.789474
4,2007-09-01,58,58,186909,3222.568966
5,2007-10-01,70,70,188534,2693.342857
6,2007-11-01,87,87,300504,3454.068966
7,2007-12-01,77,77,273973,3558.090909
8,2008-01-01,93,93,288080,3097.634409
9,2008-02-01,55,55,211128,3838.690909


In [208]:
pd.Series(range(600)).sample(1000,replace=True)

567    567
452    452
457    457
587    587
37      37
      ... 
544    544
530    530
125    125
242    242
34      34
Length: 1000, dtype: int64

In [209]:
df['client_id_2'] = pd.Series(range(600)).sample(1000,replace=True).values

In [210]:
df.to_sql('german_credit_not_unieque',con,index=False,if_exists='replace')

1000

In [211]:
sql = '''
SELECT 
    date(gc.contract_dt,'start of month') as month,
    count(1) AS credit_cnt,
    count(distinct gc.client_id_2) AS client_id_unique,
    sum(gc.credit_amount) AS credit_amount_sum,
    avg(gc.credit_amount) AS credit_amount_avg

FROM german_credit_not_unieque AS gc

GROUP BY DATE(gc.contract_dt,'start of month')
ORDER BY DATE(gc.contract_dt,'start of month')
'''
     


In [212]:
select(sql)

Unnamed: 0,month,credit_cnt,client_id_unique,credit_amount_sum,credit_amount_avg
0,2007-05-01,81,79,207663,2563.740741
1,2007-06-01,74,69,239594,3237.756757
2,2007-07-01,71,64,224333,3159.619718
3,2007-08-01,57,52,178569,3132.789474
4,2007-09-01,58,57,186909,3222.568966
5,2007-10-01,70,64,188534,2693.342857
6,2007-11-01,87,82,300504,3454.068966
7,2007-12-01,77,71,273973,3558.090909
8,2008-01-01,93,85,288080,3097.634409
9,2008-02-01,55,51,211128,3838.690909


# Интервалы (или бины или бакеты)


In [213]:
sql = '''
SELECT count(distinct gc.credit_amount)
FROM german_credit AS gc
'''

In [214]:
select(sql)

Unnamed: 0,count(distinct gc.credit_amount)
0,921


In [221]:
sql = '''
SELECT gc.credit_amount,
    CASE 
        WHEN gc.credit_amount < 1000 then '1. <1000'
        when gc.credit_amount < 2000 then '2. 1000-2000' 
        when gc.credit_amount < 3000 then '3. 2000-3000'
        when gc.credit_amount >= 3000 then '4. >= 3000'
    ELSE 'other' end AS credit_amount_bin

FROM german_credit AS gc
'''

In [222]:
select(sql)

Unnamed: 0,credit_amount,credit_amount_bin
0,3074,4. >= 3000
1,1344,2. 1000-2000
2,936,1. <1000
3,1393,2. 1000-2000
4,776,1. <1000
...,...,...
995,2600,3. 2000-3000
996,4455,4. >= 3000
997,6403,4. >= 3000
998,5003,4. >= 3000


In [227]:
sql = '''
SELECT
    CASE 
        WHEN gc.credit_amount < 1000 THEN '1. <1000'
        WHEN gc.credit_amount < 2000 then '2. 1000-2000' 
        WHEN gc.credit_amount < 3000 then '3. 2000-3000'
        WHEN gc.credit_amount >= 3000 then '4. >= 3000'
    ELSE 'other' end AS credit_amount_bin,

count(1) AS credit_cnt

FROM german_credit AS gc
GROUP BY 1
'''

In [228]:
select(sql)

Unnamed: 0,credit_amount_bin,credit_cnt
0,1. <1000,116
1,2. 1000-2000,316
2,3. 2000-3000,188
3,4. >= 3000,380


# Столбцы сводной таблицы


In [229]:
sql = '''
SELECT gc.housing, 

count(CASE WHEN gc.sex = 'female' THEN 1 ELSE NULL end) as female,
count(CASE WHEN gc.sex = 'male' THEN 1 ELSE NULL end) as male,

count(1) AS cnt 

FROM german_credit gc
GROUP BY gc.housing
'''

In [230]:
select(sql)

Unnamed: 0,housing,female,male,cnt
0,free,19,89,108
1,own,196,517,713
2,rent,95,84,179


In [234]:
sql = '''
SELECT DISTINCT gc.purpose
FROM german_credit AS gc
'''

In [235]:
purpose = list(select(sql)['purpose'].values)

In [236]:
purpose

['radio/TV',
 'car',
 'education',
 'furniture/equipment',
 'repairs',
 'business',
 'domestic appliances',
 'vacation/others']

In [237]:
for p in purpose:
  print(f"count(case when t.purpose = '{p}' then 1 else null end) as {p.lower().replace(' ','').replace('/','')},")

count(case when t.purpose = 'radio/TV' then 1 else null end) as radiotv,
count(case when t.purpose = 'car' then 1 else null end) as car,
count(case when t.purpose = 'education' then 1 else null end) as education,
count(case when t.purpose = 'furniture/equipment' then 1 else null end) as furnitureequipment,
count(case when t.purpose = 'repairs' then 1 else null end) as repairs,
count(case when t.purpose = 'business' then 1 else null end) as business,
count(case when t.purpose = 'domestic appliances' then 1 else null end) as domesticappliances,
count(case when t.purpose = 'vacation/others' then 1 else null end) as vacationothers,


In [238]:
sql = '''
SELECT gc.housing,
       Count(CASE WHEN gc.purpose = 'radio/TV' THEN 1 ELSE NULL end) AS radiotv,
       Count(CASE WHEN gc.purpose = 'car' THEN 1 ELSE NULL end) AS car,
       Count(CASE WHEN gc.purpose = 'education' THEN 1 ELSE NULL end) AS education,
       Count(CASE WHEN gc.purpose = 'furniture/equipment' THEN 1 ELSE NULL end) AS furnitureequipment,
       Count(CASE WHEN gc.purpose = 'repairs' THEN 1 ELSE NULL end) AS repairs,
       Count(CASE WHEN gc.purpose = 'business' THEN 1 ELSE NULL end) AS business,
       Count(CASE WHEN gc.purpose = 'domestic appliances' THEN 1 ELSE NULL end) AS domesticappliances,
       Count(CASE WHEN gc.purpose = 'vacation/others' THEN 1 ELSE NULL end) AS vacationothers,
       Count(1)   AS cnt
FROM   german_credit AS gc
GROUP  BY gc.housing  
'''

In [239]:
select(sql)

Unnamed: 0,housing,radiotv,car,education,furnitureequipment,repairs,business,domesticappliances,vacationothers,cnt
0,free,15,55,15,11,3,5,0,4,108
1,own,227,219,34,122,17,76,10,8,713
2,rent,38,63,10,48,2,16,2,0,179


#  Категории из текстовых данных


In [240]:
t = pd.DataFrame({'purpose':['машина','машина','машина','на машину','на покупку машины',
                             'автомобиль','на возвращение 2007', 
                             'на свадьбу','свадьба','свадьба','свадьба','для свадьбы',
                             'недвижимость','на покупку недвижимости']})

In [241]:
t

Unnamed: 0,purpose
0,машина
1,машина
2,машина
3,на машину
4,на покупку машины
5,автомобиль
6,на возвращение 2007
7,на свадьбу
8,свадьба
9,свадьба


In [242]:
t.to_sql('purpose',con,index=False,if_exists='replace')

14

In [244]:
df.head(2)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id,client_id_2
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210,335
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929,382


In [245]:
sql = '''
SELECT *
FROM purpose t
'''

In [246]:
select(sql)

Unnamed: 0,purpose
0,машина
1,машина
2,машина
3,на машину
4,на покупку машины
5,автомобиль
6,на возвращение 2007
7,на свадьбу
8,свадьба
9,свадьба


In [248]:
sql = '''
SELECT t.purpose, count(1) 
FROM purpose t
GROUP BY t.purpose
ORDER BY count(1) DESC
'''
     


In [249]:
select(sql)

Unnamed: 0,purpose,count(1)
0,свадьба,3
1,машина,3
2,недвижимость,1
3,на свадьбу,1
4,на покупку недвижимости,1
5,на покупку машины,1
6,на машину,1
7,на возвращение 2007,1
8,для свадьбы,1
9,автомобиль,1


In [254]:
cat = '''SELECT t.purpose,

CASE WHEN t.purpose like '%свадьб%' THEN 'свадьба'
WHEN t.purpose LIKE '%машин%' OR t.purpose LIKE '%авто%' THEN 'машина'
WHEN t.purpose LIKE '%недвиж%' THEN 'недвижимость'

ELSE 'другое' end AS purpose_cat

FROM purpose t
'''

In [255]:
select(sql)

Unnamed: 0,purpose,count(1)
0,свадьба,3
1,машина,3
2,недвижимость,1
3,на свадьбу,1
4,на покупку недвижимости,1
5,на покупку машины,1
6,на машину,1
7,на возвращение 2007,1
8,для свадьбы,1
9,автомобиль,1


In [256]:
sql = f'''SELECT 
t.purpose_cat,

count(1)

FROM({cat}) t
GROUP BY t.purpose_cat
'''

In [257]:
select(sql)

Unnamed: 0,purpose_cat,count(1)
0,другое,1
1,машина,6
2,недвижимость,2
3,свадьба,5


In [260]:
sql = f'''
SELECT t.purpose,
    count(1) 

FROM ({cat}) t
WHERE t.purpose_cat = 'другое'


GROUP BY t.purpose

ORDER BY count(1) DESC
'''
     


In [261]:
select(sql)

Unnamed: 0,purpose,count(1)
0,на возвращение 2007,1


# Подзапросы


## Обычный подзапрос


In [262]:
t = pd.DataFrame({'id':[1,1,2,2,3],
                  'name':['a','b','c','d','e']})
     

In [263]:
t

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d
4,3,e


In [264]:
t.to_sql('dupl_test',con,index=False,if_exists='replace')

5

In [266]:
sql = '''
SELECT *
FROM dupl_test t
'''

In [267]:
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d
4,3,e


In [269]:
sql = '''
SELECT t.id
FROM dupl_test t
GROUP BY t.id
HAVING count(1) > 1
'''

In [270]:
select(sql)

Unnamed: 0,id
0,1
1,2


In [271]:
sql = '''
SELECT * 
FROM dupl_test t
WHERE t.id IN (select t.id from dupl_test t
GROUP BY t.id
HAVING count(1) > 1)
'''

In [272]:
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d


In [273]:
sql = '''
DROP TABLE IF exists dupls;
CREATE TABLE dupls AS
SELECT t.id FROM dupl_test t
GROUP BY t.id
HAVING count(1) > 1
'''

In [274]:
cur.executescript(sql)

<sqlite3.Cursor at 0x7f8802f61140>

In [275]:
sql = '''
SELECT * 
FROM dupls t
'''

In [276]:
select(sql)

Unnamed: 0,id
0,1
1,2


In [278]:
sql = '''
SELECT * 
FROM dupl_test t
WHERE t.id IN dupls
'''

In [279]:
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d


In [282]:
sql = '''
SELECT * 
FROM 
    (SELECT t.id,count(1) AS cnt FROM dupl_test t
    GROUP BY t.id) t

WHERE t.cnt > 1
'''
     


In [283]:
select(sql)

Unnamed: 0,id,cnt
0,1,2
1,2,2


# CTE (with)


In [284]:
sql = '''
SELECT *
FROM   
        (SELECT *
        FROM   (SELECT t.id,
                       Count(1) AS cnt
                FROM   dupl_test t
                GROUP  BY t.id) t
        WHERE  t.cnt > 1) t
WHERE  t.id = 1 
'''
     


In [285]:
select(sql)

Unnamed: 0,id,cnt
0,1,2


In [288]:
sql = '''
WITH id_cnt
     AS (SELECT t.id,
                count(1) AS cnt
         FROM   dupl_test t
         GROUP  BY t.id),
     id_cnt_2
     AS (SELECT *
         FROM   id_cnt t
         WHERE  t.cnt > 1)
SELECT *
FROM   id_cnt_2 t
WHERE  t.id = 1  
'''

In [289]:
select(sql)

Unnamed: 0,id,cnt
0,1,2


In [292]:
sql = f'''
SELECT t.purpose_cat,
    count(1)

FROM ({cat}) t
GROUP BY t.purpose_cat
'''

In [293]:
print(sql)


SELECT t.purpose_cat,
    count(1)

FROM (SELECT t.purpose,

CASE WHEN t.purpose like '%свадьб%' THEN 'свадьба'
WHEN t.purpose LIKE '%машин%' OR t.purpose LIKE '%авто%' THEN 'машина'
WHEN t.purpose LIKE '%недвиж%' THEN 'недвижимость'

ELSE 'другое' end AS purpose_cat

FROM purpose t
) t
GROUP BY t.purpose_cat



In [294]:
select(sql)

Unnamed: 0,purpose_cat,count(1)
0,другое,1
1,машина,6
2,недвижимость,2
3,свадьба,5


In [295]:
sql = '''
WITH categories
     AS (SELECT t.purpose,
                CASE
                  WHEN t.purpose LIKE '%свадьб%' THEN 'свадьба'
                  WHEN t.purpose LIKE '%машин%'
                        OR t.purpose LIKE '%авто%' THEN 'машина'
                  WHEN t.purpose LIKE '%недвиж%' THEN
                  'недвижимость'
                  ELSE 'другое'
                END AS purpose_cat
         FROM   purpose t)
SELECT t.purpose_cat,
       count(1)
FROM   categories t
GROUP  BY t.purpose_cat 
'''

In [296]:
select(sql)

Unnamed: 0,purpose_cat,count(1)
0,другое,1
1,машина,6
2,недвижимость,2
3,свадьба,5


# Когда лучше создавать таблицы


In [297]:
sql = '''
DROP TABLE IF EXISTS categories;

CREATE TABLE categories AS
  SELECT t.purpose,
         CASE
           WHEN t.purpose LIKE '%свадьб%' THEN 'свадьба'
           WHEN t.purpose LIKE '%машин%'
                 OR t.purpose LIKE '%авто%' THEN 'машина'
           WHEN t.purpose LIKE '%недвиж%' THEN 'недвижимость'
           ELSE 'другое'
         end AS purpose_cat
FROM   purpose t  
'''
     


In [298]:
cur.executescript(sql)

<sqlite3.Cursor at 0x7f8802f61140>

In [299]:
sql = '''
SELECT *
FROM categories t
'''

In [300]:
select(sql)

Unnamed: 0,purpose,purpose_cat
0,машина,машина
1,машина,машина
2,машина,машина
3,на машину,машина
4,на покупку машины,машина
5,автомобиль,машина
6,на возвращение 2007,другое
7,на свадьбу,свадьба
8,свадьба,свадьба
9,свадьба,свадьба


In [301]:
sql = f'''
SELECT t.purpose_cat,
    count(1)
FROM categories t
GROUP BY t.purpose_cat
'''

In [302]:
select(sql)

Unnamed: 0,purpose_cat,count(1)
0,другое,1
1,машина,6
2,недвижимость,2
3,свадьба,5


In [303]:
sql = f'''
SELECT t.purpose, count(1) 

FROM categories t
WHERE t.purpose_cat = 'другое'


GROUP BY t.purpose

ORDER BY count(1) DESC
'''

In [304]:
select(sql)

Unnamed: 0,purpose,count(1)
0,на возвращение 2007,1


In [305]:
sql = '''
WITH categories
     AS (SELECT 1
         FROM   purpose t)
SELECT *
FROM   categories t  
'''
     


In [306]:
select(sql)

Unnamed: 0,1
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


# Джойны

## Лефт и иннер джоин


In [307]:
sql = '''
SELECT *
FROM german_credit t limit 5
'''

In [308]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,52,male,2,own,quite rich,,936,9,education,0,2008-04-27 08:23:07,200
3,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
4,28,male,2,own,little,,776,12,radio/TV,0,2007-07-21 13:22:14,358


In [309]:
users = pd.DataFrame({'id':[1,2,3],'name':['gleb','jon snow','tyrion']})

In [310]:
users

Unnamed: 0,id,name
0,1,gleb
1,2,jon snow
2,3,tyrion


In [311]:
items = pd.DataFrame({'user_id':[1,3,3],'item_name':['hleb','gold','wine'],'value':[5,100,20]})

In [312]:
items

Unnamed: 0,user_id,item_name,value
0,1,hleb,5
1,3,gold,100
2,3,wine,20


In [313]:
users.to_sql('users',con,index=False,if_exists='replace')
items.to_sql('items',con,index=False,if_exists='replace')

3

In [314]:
sql = '''
SELECT t.*, i.item_name, i.value, i.user_id
FROM users t
LEFT JOIN items i ON t.id = i.user_id
'''

In [315]:
select(sql)

Unnamed: 0,id,name,item_name,value,user_id
0,1,gleb,hleb,5.0,1.0
1,2,jon snow,,,
2,3,tyrion,gold,100.0,3.0
3,3,tyrion,wine,20.0,3.0


In [316]:
sql = '''
SELECT t.*, i.item_name 
FROM users t
JOIN items i ON t.id = i.user_id
'''
     


In [317]:
select(sql)

Unnamed: 0,id,name,item_name
0,1,gleb,hleb
1,3,tyrion,gold
2,3,tyrion,wine


In [318]:
sql = '''
SELECT t.*, i.item_name
FROM users t
LEFT JOIN items i on t.id = i.user_id
WHERE i.item_name IS NOT NULL
'''
     


In [319]:
select(sql)

Unnamed: 0,id,name,item_name
0,1,gleb,hleb
1,3,tyrion,gold
2,3,tyrion,wine


# Агрегируй перед тем как джойнить


In [320]:
users = pd.DataFrame({'id':[1,2,3],'name':['gleb','jon snow','tyrion'],
                      'victory':[2,10,1]})

In [321]:
users.to_sql('users',con,index=False,if_exists='replace')

3

In [323]:
sql = '''
SELECT t.*, i.item_name, i.value, i.user_id 
FROM users t
LEFT JOIN items i on t.id = i.user_id
'''
     


In [324]:
t = select(sql)

In [325]:
t

Unnamed: 0,id,name,victory,item_name,value,user_id
0,1,gleb,2,hleb,5.0,1.0
1,2,jon snow,10,,,
2,3,tyrion,1,gold,100.0,3.0
3,3,tyrion,1,wine,20.0,3.0


In [326]:
t['victory'].sum()

14

In [327]:
sql = '''
SELECT sum(t.victory)
FROM users t'''

In [328]:
select(sql)

Unnamed: 0,sum(t.victory)
0,13


In [491]:
sql = '''
SELECT t.*, i.item_name, i.value, i.user_id
FROM users t
JOIN items i on t.id = i.user_id
'''

In [492]:
select(sql)

Unnamed: 0,id,name,victory,item_name,value,user_id
0,1,gleb,2,hleb,5,1
1,3,tyrion,1,gold,100,3
2,3,tyrion,1,wine,20,3


In [329]:
sql = '''
SELECT t.id, t.name, t.victory,
    count(i.item_name) as item_cnt,
    coalesce(sum(i.value),0) as value_sum

FROM users t
LEFT JOIN items i on t.id = i.user_id
GROUP BY t.id, t.name, t.victory
'''
     


In [330]:
select(sql)

Unnamed: 0,id,name,victory,item_cnt,value_sum
0,1,gleb,2,1,5
1,2,jon snow,10,0,0
2,3,tyrion,1,2,120


In [331]:
sql = '''
SELECT t.user_id, 
    count(t.item_name) as item_cnt,
    sum(value) as value_sum 
FROM items t
GROUP BY t.user_id
'''

In [332]:
select(sql)

Unnamed: 0,user_id,item_cnt,value_sum
0,1,1,5
1,3,2,120


In [333]:
sql = '''
WITH items_agg AS (
    SELECT t.user_id,
        count(t.item_name) AS item_cnt,
        sum(value) AS value_sum
    FROM items t
    GROUP BY t.user_id)
SELECT t.id, t.name, t.victory,
    coalesce(i.item_cnt,0) AS item_cnt,
    coalesce(i.value_sum,0) AS value_sum

FROM users t

LEFT JOIN items_agg i ON t.id = i.user_id
'''
     


In [334]:
select(sql)

Unnamed: 0,id,name,victory,item_cnt,value_sum
0,1,gleb,2,1,5
1,2,jon snow,10,0,0
2,3,tyrion,1,2,120


# Как не надо писать джойны


In [335]:
sql = '''
WITH items_agg AS (
    SELECT t.user_id, count(t.item_name) AS item_cnt,
        sum(value) AS value_sum from items t
GROUP BY t.user_id)
SELECT t.id, t.name, t.victory,
    coalesce(item_cnt,0) as item_cnt,
    coalesce(value_sum,0) as value_sum

FROM users t

LEFT JOIN items_agg i on t.id = i.user_id
'''

In [336]:
select(sql)

Unnamed: 0,id,name,victory,item_cnt,value_sum
0,1,gleb,2,1,5
1,2,jon snow,10,0,0
2,3,tyrion,1,2,120


In [337]:
items = pd.DataFrame({'id':[1,3,3],'item_name':['hleb','gold','wine'],'value':[5,100,20]})

In [338]:
items.to_sql('items2',con,index=False,if_exists='replace')

3

In [340]:
sql = '''
SELECT *
FROM users t
LEFT JOIN items i on t.id = i.user_id
'''

In [341]:
select(sql)

Unnamed: 0,id,name,victory,user_id,item_name,value
0,1,gleb,2,1.0,hleb,5.0
1,2,jon snow,10,,,
2,3,tyrion,1,3.0,gold,100.0
3,3,tyrion,1,3.0,wine,20.0


In [342]:
sql = '''
SELECT *
FROM users t
LEFT JOIN items2 i ON t.id=i.id
'''

In [343]:
select(sql)

Unnamed: 0,id,name,victory,id.1,item_name,value
0,1,gleb,2,1.0,hleb,5.0
1,2,jon snow,10,,,
2,3,tyrion,1,3.0,gold,100.0
3,3,tyrion,1,3.0,wine,20.0


In [344]:
sql = '''
SELECT *
FROM users t, items2 i 
WHERE t.id = i.id
'''
     


In [345]:
select(sql)

Unnamed: 0,id,name,victory,id.1,item_name,value
0,1,gleb,2,1,hleb,5
1,3,tyrion,1,3,gold,100
2,3,tyrion,1,3,wine,20


# Никогда не использовать right join!


In [346]:
users = pd.DataFrame({'id':[1,2,3],'name':['gleb','jon snow','tyrion']})

In [347]:
users

Unnamed: 0,id,name
0,1,gleb
1,2,jon snow
2,3,tyrion


In [348]:
items = pd.DataFrame({'user_id':[1,3,3,4],'item_name':['hleb','gold','wine','sword'],'value':[5,100,20,50]})

In [349]:
items

Unnamed: 0,user_id,item_name,value
0,1,hleb,5
1,3,gold,100
2,3,wine,20
3,4,sword,50


In [351]:
users.to_sql('users',con,index=False,if_exists='replace')
items.to_sql('items',con,index=False,if_exists='replace')

4

In [353]:
sql = '''
SELECT t.*, i.*
FROM users t
LEFT JOIN items i on t.id = i.user_id
'''

In [354]:
select(sql)

Unnamed: 0,id,name,user_id,item_name,value
0,1,gleb,1.0,hleb,5.0
1,2,jon snow,,,
2,3,tyrion,3.0,gold,100.0
3,3,tyrion,3.0,wine,20.0


In [355]:
sql = '''
SELECT t.*, u.* 
FROM items t 
LEFT JOIN users u ON t.user_id = u.id
'''

In [356]:
select(sql)

Unnamed: 0,user_id,item_name,value,id,name
0,1,hleb,5,1.0,gleb
1,3,gold,100,3.0,tyrion
2,3,wine,20,3.0,tyrion
3,4,sword,50,,


In [357]:
sql = '''
SELECT t.*, i.*
FROM users t
RIGHT JOIN items i on t.id = i.user_id
'''

In [358]:
select(sql)

Unnamed: 0,id,name,user_id,item_name,value
0,1.0,gleb,1,hleb,5
1,3.0,tyrion,3,gold,100
2,3.0,tyrion,3,wine,20
3,,,4,sword,50


# Full join


In [359]:
sql = '''
SELECT t.*, i.*
FROM users t
FULL JOIN items i ON t.id = i.user_id
'''

In [360]:
select(sql)

Unnamed: 0,id,name,user_id,item_name,value
0,1.0,gleb,1.0,hleb,5.0
1,2.0,jon snow,,,
2,3.0,tyrion,3.0,gold,100.0
3,3.0,tyrion,3.0,wine,20.0
4,,,4.0,sword,50.0


In [522]:


sql = '''select t.*, i.* from users t
left join items i on t.id = i.user_id
union 
select u.*, t.* 
from items t left join users u on t.user_id = u.id
'''
     


In [523]:
select(sql)

Unnamed: 0,id,name,user_id,item_name,value
0,,,4.0,sword,50.0
1,1.0,gleb,1.0,hleb,5.0
2,2.0,jon snow,,,
3,3.0,tyrion,3.0,gold,100.0
4,3.0,tyrion,3.0,wine,20.0


# Inner join


In [362]:
sql = '''
SELECT *
FROM german_credit AS gc 
limit 5
'''

In [363]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,52,male,2,own,quite rich,,936,9,education,0,2008-04-27 08:23:07,200
3,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
4,28,male,2,own,little,,776,12,radio/TV,0,2007-07-21 13:22:14,358


In [364]:
clients = pd.DataFrame({'client_id':[200,45],'data':[1,2]})

In [365]:
clients.to_sql('clients_task_name',con,index=False,if_exists='replace')

2

In [370]:
sql = '''
SELECT gc.*, ctn.data
FROM german_credit AS gc 
JOIN clients_task_name ctn ON gc.client_id = ctn.client_id
'''

In [371]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id,data
0,52,male,2,own,quite rich,,936,9,education,0,2008-04-27 08:23:07,200,1
1,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45,2


In [372]:
sql = '''
SELECT 1 AS user_id
UNION ALL
SELECT 2 AS user_id
UNION ALL
SELECT 3 AS user_id
'''

In [373]:
select(sql)

Unnamed: 0,user_id
0,1
1,2
2,3


In [375]:
sql = '''
SELECT date('2021-03-01') AS month
UNION ALL
SELECT date('2021-04-01') AS month
'''

In [376]:
select(sql)

Unnamed: 0,month
0,2021-03-01
1,2021-04-01


In [377]:
sql = '''
WITH users AS (
    SELECT 1 AS user_id
    UNION ALL
    SELECT 2 AS user_id
    UNION ALL
    SELECT 3 as user_id),

    month AS 
        (SELECT date('2021-03-01') AS month
        UNION ALL
        SELECT date('2021-04-01') AS month)

SELECT *
FROM users t
JOIN month m ON 1=1
'''
     


In [378]:
select(sql)

Unnamed: 0,user_id,month
0,1,2021-03-01
1,2,2021-03-01
2,3,2021-03-01
3,1,2021-04-01
4,2,2021-04-01
5,3,2021-04-01


# Ежемесячный отчет


In [379]:
sql = '''
SELECT *
FROM german_credit AS gc
limit 5'''

In [380]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,52,male,2,own,quite rich,,936,9,education,0,2008-04-27 08:23:07,200
3,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
4,28,male,2,own,little,,776,12,radio/TV,0,2007-07-21 13:22:14,358


In [381]:
transactions = pd.read_csv('/home/antonius/Projects/DS_Projects/Check-ups/5_sql/german_credit_augmented_transactions.csv')

In [383]:
df.head(2)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id,client_id_2
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210,335
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929,382


In [385]:
transactions.head(2)

Unnamed: 0,dt,client_id,amount
0,2008-04-06 11:54:47,950,161.38
1,2007-07-28 00:00:19,418,35.34


In [386]:
transactions.to_sql('client_transactions',con,index=False,if_exists='replace')

4275

In [388]:
sql = '''
SELECT *
FROM client_transactions t
limit 5
'''

In [389]:
select(sql)

Unnamed: 0,dt,client_id,amount
0,2008-04-06 11:54:47,950,161.38
1,2007-07-28 00:00:19,418,35.34
2,2008-03-14 20:43:54,131,146.5
3,2007-12-18 13:03:24,353,119.21
4,2007-11-09 05:18:30,849,105.24


In [392]:
sql = '''
SELECT count(*)
FROM client_transactions t
limit 5'''

In [393]:
select(sql)

Unnamed: 0,count(*)
0,4275


In [395]:
sql = '''
SELECT date(t.dt, 'start of month') AS month,
    count(1) AS transaction_cnt,
    sum(t.amount) AS amount_sum

FROM client_transactions t
GROUP BY 1
ORDER BY 1
'''

In [396]:
select(sql)

Unnamed: 0,month,transaction_cnt,amount_sum
0,2007-05-01,338,450912.77
1,2007-06-01,379,551664.83
2,2007-07-01,304,494134.5
3,2007-08-01,255,426903.23
4,2007-10-01,332,634846.49
5,2007-11-01,389,500420.98
6,2007-12-01,364,561449.89
7,2008-01-01,413,630137.22
8,2008-02-01,228,337043.47
9,2008-03-01,309,425599.09


In [400]:
min = '''
SELECT date(min(t.dt),'start of month')
FROM client_transactions t
'''

In [401]:
select(min)

Unnamed: 0,"date(min(t.dt),'start of month')"
0,2007-05-01


In [402]:
max = '''
SELECT date(max(t.dt),'start of month')
FROM client_transactions t
'''

In [403]:
select(max)

Unnamed: 0,"date(max(t.dt),'start of month')"
0,2008-06-01


In [405]:
sql = f'''
WITH RECURSIVE dates(month) AS (
  VALUES(({min}))
  UNION ALL
  SELECT date(month, '+1 month')
  FROM dates
  WHERE month < ({max})
)
SELECT t.month FROM dates t
'''

In [406]:
select(sql)

Unnamed: 0,month
0,2007-05-01
1,2007-06-01
2,2007-07-01
3,2007-08-01
4,2007-09-01
5,2007-10-01
6,2007-11-01
7,2007-12-01
8,2008-01-01
9,2008-02-01


In [407]:
sql = f'''
WITH dates(month) AS (
  VALUES(({min}))
  UNION ALL
  SELECT date(month, '+1 month')
  FROM dates
  WHERE month < ({max})
),
trans_month AS

    (SELECT date(t.dt, 'start of month') AS month,
        count(1) as transaction_cnt,
        sum(t.amount) as amount_sum

    FROM client_transactions t
    GROUP BY 1
    ORDER BY 1)


SELECT t.month,
    coalesce(tm.transaction_cnt,0) AS transaction_cnt,
    coalesce(tm.amount_sum,0) AS amount_sum

FROM dates t
LEFT JOIN trans_month tm ON t.month = tm.month
ORDER BY t.month
'''

In [408]:
select(sql)

Unnamed: 0,month,transaction_cnt,amount_sum
0,2007-05-01,338,450912.77
1,2007-06-01,379,551664.83
2,2007-07-01,304,494134.5
3,2007-08-01,255,426903.23
4,2007-09-01,0,0.0
5,2007-10-01,332,634846.49
6,2007-11-01,389,500420.98
7,2007-12-01,364,561449.89
8,2008-01-01,413,630137.22
9,2008-02-01,228,337043.47


# Ежемесячный отчет на пользователя


In [409]:
sql = '''
SELECT DISTINCT gc.client_id 
FROM german_credit AS gc
'''

In [410]:
select(sql)

Unnamed: 0,client_id
0,210
1,929
2,200
3,45
4,358
...,...
995,624
996,181
997,730
998,557


In [411]:
sql = '''
SELECT date(ct.dt, 'start of month') AS month, ct.client_id,
    count(1) AS transaction_cnt,
    sum(ct.amount) AS amount_sum

FROM client_transactions AS ct
GROUP BY 1,2
ORDER BY 1
'''

In [412]:
select(sql)

Unnamed: 0,month,client_id,transaction_cnt,amount_sum
0,2007-05-01,101,1,149.67
1,2007-05-01,107,2,217.50
2,2007-05-01,110,1,1081.32
3,2007-05-01,111,1,139.97
4,2007-05-01,113,1,1051.29
...,...,...,...,...
3551,2008-06-01,983,1,10050.37
3552,2008-06-01,987,1,192.48
3553,2008-06-01,992,1,-610.21
3554,2008-06-01,996,1,121.28


In [413]:
sql = f'''
WITH dates(month) AS (
  VALUES(({min}))
  UNION ALL
  SELECT date(month, '+1 month')
  FROM dates
  WHERE month < ({max})
),

clients AS (SELECT DISTINCT gc.client_id 
            FROM german_credit AS gc),

clients_month AS
    (SELECT gc.month, c.client_id 
    FROM dates AS gc
    JOIN clients c on 1=1),

trans_month as 
    (SELECT date(gc.dt, 'start of month') AS month, gc.client_id,
        count(1) AS transaction_cnt,
        sum(gc.amount) AS amount_sum

FROM client_transactions AS gc
GROUP BY 1,2
ORDER BY 1),

client_trans_month AS (
    select t.*,
    tm.transaction_cnt,
    tm.amount_sum,
    1 as user,
        case
            when tm.transaction_cnt > 0 then 1 
            else 0 end AS active

FROM clients_month AS gc
LEFT JOIN trans_month tm ON gc.client_id = tm.client_id
AND gc.month = tm.month
)

SELECT gc.month, sum(gc.user) AS user_cnt, sum(gc.amount_sum) AS amount_sum , 
    sum(gc.active) AS active_cnt 
FROM client_trans_month AS gc
GROUP BY gc.month
'''

In [414]:
t = select(sql)

DatabaseError: Execution failed on sql '
WITH dates(month) AS (
  VALUES((
SELECT date(min(t.dt),'start of month')
FROM client_transactions t
))
  UNION ALL
  SELECT date(month, '+1 month')
  FROM dates
  WHERE month < (
SELECT date(max(t.dt),'start of month')
FROM client_transactions t
)
),

clients AS (SELECT DISTINCT gc.client_id 
            FROM german_credit AS gc),

clients_month AS
    (SELECT gc.month, c.client_id 
    FROM dates AS gc
    JOIN clients c on 1=1),

trans_month as 
    (SELECT date(gc.dt, 'start of month') AS month, gc.client_id,
        count(1) AS transaction_cnt,
        sum(gc.amount) AS amount_sum

FROM client_transactions AS gc
GROUP BY 1,2
ORDER BY 1),

client_trans_month AS (
    select t.*,
    tm.transaction_cnt,
    tm.amount_sum,
    1 as user,
        case
            when tm.transaction_cnt > 0 then 1 
            else 0 end AS active

FROM clients_month AS gc
LEFT JOIN trans_month tm ON gc.client_id = tm.client_id
AND gc.month = tm.month
)

SELECT gc.month, sum(gc.user) AS user_cnt, sum(gc.amount_sum) AS amount_sum , 
    sum(gc.active) AS active_cnt 
FROM client_trans_month AS gc
GROUP BY gc.month
': no such table: t

In [417]:
sql = '''
select sum(t.amount)
from client_transactions AS t
'''

In [419]:
select(sql)

Unnamed: 0,sum(t.amount)
0,6548980.62


In [420]:
t['amount_sum'].sum()

KeyError: 'amount_sum'

In [None]:
t

In [421]:
2 * 365 * 10000

7300000

# Джойн таблицы самой на себя (нарастающий итог)


In [422]:
t = pd.DataFrame({'dt':pd.to_datetime(['2021-04-01','2021-04-02','2021-04-03'],format='%Y-%m-%d'),
                  'revenue':[1,2,3]})

In [423]:
t

Unnamed: 0,dt,revenue
0,2021-04-01,1
1,2021-04-02,2
2,2021-04-03,3


In [570]:
t.to_sql('revenue',con,index=False,if_exists='replace')

3

In [426]:
sql = '''
SELECT t.dt,t.revenue, sum(r.revenue) AS cumsum 
FROM revenue t
JOIN revenue r ON r.dt <= t.dt 
GROUP BY t.dt, t.revenue
'''

In [427]:
select(sql)

Unnamed: 0,dt,revenue,cumsum
0,1,1,4
1,1,3,4
2,2,2,10
3,2,4,10
4,3,3,18
5,3,5,18
6,4,4,28
7,4,6,28
8,5,5,40
9,5,7,40


# Оконные функции


In [428]:
sql = '''
SELEct t.*,
    sum(t.revenue) OVER (ORDER BY t.dt) AS cum_sum
FROM revenue t
'''

In [429]:
select(sql)

Unnamed: 0,user_id,dt,revenue,cum_sum
0,1,1,1,4
1,2,1,3,4
2,1,2,2,10
3,2,2,4,10
4,1,3,3,18
5,2,3,5,18
6,1,4,4,28
7,2,4,6,28
8,1,5,5,40
9,2,5,7,40


In [430]:
t = pd.DataFrame({'user_id':[1,1,1,2,2,2],'dt':pd.to_datetime(['2021-04-01','2021-04-02','2021-04-03',
                                                               '2021-04-01','2021-04-02','2021-04-03'],format='%Y-%m-%d'),
                  'revenue':[1,2,3,2,3,4]})

In [431]:
t

Unnamed: 0,user_id,dt,revenue
0,1,2021-04-01,1
1,1,2021-04-02,2
2,1,2021-04-03,3
3,2,2021-04-01,2
4,2,2021-04-02,3
5,2,2021-04-03,4


In [432]:
t.to_sql('revenue',con,index=False,if_exists='replace')

6

In [433]:
sql = '''
SELECT t.*,
    sum(t.revenue) OVER (PARTITION BY t.user_id ORDER BY t.dt) AS cum_sum
FROM revenue t
'''

In [434]:
select(sql)

Unnamed: 0,user_id,dt,revenue,cum_sum
0,1,2021-04-01 00:00:00,1,1
1,1,2021-04-02 00:00:00,2,3
2,1,2021-04-03 00:00:00,3,6
3,2,2021-04-01 00:00:00,2,2
4,2,2021-04-02 00:00:00,3,5
5,2,2021-04-03 00:00:00,4,9


## Rank и row_number


In [435]:
t = pd.DataFrame({'user_id':[1,1,1,1,2,2,2],'dt':pd.to_datetime(['2021-04-01','2021-04-02','2021-04-03','2021-04-03',
                                                               '2021-04-03','2021-04-04','2021-04-05'],format='%Y-%m-%d'),
                  'revenue':[1,2,3,1,2,3,4]})

In [436]:
t.to_sql('revenue',con,index=False,if_exists='replace')

7

In [437]:
sql = '''
SELECT *
FROM revenue t
'''

In [438]:
select(sql)

Unnamed: 0,user_id,dt,revenue
0,1,2021-04-01 00:00:00,1
1,1,2021-04-02 00:00:00,2
2,1,2021-04-03 00:00:00,3
3,1,2021-04-03 00:00:00,1
4,2,2021-04-03 00:00:00,2
5,2,2021-04-04 00:00:00,3
6,2,2021-04-05 00:00:00,4


In [439]:
sql = '''
SELECT t.*,

rank() OVER (PARTITION BY t.user_id ORDER BY t.dt DESC) AS rnk

FROM revenue t
'''

In [440]:
select(sql)

Unnamed: 0,user_id,dt,revenue,rnk
0,1,2021-04-03 00:00:00,3,1
1,1,2021-04-03 00:00:00,1,1
2,1,2021-04-02 00:00:00,2,3
3,1,2021-04-01 00:00:00,1,4
4,2,2021-04-05 00:00:00,4,1
5,2,2021-04-04 00:00:00,3,2
6,2,2021-04-03 00:00:00,2,3


In [441]:
sql = '''
WITH dt_rank AS (

SELECT t.*,

rank() OVER (PARTITION BY t.user_id ORDER BY t.dt DESC) AS rnk

FROM revenue t)
 
SELECT * FROM dt_rank t
WHERE t.rnk = 1
'''

In [442]:
select(sql)

Unnamed: 0,user_id,dt,revenue,rnk
0,1,2021-04-03 00:00:00,3,1
1,1,2021-04-03 00:00:00,1,1
2,2,2021-04-05 00:00:00,4,1


In [443]:
sql = '''
SELECT t.*,

row_number() OVER (PARTITION BY t.user_id ORDER BY t.dt DESC) AS rnk

FROM revenue t
'''

In [444]:
select(sql)

Unnamed: 0,user_id,dt,revenue,rnk
0,1,2021-04-03 00:00:00,3,1
1,1,2021-04-03 00:00:00,1,2
2,1,2021-04-02 00:00:00,2,3
3,1,2021-04-01 00:00:00,1,4
4,2,2021-04-05 00:00:00,4,1
5,2,2021-04-04 00:00:00,3,2
6,2,2021-04-03 00:00:00,2,3


In [593]:
sql = '''
with dt_rank as (

select t.*,

row_number() over (partition by t.user_id order by t.dt desc) as rnk

 from revenue t)
 
 select * from dt_rank t
where t.rnk = 1
 
 '''

In [445]:
select(sql)

Unnamed: 0,user_id,dt,revenue,rnk
0,1,2021-04-03 00:00:00,3,1
1,1,2021-04-03 00:00:00,1,2
2,1,2021-04-02 00:00:00,2,3
3,1,2021-04-01 00:00:00,1,4
4,2,2021-04-05 00:00:00,4,1
5,2,2021-04-04 00:00:00,3,2
6,2,2021-04-03 00:00:00,2,3


In [446]:
t = pd.DataFrame({'user_id':[1,1,1,2,2,2],'dt':pd.to_datetime(['2021-04-01','2021-04-02','2021-04-03',
                                                               '2021-04-03','2021-04-04','2021-04-05'],format='%Y-%m-%d'),
                  'revenue':[1,2,3,2,3,4]})

In [447]:
t.to_sql('revenue',con,index=False,if_exists='replace')

6

In [448]:
sql = '''
SELECT t.user_id, max(t.dt) AS max_dt 
FROM revenue  AS t
GROUP BY t.user_id
'''     

In [449]:
select(sql)

Unnamed: 0,user_id,max_dt
0,1,2021-04-03 00:00:00
1,2,2021-04-05 00:00:00


In [450]:
sql = '''
WITH last_dt AS (
    SELECT t.user_id, max(t.dt) AS max_dt
    FROM revenue  AS t
    GROUP BY t.user_id)

SELECT t.*
FROM revenue AS t
JOIN last_dt ld ON t.user_id = ld.user_id AND t.dt = ld.max_dt
'''

In [451]:
select(sql)

Unnamed: 0,user_id,dt,revenue
0,1,2021-04-03 00:00:00,3
1,2,2021-04-05 00:00:00,4


# Топ 3 зарплаты в отделе


In [452]:
t = pd.DataFrame({'dep':['a','a','a','a','a',
                         'b','b','b','b','b'],
                  'emp':['aa','bb','cc','dd','ee',
                         'aa','bb','cc','dd','ee'],
                  'sal':[5,5,3,2,1,
                         5,4,3,2,1]})

In [453]:
t

Unnamed: 0,dep,emp,sal
0,a,aa,5
1,a,bb,5
2,a,cc,3
3,a,dd,2
4,a,ee,1
5,b,aa,5
6,b,bb,4
7,b,cc,3
8,b,dd,2
9,b,ee,1


In [454]:
t.to_sql('salary',con,index=False,if_exists='replace')

10

In [455]:
sql = '''
SELECT t.*,
rank() OVER (PARTITION BY t.dep ORDER BY t.sal DESC) AS rnk_rank,
dense_rank() OVER (PARTITION BY t.dep ORDER BY t.sal DESC) AS rnk

FROM salary t
'''

In [456]:
select(sql)

Unnamed: 0,dep,emp,sal,rnk_rank,rnk
0,a,aa,5,1,1
1,a,bb,5,1,1
2,a,cc,3,3,2
3,a,dd,2,4,3
4,a,ee,1,5,4
5,b,aa,5,1,1
6,b,bb,4,2,2
7,b,cc,3,3,3
8,b,dd,2,4,4
9,b,ee,1,5,5


In [457]:
sql = '''
WITH salary_rnk as (
    SELECT t.*,
        dense_rank() OVER (PARTITION BY t.dep ORDER BY t.sal DESC) AS rnk
    FROM salary t
    )
 
SELECT *
FROM salary_rnk AS t

WHERE t.rnk <= 3
'''     

In [458]:
select(sql)

Unnamed: 0,dep,emp,sal,rnk
0,a,aa,5,1
1,a,bb,5,1
2,a,cc,3,2
3,a,dd,2,3
4,b,aa,5,1
5,b,bb,4,2
6,b,cc,3,3


# Клиентские сессии


In [459]:
user1 = pd.DataFrame({'user_id':[1,1,1,1,1],
                  'dt':pd.to_datetime(['2021-04-01 07:31','2021-04-01 07:35',
                                       '2021-04-01 08:20','2021-04-01 12:31',
                                       '2021-04-03 07:31'],format='%Y-%m-%d %H:%M')})

In [460]:
user1

Unnamed: 0,user_id,dt
0,1,2021-04-01 07:31:00
1,1,2021-04-01 07:35:00
2,1,2021-04-01 08:20:00
3,1,2021-04-01 12:31:00
4,1,2021-04-03 07:31:00


In [461]:
user2 = pd.DataFrame({'user_id':[2,2,2,2],
                  'dt':pd.to_datetime(['2021-04-01 07:31','2021-04-01 07:35',
                                       '2021-04-01 08:20','2021-04-01 9:10',
                                       ],format='%Y-%m-%d %H:%M')})

In [462]:
user3 = pd.DataFrame({'user_id':[3,3,3],
                  'dt':pd.to_datetime(['2021-04-01 07:31','2021-04-02 07:35',
                                       '2021-04-03 08:20'
                                       ],format='%Y-%m-%d %H:%M')})

In [463]:
t = pd.concat([user1,user2,user3])

In [464]:
t

Unnamed: 0,user_id,dt
0,1,2021-04-01 07:31:00
1,1,2021-04-01 07:35:00
2,1,2021-04-01 08:20:00
3,1,2021-04-01 12:31:00
4,1,2021-04-03 07:31:00
0,2,2021-04-01 07:31:00
1,2,2021-04-01 07:35:00
2,2,2021-04-01 08:20:00
3,2,2021-04-01 09:10:00
0,3,2021-04-01 07:31:00


In [465]:
t.to_sql('client_log',con,index=False,if_exists='replace')

12

In [468]:
sql = '''
SELECT *
FROM client_log t
'''

In [469]:
select(sql)

Unnamed: 0,user_id,dt
0,1,2021-04-01 07:31:00
1,1,2021-04-01 07:35:00
2,1,2021-04-01 08:20:00
3,1,2021-04-01 12:31:00
4,1,2021-04-03 07:31:00
5,2,2021-04-01 07:31:00
6,2,2021-04-01 07:35:00
7,2,2021-04-01 08:20:00
8,2,2021-04-01 09:10:00
9,3,2021-04-01 07:31:00


In [470]:
24 * 60 * 60

86400

In [477]:
sql = '''
WITH new_session AS (
    SELECT t.*,
        lag(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt) AS prev_dt,
round((julianday(t.dt) - julianday(lag(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * 24 * 60 * 60) AS dt_diff,


CASE 
    WHEN round((julianday(t.dt) - julianday(lag(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * 24 * 60 * 60) >= 3600
    THEN 1 else 0 end as new_session

FROM client_log AS t),

client_sessions AS (
    SELECT t.*, 
        sum(t.new_session) over (partition by t.user_id order by t.dt) as session_id
 
    FROM new_session AS t),
        client_sessions_agg AS (
          SELECT t.user_id, t.session_id, count(1) AS action_cnt 
          FROM client_sessions AS t
          GROUP BY t.user_id, t.session_id)


SELECT count(*) 
FROM client_sessions_agg AS t
'''

In [478]:
select(sql)

Unnamed: 0,count(*)
0,7


# Скользящее среднее


In [479]:
t = pd.DataFrame({'user_id':[1,1,1,1,1,1,
                             2,2,2,2,2],
                  'dt':[1,2,3,4,5,6,
                        1,2,3,4,5],
                  'revenue':[1,2,3,4,5,6,
                             3,4,5,6,7]})

In [480]:
t.to_sql('revenue',con,index=False,if_exists='replace')

11

In [481]:
sql = '''
SELECT t.*,
    avg(t.revenue) OVER (PARTITION BY t.user_id ORDER BY t.dt
    rows between 2 preceding and current row
    ) AS moving_avg

FROM revenue AS t
'''

In [482]:
select(sql)

Unnamed: 0,user_id,dt,revenue,moving_avg
0,1,1,1,1.0
1,1,2,2,1.5
2,1,3,3,2.0
3,1,4,4,3.0
4,1,5,5,4.0
5,1,6,6,5.0
6,2,1,3,3.0
7,2,2,4,3.5
8,2,3,5,4.0
9,2,4,6,5.0
