# Working notebook with examples

In [88]:
import pandas as pd
import sqlite3
import numpy as np

In [2]:
con = sqlite3.connect('C:/Users/79295/Desktop/GITHUB/SQL/SQL для анализа данных/databases/german_credit.db')

In [17]:
df = pd.read_csv('C:/Users/79295/Desktop/GITHUB/SQL/SQL для анализа данных/datasets/german_credit_augmented.csv')

In [4]:
# Parse the contract_dt strings ("YYYY-MM-DD HH:MM:SS") into datetime values.
df['contract_dt'] = pd.to_datetime(df['contract_dt'], format='%Y-%m-%d %H:%M:%S')

In [5]:
# Выгружаем нашу табличку в базу данных

df.to_sql('german_credit', con, index=False,if_exists='replace')

1000

In [6]:
# Заворачиваем функцию  pd.read_sql(sql,con) во вспомогательную функцию select(sql), чтобы меньше печатать 

def select(sql, params=None):
    """Run a SQL query on the notebook's SQLite connection and return a DataFrame.

    Thin wrapper around ``pd.read_sql`` that saves retyping the connection.

    Args:
        sql: Text of the query to execute.
        params: Optional values bound to placeholders in ``sql``; passed
            through to ``pd.read_sql`` unchanged. Defaults to ``None``
            (no bound parameters), which matches the previous behaviour.

    Returns:
        The query result as a ``pandas.DataFrame``.
    """
    # `con` is the module-level connection opened earlier in the notebook.
    return pd.read_sql(sql, con, params=params)

In [35]:
# Rows with age < 23, purpose 'car' and housing 'own', plus age * 3 as age_mult3.
# String values are single-quoted: SQLite resolves a double-quoted token as an
# identifier first and only falls back to a string when no such column exists.
sql = '''select
            t.*,
            t.age * 3 as age_mult3
        from german_credit t
        where t.age < 23
            and t.purpose = 'car'
            and t.housing = 'own'
        '''
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id,age_mult3
0,22,male,2,own,rich,moderate,1007,12,car,0,2007-07-17 12:33:24,67,66


In [36]:
# Rows with age < 22, purpose 'car' and housing 'own' or 'rent', plus age * 3
# as age_mult3. String values are single-quoted: SQLite resolves a
# double-quoted token as an identifier first and only falls back to a string
# when no such column exists.
sql = '''select
            t.*,
            t.age * 3 as age_mult3
        from german_credit t
        where t.age < 22
            and t.purpose = 'car'
            and t.housing in ('own', 'rent')
        '''
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id,age_mult3
0,21,female,2,rent,little,,2570,27,car,1,2008-06-14 22:03:03,724,63
1,20,female,2,rent,,,4675,12,car,0,2008-06-06 19:31:48,296,60
2,21,male,2,rent,little,moderate,2779,18,car,0,2008-06-06 15:51:39,425,63
3,20,female,2,rent,rich,,3186,15,car,0,2008-05-06 18:34:56,420,60
4,20,female,1,rent,little,moderate,2718,24,car,1,2008-04-12 12:34:40,593,60


# Saving query result to new table

In [39]:
cur = con.cursor()

In [50]:
# Создадим запрос, в котором сначала будем дропать табличку, чтобы оставалась возможность перезапуска

In [46]:
# Script that (re)creates my_table from the filtered german_credit rows.
# The table is dropped first so the script can be re-run without failing on an
# existing table. String values are single-quoted: SQLite resolves a
# double-quoted token as an identifier first and only falls back to a string
# when no such column exists.
sql = '''
    drop table if exists my_table;
    create table
        my_table as
    select
        t.*,
        t.age * 3 as age_mult3
    from
        german_credit t
    where t.age < 22
        and t.purpose = 'car'
        and t.housing in ('own', 'rent')
    '''

In [47]:
cur.executescript(sql)

<sqlite3.Cursor at 0x246ab089110>

In [48]:
sql = '''
select *
from my_table
'''

In [49]:
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id,age_mult3
0,21,female,2,rent,little,,2570,27,car,1,2008-06-14 22:03:03,724,63
1,20,female,2,rent,,,4675,12,car,0,2008-06-06 19:31:48,296,60
2,21,male,2,rent,little,moderate,2779,18,car,0,2008-06-06 15:51:39,425,63
3,20,female,2,rent,rich,,3186,15,car,0,2008-05-06 18:34:56,420,60
4,20,female,1,rent,little,moderate,2718,24,car,1,2008-04-12 12:34:40,593,60


# Union all

In [53]:
jan = pd.DataFrame({"month":['jan','jan'],"revenue": [100, 150]})
feb = pd.DataFrame({"month":['feb','feb'],"revenue": [120, 160]})

In [56]:
jan.to_sql('jan',con,index=False,if_exists='replace')
feb.to_sql('feb',con,index=False,if_exists='replace')

2

In [61]:
sql='''
select * 
from jan t
union all
select *
from feb t
'''

In [62]:
select(sql)

Unnamed: 0,month,revenue
0,jan,100
1,jan,150
2,feb,120
3,feb,160


# Getting string letters

In [67]:
# substr(column,x,y) где x - порядковый номер символа в строке, y - кол-во символов
sql = '''select
        substr(t.sex,1,1)
        from german_credit t
        
        '''
select(sql)

Unnamed: 0,"substr(t.sex,1,1)"
0,m
1,m
2,m
3,f
4,m
...,...
995,m
996,m
997,m
998,f


# Group by

In [72]:
sql='''
select 
t.sex, 
    count(1) as total_count, 
        round(avg(credit_amount),2) as avg_credit
from
    german_credit t
group by 
    t.sex
'''
select(sql)

Unnamed: 0,sex,total_count,avg_credit
0,female,310,2877.77
1,male,690,3448.04


# Null / NaN values

In [75]:
sql = '''
select count(t.checking_account),count(1)
from
german_credit t

'''
select(sql)

Unnamed: 0,count(t.checking_account),count(1)
0,606,1000


In [76]:
sql='''
select 
    t.checking_account, 
    count(1) as total_count, 
    round(avg(credit_amount),2) as avg_credit
from
    german_credit t
group by 
    t.checking_account
'''
select(sql)

Unnamed: 0,checking_account,total_count,avg_credit
0,,394,3133.1
1,little,274,3175.22
2,moderate,269,3827.56
3,rich,63,2177.65


In [78]:
df.groupby('checking_account').credit_amount.count()

checking_account
little      274
moderate    269
rich         63
Name: credit_amount, dtype: int64

In [79]:
df.groupby('checking_account',dropna=False).credit_amount.count()

checking_account
little      274
moderate    269
rich         63
NaN         394
Name: credit_amount, dtype: int64

In [86]:
# Count the rows whose checking_account is NULL.
# NOTE(review): t.checking_account is selected next to an aggregate without a
# GROUP BY, so the query yields a single row and, per SQLite's bare-column
# behaviour, the value in that column is taken from an arbitrary row.
# Only count_none is meaningful here.
sql='''
select 
    t.checking_account,
    sum(case when t.checking_account is null then 1 else 0 end) as count_none
from
    german_credit t
'''
select(sql)

Unnamed: 0,checking_account,count_none
0,,394


In [89]:
t = pd.DataFrame({'col1':[1,np.nan,2]})

In [90]:
t

Unnamed: 0,col1
0,1.0
1,
2,2.0


In [91]:
# Write the frame to the null_test table. The result of to_sql is deliberately
# not assigned back to `t`, so `t` keeps referring to the DataFrame.
t.to_sql('null_test', con, index=False, if_exists='replace')

In [92]:
sql = '''
select *
from
null_test t
'''
select(sql)

Unnamed: 0,col1
0,1.0
1,
2,2.0


In [93]:
sql = '''
select 
count(t.col1)
from
null_test t
'''
select(sql)

Unnamed: 0,count(t.col1)
0,2


In [94]:
sql = '''
select 
count(1)
from
null_test t
'''
select(sql)

Unnamed: 0,count(1)
0,3


In [95]:
sql = '''
select 
sum(t.col1)
from
null_test t
'''
select(sql)

Unnamed: 0,sum(t.col1)
0,3.0


In [96]:
sql = '''
select 
avg(t.col1)
from
null_test t
'''
select(sql)

Unnamed: 0,avg(t.col1)
0,1.5


In [97]:
# Заменим пропуски

In [98]:
sql = '''
select 
*
from
german_credit t
'''
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,52,male,2,own,quite rich,,936,9,education,0,2008-04-27 08:23:07,200
3,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
4,28,male,2,own,little,,776,12,radio/TV,0,2007-07-21 13:22:14,358
...,...,...,...,...,...,...,...,...,...,...,...,...
995,65,male,2,free,little,little,2600,18,radio/TV,1,2007-12-16 20:17:19,624
996,30,male,3,own,little,moderate,4455,36,business,1,2007-07-12 14:08:58,181
997,33,male,2,own,little,moderate,6403,24,radio/TV,0,2008-04-08 03:24:26,730
998,29,female,2,own,,,5003,21,car,1,2007-11-29 15:51:45,557


In [101]:
sql = '''
select 
t.checking_account,
coalesce(t.checking_account,'no_info')
from
german_credit t
'''
select(sql)

Unnamed: 0,checking_account,"coalesce(t.checking_account,'no_info')"
0,,no_info
1,little,little
2,,no_info
3,,no_info
4,,no_info
...,...,...
995,little,little
996,moderate,moderate
997,moderate,moderate
998,,no_info


In [102]:
sql = '''
select 
coalesce(col1,0)
from
null_test t
'''
select(sql)

Unnamed: 0,"coalesce(col1,0)"
0,1.0
1,0.0
2,2.0


# coalesce по строке

In [107]:
t = pd.DataFrame({'col1':[1,np.nan,2],
                 'col2':[np.nan,np.nan,1],
                 'col3':[2,3,7]})

In [108]:
# Replace the null_test table with the three-column frame. The result of
# to_sql is deliberately not assigned back to `t`, so `t` keeps referring to
# the DataFrame.
t.to_sql('null_test', con, index=False, if_exists='replace')

In [109]:
sql = '''
select *
from
null_test t
'''
select(sql)

Unnamed: 0,col1,col2,col3
0,1.0,,2
1,,,3
2,2.0,1.0,7


In [110]:
sql = '''
select 
coalesce(t.col1, t.col2, t.col3)
from
null_test t
'''
select(sql)

Unnamed: 0,"coalesce(t.col1, t.col2, t.col3)"
0,1.0
1,3.0
2,2.0


In [111]:
# Видим, что запрос построчно нашел первые не-NaN значения и вывел их

# Дубликаты

In [112]:
t = pd.DataFrame({'id':[1,1,2],
                 'name':['a', 'a', 'b']})

In [113]:
# Write the frame to the dupl_test table. The result of to_sql is deliberately
# not assigned back to `t`, so `t` keeps referring to the DataFrame.
t.to_sql('dupl_test', con, index=False, if_exists='replace')

In [114]:
sql = '''
select *
from
dupl_test t
'''
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,a
2,2,b


In [115]:
# имеем 2 полных дубликата с индексами 0 и 1
# Сгруппируем по всем полям

In [116]:
sql = '''
select t.id, t.name, count(1) as cnt
from
dupl_test t
group by t.id, t.name
'''
select(sql)

Unnamed: 0,id,name,cnt
0,1,a,2
1,2,b,1


In [119]:
sql = '''
select 
    t.id, t.name, count(1) as cnt
from
    dupl_test t
group by 
    t.id, t.name
having cnt > 1
'''
select(sql)

Unnamed: 0,id,name,cnt
0,1,a,2


In [120]:
t = pd.DataFrame({'id':[1,1,2,2,3],
                 'name':['a', 'b', 'c', 'd', 'e']})

In [121]:
t

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d
4,3,e


In [122]:
# Write the frame to the dupl_test2 table. The result of to_sql is deliberately
# not assigned back to `t`, so `t` keeps referring to the DataFrame.
t.to_sql('dupl_test2', con, index=False, if_exists='replace')

In [123]:
sql = '''
select *
from
dupl_test2 t

'''
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d
4,3,e


In [124]:
sql = '''
select 
    t.id, t.name, count(1) as cnt
from
    dupl_test2 t
group by 
    t.id, t.name
having cnt > 1
'''
select(sql)

Unnamed: 0,id,name,cnt


In [126]:
# Найдем повторяющиеся id

In [127]:
sql = '''
select 
    t.id, count(1) as cnt
from
    dupl_test2 t
group by 
    t.id
having cnt > 1
'''
select(sql)

Unnamed: 0,id,cnt
0,1,2
1,2,2


In [128]:
sql = '''
select *
from
dupl_test2 t
where t.id in(1,2)

'''
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d


In [130]:
sql = '''
select 
    t.id
from
    dupl_test2 t
group by 
    t.id
having count(1) > 1
'''
select(sql)

Unnamed: 0,id
0,1
1,2


In [131]:
sql = '''
select *
from
dupl_test2 t
where t.id in(  select t.id
                from
                dupl_test2 t
                group by t.id
                having count(1) > 1)

'''
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d


In [133]:
# Агрегация договоров по месяцам

In [132]:
sql = '''
select *
from
german_credit
'''
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,52,male,2,own,quite rich,,936,9,education,0,2008-04-27 08:23:07,200
3,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
4,28,male,2,own,little,,776,12,radio/TV,0,2007-07-21 13:22:14,358
...,...,...,...,...,...,...,...,...,...,...,...,...
995,65,male,2,free,little,little,2600,18,radio/TV,1,2007-12-16 20:17:19,624
996,30,male,3,own,little,moderate,4455,36,business,1,2007-07-12 14:08:58,181
997,33,male,2,own,little,moderate,6403,24,radio/TV,0,2008-04-08 03:24:26,730
998,29,female,2,own,,,5003,21,car,1,2007-11-29 15:51:45,557


In [141]:
# Per-month contract statistics: row count, total and average credit amount
# (average rounded to 2 decimals) and the number of distinct client_id values.
# date(..., 'start of month') maps each contract_dt to the first day of its
# month; that value is used as the grouping and ordering key.
sql = '''
select 
    date(t.contract_dt,'start of month') as month,
    count(1),
    sum(t.credit_amount) as credit_amount_sum,
    round(avg(t.credit_amount),2) as credit_amount_avg,
    count(distinct t.client_id) as client_id_unique
from
    german_credit t
group by 
    date(t.contract_dt,'start of month')
order by 
    date(t.contract_dt,'start of month')
'''
select(sql)

Unnamed: 0,month,count(1),credit_amount_sum,credit_amount_avg,client_id_unique
0,2007-05-01,81,207663,2563.74,81
1,2007-06-01,74,239594,3237.76,74
2,2007-07-01,71,224333,3159.62,71
3,2007-08-01,57,178569,3132.79,57
4,2007-09-01,58,186909,3222.57,58
5,2007-10-01,70,188534,2693.34,70
6,2007-11-01,87,300504,3454.07,87
7,2007-12-01,77,273973,3558.09,77
8,2008-01-01,93,288080,3097.63,93
9,2008-02-01,55,211128,3838.69,55


# Intervals

In [148]:
# Label every credit_amount with the interval it falls into.
# CASE branches are evaluated top-down, so each "<" bound implicitly carries
# the previous branch's lower bound; 'other' is reached only when no
# comparison is true (e.g. a NULL amount).
# The alias was misspelled "credeit_amount_bin"; it is now credit_amount_bin.
sql = '''
select
    t.credit_amount,
    case
        when t.credit_amount < 1000 then '1. <1000'
        when t.credit_amount < 2000 then '2. 1000-2000'
        when t.credit_amount < 3000 then '3. 2000-3000'
        when t.credit_amount >= 3000 then '4. >=3000'
        else 'other'
    end as credit_amount_bin
from
    german_credit t

'''
select(sql)

Unnamed: 0,credit_amount,credeit_amount_bin
0,3074,4. >=3000
1,1344,2. 1000-2000
2,936,1. <1000
3,1393,2. 1000-2000
4,776,1. <1000
...,...,...
995,2600,3. 2000-3000
996,4455,4. >=3000
997,6403,4. >=3000
998,5003,4. >=3000


In [154]:
# Count credits per credit_amount interval.
# CASE branches are evaluated top-down, so each "<" bound implicitly carries
# the previous branch's lower bound; 'other' is reached only when no
# comparison is true (e.g. a NULL amount).
# The alias was misspelled "credeit_amount_bin"; it is now credit_amount_bin
# in both the select list and the GROUP BY.
sql = '''
select
    case
        when t.credit_amount < 1000 then '1. <1000'
        when t.credit_amount < 2000 then '2. 1000-2000'
        when t.credit_amount < 3000 then '3. 2000-3000'
        when t.credit_amount >= 3000 then '4. >=3000'
        else 'other'
    end as credit_amount_bin,
    count(1) as credit_cnt
from
    german_credit t
group by
    credit_amount_bin
'''
select(sql)

Unnamed: 0,credeit_amount_bin,credit_cnt
0,1. <1000,116
1,2. 1000-2000,316
2,3. 2000-3000,188
3,4. >=3000,380


# Pivot table

In [157]:
sql = '''
select
    t.housing,
    count(1) as cnt
from german_credit t
group by 
    t.housing

'''
select(sql)

Unnamed: 0,housing,cnt
0,free,108
1,own,713
2,rent,179


In [163]:
sql = '''
select
    t.housing,
    count(case when t.sex='female' then 1 else null end) as female,
    count(case when t.sex='male' then 1 else null end) as male,
    count(1) as cnt
from german_credit t
group by 
    t.housing
'''
select(sql)

Unnamed: 0,housing,female,male,cnt
0,free,19,89,108
1,own,196,517,713
2,rent,95,84,179


In [166]:
sql = '''
select
distinct t.purpose
from german_credit t
'''
# Collect the distinct purpose values returned by the query into a plain list.
purpose = select(sql)['purpose'].tolist()

In [167]:
purpose

['radio/TV',
 'car',
 'education',
 'furniture/equipment',
 'repairs',
 'business',
 'domestic appliances',
 'vacation/others']

In [181]:
# Print one conditional-count column per purpose, ready to paste into a query.
# The alias is the purpose lower-cased with spaces and slashes removed.
strip_chars = str.maketrans('', '', ' /')
for p in purpose:
    alias = p.lower().translate(strip_chars)
    print(f"count(case when t.purpose = '{p}' then 1 else null end) as {alias},")

count(case when t.purpose = 'radio/TV' then 1 else null end) as radiotv,
count(case when t.purpose = 'car' then 1 else null end) as car,
count(case when t.purpose = 'education' then 1 else null end) as education,
count(case when t.purpose = 'furniture/equipment' then 1 else null end) as furnitureequipment,
count(case when t.purpose = 'repairs' then 1 else null end) as repairs,
count(case when t.purpose = 'business' then 1 else null end) as business,
count(case when t.purpose = 'domestic appliances' then 1 else null end) as domesticappliances,
count(case when t.purpose = 'vacation/others' then 1 else null end) as vacationothers,


In [184]:
sql = '''
select
    t.housing,
    count(case when t.purpose = 'radio/TV' then 1 else null end) as radiotv,
    count(case when t.purpose = 'car' then 1 else null end) as car,
    count(case when t.purpose = 'education' then 1 else null end) as education,
    count(case when t.purpose = 'furniture/equipment' then 1 else null end) as furnitureequipment,
    count(case when t.purpose = 'repairs' then 1 else null end) as repairs,
    count(case when t.purpose = 'business' then 1 else null end) as business,
    count(case when t.purpose = 'domestic appliances' then 1 else null end) as domesticappliances,
    count(case when t.purpose = 'vacation/others' then 1 else null end) as vacationothers,
    count(1) as cnt
from german_credit t
group by 
    t.housing
'''
select(sql)

Unnamed: 0,housing,radiotv,car,education,furnitureequipment,repairs,business,domesticappliances,vacationothers,cnt
0,free,15,55,15,11,3,5,0,4,108
1,own,227,219,34,122,17,76,10,8,713
2,rent,38,63,10,48,2,16,2,0,179


# Подзапросы

In [196]:
t = pd.DataFrame({'id':[1,1,2,2,3],
                 'name':['a', 'b', 'c', 'd', 'e']})

In [197]:
t

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d
4,3,e


In [198]:
# Replace the dupl_test table with this frame. The result of to_sql is
# deliberately not assigned back to `t`, so `t` keeps referring to the
# DataFrame.
t.to_sql('dupl_test', con, index=False, if_exists='replace')

In [199]:
sql = '''
select *
from
dupl_test t

'''
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d
4,3,e


In [203]:
sql = '''
select 
    t.id,
    count(1)
from
    dupl_test t
group by t.id
having 
    count(1) > 1

'''
select(sql)

Unnamed: 0,id,count(1)
0,1,2
1,2,2


In [204]:
sql = '''
select 
    t.id
from
    dupl_test t
group by t.id
having 
    count(1) > 1

'''
select(sql)

Unnamed: 0,id
0,1
1,2


In [205]:
sql = '''
select *
from
dupl_test t
where t.id in (select 
                    t.id
                from
                    dupl_test t
                group by t.id
                having 
                    count(1) > 1)

'''
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d


*то же самое можно сделать, сохранив результат подзапроса в промежуточную таблицу, а затем обратиться к ней*

In [207]:
sql = '''
drop table if exists dupls;
create table dupls as 
    select 
        t.id
    from
        dupl_test t
    group by t.id
    having 
        count(1) > 1
'''


In [208]:
cur.executescript(sql)

<sqlite3.Cursor at 0x246ab089110>

In [210]:
sql = '''
select * from dupls
'''
select(sql)

Unnamed: 0,id
0,1
1,2


In [211]:
sql = '''
select *
from
dupl_test t
where t.id in dupls

'''
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d


# CTE (with)

In [215]:
sql = '''
select 
    * 
from 
    (select 
        * 
    from
        (select 
            t.id, count(1) as cnt 
        from 
            dupl_test t 
        group by t.id) t
    where t.cnt > 1) t

where t.id = 1

'''
select(sql)

Unnamed: 0,id,cnt
0,1,2


*перепишем это с использованием CTE*

In [220]:
sql = '''
with id_cnt as 
            (select t.id, count(1) as cnt 
            from dupl_test t 
            group by t.id), 

id_cnt_2 as (select *
            from id_cnt t
            where t.cnt > 1)
select * 
from id_cnt_2 t
where t.id = 1
'''
select(sql)

Unnamed: 0,id,cnt
0,1,2


# Join


In [221]:
users = pd.DataFrame({"id":[1,2,3,],
                    "name":['aaron', 'sergo', 'ivan']})

In [233]:
items = pd.DataFrame({"user_id":[1,2,2,],
                    "item_name":['beer', 'pizza', 'vodka'],
                     'value':[100, 200,120]})

In [241]:
users.to_sql('users', con,index=False, if_exists='replace')
items.to_sql('items', con,index=False, if_exists='replace')

3

In [235]:
sql = '''
select *
from users t 
'''
select(sql)

Unnamed: 0,id,name
0,1,aaron
1,2,sergo
2,3,ivan


In [236]:
sql = '''
select t.*, i.item_name
from users t 
left join items i
on t.id = i.user_id
'''
select(sql)

Unnamed: 0,id,name,item_name
0,1,aaron,beer
1,2,sergo,pizza
2,2,sergo,vodka
3,3,ivan,


In [238]:
# Left join followed by a filter on a column of the joined table.
# The WHERE clause removes the users row that has no matching item (its item
# columns are NULL), so only users with at least one item remain in the result.
sql = '''
select t.*, i.item_name, i.value
from users t 
left join items i
on t.id = i.user_id
where i.value is not null
'''
select(sql)

Unnamed: 0,id,name,item_name,value
0,1,aaron,beer,100
1,2,sergo,pizza,200
2,2,sergo,vodka,120


In [239]:
sql = '''
select t.*, i.item_name, i.value
from users t 
left join items i
on t.id = i.user_id

'''
select(sql)

Unnamed: 0,id,name,item_name,value
0,1,aaron,beer,100.0
1,2,sergo,pizza,200.0
2,2,sergo,vodka,120.0
3,3,ivan,,


In [240]:
users = pd.DataFrame({"id":[1,2,3,],
                    "name":['aaron', 'sergo', 'ivan'],
                     'victory':[2,10,1]})

In [242]:
users.to_sql('users', con,index=False, if_exists='replace')

3

In [244]:
sql = '''
select t.*, i.item_name, i.value, i.user_id
from users t 
left join items i
on t.id = i.user_id
'''
select(sql)

Unnamed: 0,id,name,victory,item_name,value,user_id
0,1,aaron,2,beer,100.0,1.0
1,2,sergo,10,pizza,200.0,2.0
2,2,sergo,10,vodka,120.0,2.0
3,3,ivan,1,,,


In [245]:
t = select(sql)

In [246]:
t['victory'].sum()

23

In [247]:
sql = '''
select sum(victory)
from users t 

'''
select(sql)

Unnamed: 0,sum(victory)
0,13


* необходимо проверять на дубликаты
* агрегировать перед джойном

In [251]:
sql = '''
select 
t.user_id, count(t.item_name) as item_count,
sum(t.value) as value_sum
from items t 
group by t.user_id

'''
select(sql)

Unnamed: 0,user_id,item_count,value_sum
0,1,1,100
1,2,2,320


In [258]:
# Aggregate items per user first (one row per user_id), then left join the
# result to users so that user rows are not multiplied by the join.
# coalesce(..., 0) replaces the NULLs produced for users without items.
# The alias was misspelled "item_counn"; it is now item_count.
sql = '''
with items_agg as (
    select
        t.user_id,
        count(t.item_name) as item_count,
        sum(t.value) as value_sum
    from items t
    group by t.user_id
    )
select
    t.id, t.name, t.victory,
    coalesce(i.item_count, 0) as item_count,
    coalesce(i.value_sum, 0) as value_sum
from users t
left join items_agg i on
    i.user_id = t.id

'''
select(sql)

Unnamed: 0,id,name,victory,item_counn,value_sum
0,1,aaron,2,1,100
1,2,sergo,10,2,320
2,3,ivan,1,0,0
