## comdb2

The Data Science Platform supports interacting with various comdb2 tiers. When you launch a job and include a [comdb2 Identity](https://tutti.prod.bloomberg.com/data-science-platform/external_services/comdb2#comdb2-identities), the Platform mounts configuration files directly into your job's environment so that comdb2 and related client libraries can transparently access the appropriate databases.

## Using comdb2 on DSP

You can use the python-comdb2 client to query comdb2 databases in various tiers. Interactive access to comdb2 is only possible in dev. Non-interactive access can be enabled for any tier when the job runs in an appropriate production cluster in DSP with a production or bridged network policy. [For more details, check the comdb2 row in our service accessibility documentation in Tutti](https://tutti.prod.bloomberg.com/data-science-platform/external_services/index#available-services).

In [1]:
from comdb2 import dbapi2

### Setting a DB name

In [2]:
# Name of the comdb2 database that the connection below is opened against.
db_name = "nanadb"

### Connecting and Setting a SQL Query

In [3]:
import pandas as pd 
from datetime import datetime, date
# Open a connection to `db_name` on the "dev" tier with autocommit enabled;
# the query cells below pass this connection to pandas.
connection = dbapi2.connect(db_name, autocommit=True, tier="dev")

In [24]:
# ! pip install beautifulsoup4

In [None]:
# Stream the joined release_type/release/story rows in chunks and save each
# chunk to its own CSV file (one-fetch-<chunk index>.csv).
num_rows_to_fetch = 5000  # maximum number of rows per chunk

# Plain string: the query contains no interpolated values.
sql_query = """
    SELECT *
    FROM 'release_type' RT
    join release R ON R.release_type = RT.release_type
    join story S ON S.release_id = R.id
    WHERE wire_id >0 and wire_id NOT IN (25,2345,96,3543,584,474,1719,3447,586,587,97,2640)
    AND CAST(R.toa AS DATE) > CAST('2021-01-01' AS DATE)
"""

# With `chunksize` set, read_sql returns an iterator of DataFrames instead of
# loading the whole result set at once.
chunks = pd.read_sql(sql_query, con=connection, chunksize=num_rows_to_fetch)
for idx, df in enumerate(chunks):
    df.to_csv(f'one-fetch-{idx}.csv')
    # NOTE: idx is the chunk index, so this message counts chunks written so
    # far, not individual rows.
    if idx % 5 == 0:
        print(f'fetched {idx} rows...')

In [5]:
# Page through the joined release_type/release rows newest-first, saving each
# page to one-fetch-<page index>.csv, and stop at the first page that holds no
# row newer than `date_cutoff`.

num_rows_to_fetch = 1000  # page size (SQL LIMIT)
offset = 0  # SQL OFFSET of the next page to request
date_cutoff = date(2021, 6, 1)

idx = 0  # page counter; also used in the output file name
while True:
    sql_query = f"""
        SELECT * FROM 'release_type' RT
        join release R on R.release_type = RT.release_type
        WHERE wire_id >0 and wire_id NOT IN (25,2345,96,3543,584,474,1719,3447,586,587,97,2640)
        ORDER BY toa desc
        limit {num_rows_to_fetch}
        offset {offset}
    """

    df = pd.read_sql(sql_query, con=connection)
    # Once the offset passes the last row the page comes back empty. Stop
    # here rather than touching df['toa'].dt, which can raise when the empty
    # column is not datetime-typed.
    if df.empty:
        break
    # Rows are ordered by toa descending, so a page with nothing newer than
    # the cutoff means every later page is older as well.
    num_records = int((df['toa'].dt.date > date_cutoff).sum())
    if num_records == 0:
        break
    df.to_csv(f'one-fetch-{idx}.csv')
    # NOTE: idx is the page index, so this message counts pages, not rows.
    if idx % 5 == 0:
        print(f'fetched {idx} rows...')
    offset += num_rows_to_fetch
    idx += 1

fetched 0 rows...
fetched 5 rows...
fetched 10 rows...
fetched 15 rows...
fetched 20 rows...
fetched 25 rows...
fetched 30 rows...
fetched 35 rows...
fetched 40 rows...
fetched 45 rows...
fetched 50 rows...
fetched 55 rows...
fetched 60 rows...
fetched 65 rows...
fetched 70 rows...
fetched 75 rows...
fetched 80 rows...
fetched 85 rows...
fetched 90 rows...
fetched 95 rows...
fetched 100 rows...
fetched 105 rows...
fetched 110 rows...
fetched 115 rows...
fetched 120 rows...
fetched 125 rows...
fetched 130 rows...
fetched 135 rows...
fetched 140 rows...
fetched 145 rows...
fetched 150 rows...
fetched 155 rows...
fetched 160 rows...
fetched 165 rows...
fetched 170 rows...
fetched 175 rows...
fetched 180 rows...
fetched 185 rows...
fetched 190 rows...
fetched 195 rows...
fetched 200 rows...
fetched 205 rows...
fetched 210 rows...
fetched 215 rows...
fetched 220 rows...
fetched 225 rows...
fetched 230 rows...
fetched 235 rows...
fetched 240 rows...
fetched 245 rows...


In [64]:
from tqdm.auto import tqdm

In [None]:
import glob
from bs4 import BeautifulSoup
import re 

# Clean every fetched CSV (English rows with a non-blank body, HTML stripped
# from the body, duplicate texts dropped) and write the results out in batches
# as processed-nana-<batch number>.csv.

# Sort numerically by the index embedded in the file name so that
# one-fetch-10.csv comes after one-fetch-9.csv. Raw string: `\d` is a regex
# escape, and the dot before "csv" is matched literally.
fetched_files = sorted(
    glob.glob('one-fetch-*.csv'),
    key=lambda x: int(re.search(r'-(\d+)\.csv', x)[1]),
)

processed_dfs = []  # cleaned frames waiting to be written
dump_every = 5  # write a batch whenever the file index is a multiple of this
proc_num = 0  # number of the next batch file
for i, f in tqdm(enumerate(fetched_files), total=len(fetched_files)):
    df = pd.read_csv(f, index_col=0)
    df_proc = (
        df
        .loc[lambda d: d['language'] == 'en']
        .loc[lambda d: d['body'].fillna('').str.strip() != '']
        [['toa', 'web_url', 'subject', 'body']]
        # .assign builds a new frame, avoiding a chained-assignment warning
        # from writing a column onto a filtered selection. The parser is
        # named explicitly so the extracted text does not depend on which
        # optional parsers happen to be installed.
        .assign(
            processed_body=lambda d: d['body'].apply(
                lambda x: BeautifulSoup(x, 'html.parser').get_text().strip()
            )
        )
        .drop_duplicates('processed_body')
        .drop(columns='body')
    )
    processed_dfs.append(df_proc)
    if i % dump_every == 0:
        print('dumping...')
        pd.concat(processed_dfs).to_csv(f'processed-nana-{proc_num}.csv')
        proc_num += 1
        processed_dfs = []

# Write whatever was accumulated after the last in-loop dump; without this the
# trailing files are silently lost whenever the final index is not a multiple
# of dump_every.
if processed_dfs:
    print('dumping...')
    pd.concat(processed_dfs).to_csv(f'processed-nana-{proc_num}.csv')
    proc_num += 1
    processed_dfs = []

In [6]:
import glob
import pandas  as pd
import re 
from tqdm.auto import tqdm 


# Merge the processed-nana-*.csv batches into larger gzip-compressed files
# named big-processed-nana-file-<n>.csv.gz.

# Sort numerically by the batch number embedded in the file name. Raw string:
# `\d` is a regex escape, and the dot before "csv" is matched literally.
processed_files = sorted(
    glob.glob('processed-nana-*.csv'),
    key=lambda x: int(re.search(r'-(\d+)\.csv', x)[1]),
)

all_processed_dfs = []  # frames collected since the last combined file
cutoff = 25  # write a combined file whenever the index is a multiple of this
num_combined = 0  # number of the next combined file
for idx, p_filename in tqdm(enumerate(processed_files)):
    all_processed_dfs.append(pd.read_csv(p_filename, index_col=0))
    # The length check skips idx == 0, where only one frame has been read.
    if idx % cutoff == 0 and len(all_processed_dfs) > 1:
        pd.concat(all_processed_dfs).to_csv(
            f'big-processed-nana-file-{num_combined}.csv.gz', compression='gzip'
        )
        num_combined += 1
        all_processed_dfs = []

# Write the remaining frames. pd.concat raises ValueError on an empty list,
# which happens when no files matched or the last file triggered a write.
if all_processed_dfs:
    pd.concat(all_processed_dfs).to_csv(
        f'big-processed-nana-file-{num_combined}.csv.gz', compression='gzip'
    )

50it [02:16,  2.72s/it]


In [7]:
# Number of frames in the last batch collected by the combining loop above.
len(all_processed_dfs)

24