In [2]:
#!pip install ipykernel
import duckdb
import ibis
import polars as pl
from openai import OpenAI
import json
import pandas as pd

In [16]:
# Open the DuckDB database file through the Ibis DuckDB backend
con = ibis.duckdb.connect("samhsa_data.db")
# Last expression of the cell: display the names of the tables in the database
con.list_tables()


['DemographicValueLabelMapping',
 'Demographics',
 'EmploymentDetails',
 'LegalInfo',
 'SubstanceUseHistory',
 'TreatmentInformation',
 'teds_a_raw_2015_2019']

#### Data Ingestion : Creating Polars DataFrame 

We could use two approaches here. Approach 1: use a simple Ibis connection to execute the select query and convert the resulting pandas DataFrame into a Polars DataFrame.
Approach 2: use a DuckDB connection and Arrow to load the data directly into Polars, without a pandas-to-Polars conversion. Approach 2 appears to be marginally faster for this amount of data, so I will use it for the remaining tables.


In [4]:
# Approach 1 (Ibis): build the table expression, then execute it.
# Chain .limit(100) onto the expression before .execute() to pull only a sample.
demographics_expr = con.table('Demographics')
demographics_pd = demographics_expr.execute()
# Hand the executed (pandas) result over to Polars
demo_pl = pl.from_pandas(demographics_pd)


In [5]:
# Preview the first two rows of the Ibis-loaded frame (demo_pl.columns lists the column names)
demo_pl.head(n=2)

CASEID,ADMYR,AGE,GENDER,RACE,ETHNIC,MARSTAT,EDUC,PREG,VET,STFIPS,CBSA2010,REGION,DIVISION
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
20151524993,2015,6,2,5,3,4,2,1,2,2,-9,4,9
20151449532,2015,7,2,1,4,2,3,2,2,2,-9,4,9


In [6]:
# Approach 2 (DuckDB): query the table and pass the Arrow result straight to Polars.
# The `with` block closes the connection when the block exits, including when the
# query or the conversion raises, so a failed run does not leave a connection open.
with duckdb.connect('samhsa_data.db') as dbcon:
    demo = pl.from_arrow(dbcon.execute("SELECT * FROM Demographics").arrow())

In [7]:
# Preview the first two rows of the DuckDB-loaded frame (demo.columns lists the column names)
demo.head(n=2)

CASEID,ADMYR,AGE,GENDER,RACE,ETHNIC,MARSTAT,EDUC,PREG,VET,STFIPS,CBSA2010,REGION,DIVISION
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
20151524993,2015,6,2,5,3,4,2,1,2,2,-9,4,9
20151449532,2015,7,2,1,4,2,3,2,2,2,-9,4,9


In [8]:
# Ingest all the other tables into Polars DataFrames over a single connection.
# Table names are listed in the same order as the variables they are unpacked into.
remaining_tables = (
    "EmploymentDetails",
    "LegalInfo",
    "SubstanceUseHistory",
    "TreatmentInformation",
)

# The `with` block closes the connection even if one of the queries fails.
# The generator is fully consumed by the tuple unpacking while the connection is
# still open, so every table is loaded before the block exits.
with duckdb.connect('samhsa_data.db') as dbcon:
    emp_det, legal_info, subs_hist, treat_info = (
        pl.from_arrow(dbcon.execute(f"SELECT * FROM {table_name}").arrow())
        for table_name in remaining_tables
    )


In [17]:
# Load the value -> label lookup table into Polars.
# The `with` block closes the connection even if the query or conversion raises.
with duckdb.connect('samhsa_data.db') as dbcon:
    mapp = pl.from_arrow(dbcon.execute("SELECT * FROM DemographicValueLabelMapping").arrow())

# Last expression of the cell: display the final ten mapping rows
mapp.tail(10)


value,label,tablename
i32,str,str
3,"""Cuban or other specific Hispan…","""ETHNIC"""
4,"""Not of Hispanic or Latino orig…","""ETHNIC"""
5,"""Hispanic or Latino, specific o…","""ETHNIC"""
-9,"""Missing/unknown/not collected/…","""ETHNIC"""
1,"""Never married""","""MARSTAT"""
2,"""Now married""","""MARSTAT"""
3,"""Separated""","""MARSTAT"""
4,"""Divorced""","""MARSTAT"""
5,"""Widowed""","""MARSTAT"""
-9,"""Missing/unknown/not collected/…","""MARSTAT"""


In [9]:


# Insert the value-label pairs for the variables on pages 17 to 20
# (page numbers as noted by the original author; presumably of the dataset
# codebook -- TODO confirm).
#
# The statement is idempotent: a candidate row is inserted only when no row with
# the same (value, tablename) pair already exists. Re-running this cell, or doing
# Restart & Run All, therefore does not duplicate rows in
# DemographicValueLabelMapping.
insert_values = '''
INSERT INTO DemographicValueLabelMapping (value, label, tablename)
SELECT new_rows.new_value, new_rows.new_label, new_rows.new_tablename
FROM (VALUES
    -- EMPLOY
    (1, 'Full-time', 'EMPLOY'),
    (2, 'Part-time', 'EMPLOY'),
    (3, 'Unemployed', 'EMPLOY'),
    (4, 'Not in labor force', 'EMPLOY'),
    (-9, 'Missing/unknown/not collected/invalid', 'EMPLOY'),

    -- DETNLF
    (1, 'Homemaker', 'DETNLF'),
    (2, 'Student', 'DETNLF'),
    (3, 'Retired, disabled', 'DETNLF'),
    (4, 'Resident of institution', 'DETNLF'),
    (5, 'Other', 'DETNLF'),
    (-9, 'Missing/unknown/not collected/invalid', 'DETNLF'),

    -- PREG
    (1, 'Yes', 'PREG'),
    (2, 'No', 'PREG'),
    (-9, 'Missing/unknown/not collected/invalid', 'PREG'),

    -- VET
    (1, 'Yes', 'VET'),
    (2, 'No', 'VET'),
    (-9, 'Missing/unknown/not collected/invalid', 'VET')
) AS new_rows(new_value, new_label, new_tablename)
WHERE NOT EXISTS (
    SELECT 1
    FROM DemographicValueLabelMapping AS existing
    WHERE existing.value = new_rows.new_value
      AND existing.tablename = new_rows.new_tablename
);
'''

# Connect to the DuckDB database and execute the insert statement.
# The `with` block closes the connection even if the insert fails.
with duckdb.connect('samhsa_data.db') as dbcon:
    dbcon.execute(insert_values)


In [12]:
# Credentials are kept in a local JSON file rather than in the notebook.
# Remember to list that file in .gitignore so it is never committed.
credentials_path = 'credentials.json'

# Parse the JSON file into a dict
with open(credentials_path) as credentials_file:
    credentials = json.load(credentials_file)

# Pull out the OpenAI key and build the API client with it
openai_api_key = credentials['openai_api_key']
client = OpenAI(api_key=openai_api_key)



In [None]:
# Smoke-test the API with a simple chat completion.
# This reuses the `client` built from credentials.json earlier in the notebook.
# Re-creating it here with a bare OpenAI() would discard that key and depend on
# an OPENAI_API_KEY environment variable instead. `OpenAI` is already imported
# in the imports cell at the top, so it is not imported again here.
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
        {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."},
    ],
)

# Show the message of the first returned choice
print(completion.choices[0].message)