In [1]:
# For nicer chat output 
from IPython.display import display, Markdown

In [2]:
# To get OpenAI API key

from src.paths import PARENT_DIR
from dotenv import load_dotenv
import os 
import openai 

# Load the variables defined in PARENT_DIR/.env into the process environment;
# the next code cell reads OPENAI_API_KEY from os.environ.
load_dotenv(PARENT_DIR / '.env')

True

In [3]:
# Hand the API key from the environment to the openai module.
# os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

#### Test connection

3 Roles in chat completion 
https://platform.openai.com/docs/guides/text-generation/chat-completions-api

1) System - optional; tells the assistant how to behave
2) User - you, i.e. the end user
3) Assistant - the model you've selected, which writes the replies

In [5]:
# Test chat message: send a single user-role message to confirm that the
# API connection works.
#
# openai.ChatCompletion was removed in openai>=1.0, which is the version line
# that langchain_openai (imported later in this notebook) depends on. Pick the
# call style that matches the installed package so the cell runs either way.
pilot_request = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "user",
         "content": "Hello, just testing if this api connection is working ok! Can you confirm you're receiving this message?"}
    ],
}

if hasattr(openai, "OpenAI"):
    # openai>=1.0: the module-level client uses the openai.api_key set above.
    # model_dump() turns the response into a plain dict so the
    # pilot_message["choices"][0]["message"]["content"] lookup still works.
    pilot_message = openai.chat.completions.create(**pilot_request).model_dump()
else:
    # openai<1.0: legacy interface, returns a dict-like OpenAIObject.
    pilot_message = openai.ChatCompletion.create(**pilot_request)

In [6]:
# See full output: the whole response (choices, token usage, model id), not just the reply text
pilot_message

<OpenAIObject chat.completion id=chatcmpl-98E9XHSpPik5nKtFlm8lJhTYwVqVt at 0x1287c268> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Hello! Yes, I can confirm that I have received your message. Is there anything else you would like to test or ask about?",
        "role": "assistant"
      }
    }
  ],
  "created": 1711748055,
  "id": "chatcmpl-98E9XHSpPik5nKtFlm8lJhTYwVqVt",
  "model": "gpt-3.5-turbo-0125",
  "object": "chat.completion",
  "system_fingerprint": "fp_3bc1b5746c",
  "usage": {
    "completion_tokens": 27,
    "prompt_tokens": 28,
    "total_tokens": 55
  }
}

In [12]:
# See the reply only: the text content of the first entry in "choices"

pilot_message["choices"][0]["message"]["content"]

'Hello! Yes, I can confirm that I have received your message. Is there anything else you would like to test or ask about?'

#### Using langchain - legacy `__call__`

[Legacy_calls](https://python.langchain.com/docs/modules/model_io/chat/quick_start#legacy-__call__)

With langchain, the code for extracting the assistant's messages becomes tidier.

In [3]:
# new langchain version 

from langchain_openai import ChatOpenAI

In [4]:
# Instantiate the langchain chat wrapper for gpt-3.5-turbo and bind it to `chat`.
chat = ChatOpenAI(model="gpt-3.5-turbo")


In [5]:
import langchain as lc 

from langchain.chat_models import ChatOpenAI

from langchain.schema import AIMessage, HumanMessage, SystemMessage

# HumanMessage corresponds to OpenAI's 'user' role, SystemMessage to the 'system' role, and AIMessage to the 'assistant' role

Below we define : 

- system_message : Tells the assistant what context it's being used in; this will shape its replies

- data_brief : Explaining the data being analysed

- task_request : Asks a specific question

In [6]:
# System prompt: the persona/context the assistant should adopt for the whole conversation.
system_message = "You are an analyst working for the government, tasked with analysing smoking behaviours, with the aim of creating government policy to decrease smoking."

# Plain-text description of the survey data set (its 12 columns, their types and levels).
# Sent verbatim to the model as part of the user prompt, so any wording change alters the prompt.
data_brief = """
We have some survey data on smoking habits from the UK. The data set can be used for analyzing the demographic characteristics of smokers and types of tobacco consumed.

A data frame with 1691 observations on the following 12 variables: 

1) 'gender' (string) : Gender with levels Female and Male.
2) 'age' (numeric) : Age
3) 'marital_status' (string) : Marital status with levels Divorced, Married, Separated, Single and Widowed.
4) 'highest_qualification' (string) : Highest education level with levels A Levels, Degree, GCSE/CSE, GCSE/O Level, Higher/Sub Degree, No Qualification, ONC/BTEC and Other/Sub Degree
5) 'nationality' (string) : Nationality with levels British, English, Irish, Scottish, Welsh, Other, Refused and Unknown.
6) 'ethnicity' (string) : Ethnicity with levels Asian, Black, Chinese, Mixed, White and Refused Unknown.
7) 'gross_income' (string) : Gross income with levels Under 2,600, 2,600 to 5,200, 5,200 to 10,400, 10,400 to 15,600, 15,600 to 20,800, 20,800 to 28,600, 28,600 to 36,400, Above 36,400, Refused and Unknown.
8) 'region' (string) : Region with levels London, Midlands & East Anglia, Scotland, South East, South West, The North and Wales
9) 'smoke' (boolean) : Smoking status with levels No and Yes
10) 'amt_weekends' (integer) : Number of cigarettes smoked per day on weekends.
11) 'amt_weekdays' (integer) : Number of cigarettes smoked per day on weekdays.
12) 'type' (string) : Type of cigarettes smoked with levels Packets, Hand-Rolled, Both/Mainly Packets and Both/Mainly Hand-Rolled 
"""

# The question put to the model; it is appended after data_brief when the user message is built.
task_request = "Suggest some data analysis questions we could get answers from this data."


# Opening conversation: the system message sets the assistant's persona, and the
# human message carries the data brief followed by a blank line and the task request.
initial_message = [
    SystemMessage(content=system_message),
    HumanMessage(content="\n\n".join((data_brief, task_request))),
]

In [8]:
# Build the chat model wrapper (gpt-3.5-turbo) and bind it to `chat`.
chat = ChatOpenAI(model="gpt-3.5-turbo")

# Legacy-style call: pass the message list straight to the model object.
# The reply is stored in `intial_response` (spelling kept as-is, because the
# display cell reads this exact name).
intial_response = chat(initial_message)

In [11]:
# Render the reply text as Markdown instead of showing the raw string
display(Markdown(intial_response.content))

Here are some data analysis questions you could explore using the provided dataset on smoking habits in the UK:

1. What is the overall prevalence of smoking among the surveyed population?
2. How does smoking prevalence vary by gender?
3. Are there significant differences in smoking prevalence across different age groups?
4. Does marital status have an impact on smoking behaviour?
5. Is there a relationship between the highest level of education attained and smoking status?
6. How does nationality influence smoking habits?
7. Are there differences in smoking prevalence among various ethnic groups?
8. Does gross income level correlate with smoking behaviour?
9. Which region has the highest and lowest rates of smoking?
10. What is the average number of cigarettes smoked per day on weekends and weekdays?
11. What are the most commonly smoked types of cigarettes among smokers?
12. Is there a correlation between the type of cigarettes smoked and the amount smoked per day?

By analyzing these questions, you can gain insights into the demographic characteristics of smokers, patterns of tobacco consumption, and potential factors influencing smoking behaviours. These insights can inform the development of targeted government policies aimed at reducing smoking rates in the UK.