# Trying OpenAI to derive themes

I created an OpenAI account.

I followed the installation instructions from the OpenAI quickstart guide, https://platform.openai.com/docs/quickstart (in a different directory).

Create a conda environment called `openai`:
```
conda create --name openai python=3.10 jupyter pandas
conda activate openai
pip install -r requirements.txt
```

I used the online "playground" to create a mock code example, then copied it into this notebook and modified it.

In [None]:
import pandas as pd

In [None]:
import openai
openai.api_key_path = '.env'

In [None]:
def get_themes(n, text):
    """Ask the OpenAI completion endpoint to suggest ``n`` themes for ``text``.

    Parameters
    ----------
    n : int
        Number of themes to request. The completion budget is ``n * 16``
        tokens.
    text : str
        The text to derive themes from; it is embedded verbatim in the prompt.

    Returns
    -------
    str
        The raw text of the first completion choice.
    """
    request = {
        "model": "text-davinci-003",
        "prompt": f'Suggest {n} themes for the following text.\n\n{text}\n\nthemes:',
        "temperature": 0,
        "max_tokens": n * 16,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    completion = openai.Completion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.text


def summarize_text(text, n=100):
    """Ask the OpenAI completion endpoint to summarize ``text`` in ``n`` words.

    Parameters
    ----------
    text : str
        The text to summarize; it is embedded verbatim in the prompt.
    n : int, optional
        Target length of the summary in words (default 100).

    Returns
    -------
    str
        The raw text of the first completion choice.
    """
    prompt = f'Summarize the following text in {n} words:\n{text}'

    # Select the model with ``model=`` (as get_themes does) instead of the
    # older ``engine=`` keyword, so both helpers call the API the same way.
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0,
        # NOTE(review): the completion budget is fixed at 150 tokens no matter
        # what ``n`` is, so a large ``n`` may yield a cut-off summary.
        max_tokens=150,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=1,
    )

    return response.choices[0].text


## Read in the data and combine all the answers into one long string

In [None]:
# full data file with multiple sheets
filename = '../data/ITP_CourseArtifacts_June 2021_END_of_Course_DeIDENTIFIED.xlsx'

# sheet name for this analysis, containing responses to one question
#sheet = 'Course Meta SelfEff'
sheet = 'Course Meta App'

df = pd.read_excel(filename, sheet_name=sheet)

# The responses are taken from the second column of the sheet.
# Blank cells are read in as NaN (a float), which would make the later
# ' '.join(...) over the answers raise a TypeError. Drop those rows from df
# (so len(df) still matches len(answers)) and make sure every answer is a str.
answer_col = df.columns[1]
df = df.dropna(subset=[answer_col])
answers = df[answer_col].astype(str).values

## Send through openAI

### This is too much text to send through openAI in one go

The API accepts only ~4000 "tokens" per request, so I will split the answers into chunks, get themes for each chunk, and then do one final theme pass on the list of themes from the chunks.

In [None]:
def _estimate_tokens(chunk):
    """Estimate the token count of ``chunk`` as (number of words) * 4/3."""
    # apparently this is a rule of thumb for counting tokens (seems to overestimate)
    return len(chunk.split()) * 4. / 3.


def get_chunk(text, i0 = 0, i1_start = 10, n = 4000):
    """Join ``text[i0:i1]`` into one string that fits within ``n`` tokens.

    Starts with ``i1 = i1_start`` and moves ``i1`` back one item at a time
    until the estimated token count of the joined chunk is at most ``n``.

    Parameters
    ----------
    text : sequence of str
        The individual answers to combine.
    i0 : int
        Index of the first answer in the chunk.
    i1_start : int
        Initial (exclusive) end index to try.
    n : int
        Maximum estimated number of tokens allowed in the chunk.

    Returns
    -------
    tuple
        ``(i1, chunk, chunk_len)`` where ``i1`` is the exclusive end index
        that was used, ``chunk`` is the joined string and ``chunk_len`` its
        estimated token count. If the range shrinks to nothing (``i1 == i0``),
        ``(-1, 'ERROR: answer too long', chunk_len)`` is returned instead.
    """
    i1 = i1_start
    # Slice the ``text`` argument (not a module-level variable) so the
    # function works on whatever sequence the caller passes in.
    chunk = ' '.join(text[i0:i1])
    chunk_len = _estimate_tokens(chunk)
    while chunk_len > n and i1 > i0:
        i1 -= 1
        chunk = ' '.join(text[i0:i1])
        chunk_len = _estimate_tokens(chunk)
    if i1 == i0:
        return (-1, 'ERROR: answer too long', chunk_len)
    return (i1, chunk, chunk_len)

In [None]:
# Walk through the answers in chunks of at most `chunk_size` answers (fewer if
# the chunk would exceed the token limit) and collect the themes for each chunk.
chunk_size = 50
n_themes = 5
all_themes = []

n_answers = len(answers)
i0 = 0
# Loop on the start index so the final, partial chunk is processed too
# (looping on the end index would stop before the last answers are sent).
while i0 < n_answers:
    # never ask for more answers than are left
    i1 = min(i0 + chunk_size, n_answers)

    # get the chunk
    i1, chunk, chunk_len = get_chunk(answers, i0, i1, n = 4000)
    if i1 == -1:
        # a single answer exceeds the token limit; get_chunk puts the error
        # message in the chunk slot
        print(f'{chunk} (answer index {i0}, ~{chunk_len:.0f} tokens); stopping')
        break

    # the next chunk starts where this one ended
    i0 = i1

    # run this chunk through openAI
    themes = get_themes(n_themes, chunk)
    all_themes.append(themes)
    print(themes)

In [None]:
# Final pass: derive 5 overall themes from the per-chunk theme lists.
summary_themes = get_themes(n=5, text=' '.join(all_themes))
print(summary_themes)