##### To Do
-read the comments in the 5th block of code concerning the organization of cards by text length

-if a card is found not to be working properly, manually feed the model a working example, as this will
    make the model capable of translating the failing card and other similar cards
    
-decide whether common abilities (like SCRY, which is similar to convoke in text structure)
    should be included in the card code or in the game engine

-check whether adding other card information (like toughness, power, etc.) beyond name and text was negatively affecting card output
    
-try to join the GPT-4 card converter to the game engine for automated card checking

In [25]:
import os

import numpy as np
import openai
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# set API key
# SECURITY: the key is read from the OPENAI_API_KEY environment variable so that
# no secret is stored in the notebook. A literal key used to be written on this
# line; treat that key as compromised and revoke it.
API_KEY = os.environ.get('OPENAI_API_KEY')
if not API_KEY:
    raise RuntimeError(
        'OPENAI_API_KEY is not set; export it in the environment before running this notebook'
    )
openai.api_key = API_KEY

# set model
model_id = 'gpt-4'

# keep DataFrame previews short
pd.set_option('display.max_rows', 10)

RANDOM_SEED = 42


In [26]:
def gpt4_conversation(conversation_log):
    """Send a chat history to the model and record the model's reply in it.

    Parameters
    ----------
    conversation_log : list of dict
        Chat messages, each with 'role' and 'content' keys. The list is passed
        as-is to ``openai.ChatCompletion.create`` and is modified in place.

    Returns
    -------
    list of dict
        The same list object, with one message appended: the role and the
        whitespace-stripped content of the first choice in the response.
    """
    # model_id is the module-level model name set in the configuration cell
    reply = openai.ChatCompletion.create(
        model=model_id,
        messages=conversation_log,
    ).choices[0].message

    conversation_log.append({'role': reply.role, 'content': reply.content.strip()})
    return conversation_log



In [27]:
''' prepare dataframes '''

# Read every card of the chosen type (sorcery here; point this at another workbook
# for a different card type) and discard the leftover 'Unnamed: 0' column.
df = pd.read_excel('sorcery.xlsx').drop(columns=['Unnamed: 0'])

# Normalise missing entries of `code` to NaN. This does not remove any rows;
# the rows without code are filtered out when `gold` is built below.
df['code'] = df['code'].fillna(np.nan)

# Gold set: only the cards that have gold-label code, renumbered from 0.
gold = df.dropna(subset=['code']).reset_index(drop=True)

# Sanity check: the preview should show name, text, and code.
gold.head()

Unnamed: 0,name,text,code
0,Essence Drain,Essence Drain deals 3 damage to any target and...,##############################\nEssence Drain\...
1,Volcanic Hammer,Volcanic Hammer deals 3 damage to any target.,##############################\nVolcanic Hamme...
2,Monstrous Growth,Target creature gets +4/+4 until end of turn.,##############################\nMonstrous Grow...
3,Cinder Storm,Cinder Storm deals 7 damage to any target.,##############################\nCinder Storm\n...
4,Sweltering Suns,Sweltering Suns deals 3 damage to each creatur...,##############################\nSwelter\n\n\tT...


In [28]:
''' vectorizing gold cards with tfidf '''

# Fit the TF-IDF vocabulary on the gold card texts. `train_vectors` holds one
# row per gold card, in the same order as `gold`.
tfidf = TfidfVectorizer()
corpus = list(gold['text'])
train_vectors = tfidf.fit_transform(corpus)

# Store each card's own 1-row slice of the matrix next to the card.
# The slices are placed one by one into an object array so that pandas keeps
# every slice as a single cell value instead of trying to unpack it, and
# `assign` overwrites the column rather than appending a second
# 'tfidf vectors' column when this cell is run again.
row_vectors = np.empty(train_vectors.shape[0], dtype=object)
for row_number in range(train_vectors.shape[0]):
    row_vectors[row_number] = train_vectors[row_number:row_number + 1]
gold = gold.assign(**{'tfidf vectors': pd.Series(row_vectors, index=gold.index)})

# verify vectorizing was a success
gold[:5]


Unnamed: 0,name,text,code,tfidf vectors
0,Essence Drain,Essence Drain deals 3 damage to any target and...,##############################\nEssence Drain\...,"(0, 58)\t0.268952488206323\n (0, 50)\t0.279..."
1,Volcanic Hammer,Volcanic Hammer deals 3 damage to any target.,##############################\nVolcanic Hamme...,"(0, 54)\t0.6100526992251465\n (0, 96)\t0.61..."
2,Monstrous Growth,Target creature gets +4/+4 until end of turn.,##############################\nMonstrous Grow...,"(0, 92)\t0.4296458698080389\n (0, 60)\t0.39..."
3,Cinder Storm,Cinder Storm deals 7 damage to any target.,##############################\nCinder Storm\n...,"(0, 80)\t0.6100526992251465\n (0, 16)\t0.61..."
4,Sweltering Suns,Sweltering Suns deals 3 damage to each creatur...,##############################\nSwelter\n\n\tT...,"(0, 34)\t0.2619259085995742\n (0, 12)\t0.52..."


In [29]:
''' get vectors for test cards '''

# only interested in sorcery cards for this example, but replace with the card type of interest
test = pd.read_csv('sorcery_no_code.csv')

# Project the test card texts onto the vocabulary that was fitted on the gold cards.
corpus = list(test['text'])
test_vectors = tfidf.transform(corpus)

# Store each card's own 1-row slice of the matrix next to the card. The slices
# are placed one by one into an object array so that pandas keeps every slice
# as a single cell value instead of trying to unpack it.
row_vectors = np.empty(test_vectors.shape[0], dtype=object)
for row_number in range(test_vectors.shape[0]):
    row_vectors[row_number] = test_vectors[row_number:row_number + 1]
test = test.assign(**{'tfidf vectors': pd.Series(row_vectors, index=test.index)})


# re-sorting cards in ascending order of text length

# IMPORTANT NOTE:
# THIS METHOD OF REORGANIZING CARDS IN ASCENDING TEXT-LENGTH ORDER IS NOT FULLY WORKING RIGHT NOW.
# IN MOST CASES CARDS WERE ORGANIZED PROPERLY, BUT SOME WITH SHORTER LENGTHS WERE NOT PLACED PROPERLY IN THE EXCEL FILE.
# FINDING A BETTER WAY TO ORGANIZE THE CARDS BY TEXT LENGTH MAY HELP WITH FASTER CARD OUTPUT ANALYSIS.
#
# NOTE(review): sort_values uses its default (non-stable) algorithm here, so cards
# with equal text length may come out in any relative order. Passing kind='stable'
# would make ties deterministic, but could change which rows the hard-coded drops
# below remove -- confirm before switching.
test2 = test.text.str.len().sort_values(ascending=True).index
test = test.reindex(test2).drop(columns=['Unnamed: 0']).reset_index(drop=True)

# Skip the first two rows of the sorted frame (the two shortest texts), then
# renumber. The reason for skipping them is not recorded -- TODO confirm.
test = test[2:].reset_index(drop=True)

# Remove the row that now carries label 9, then renumber again.
# The reason for removing it is not recorded -- TODO confirm.
test = test.drop(labels=[9], axis=0).reset_index(drop=True)

# Work on one batch of cards at a time: rows 875 up to (not including) 900.
test = test[875:900]

In [30]:
# number of cards to use in prompt as examples
NUM_EXAMPLES = 10
prompts = []

# Build one few-shot prompt per test card: pick up to NUM_EXAMPLES gold cards that
# are most similar to it (by cosine similarity of the TF-IDF vectors), print them
# with their code, and finish with the test card and an open "generate" line.
for _, card in test.iterrows():

    # similarity of this test card to every gold card, one score per gold row
    scores = cosine_similarity(card['tfidf vectors'], train_vectors)[0]

    # gold cards with their score attached, best match first
    ranked_gold = pd.concat(
        [gold, pd.DataFrame({'cosine sim scores': scores})],
        axis=1,
    ).sort_values(by=['cosine sim scores'], ascending=False)

    # Start from the best match, then walk down the ranking.
    chosen = [ranked_gold.iloc[0]]
    for _, candidate in ranked_gold.iterrows():
        if len(chosen) == NUM_EXAMPLES:
            break

        # Near-duplicate filter: a candidate whose vector has cosine similarity
        # above 0.9 with any example already chosen is skipped, so the examples
        # in one prompt are not almost identical to each other. (The best match
        # itself is skipped here on its first visit, because it is compared with
        # the copy used as the starting example.)
        too_close = any(
            cosine_similarity(kept['tfidf vectors'], candidate['tfidf vectors'])[0][0] > 0.9
            for kept in chosen
        )
        if not too_close:
            chosen.append(candidate)

    # Only name and effect text are put in the prompt. power, toughness, cost and
    # type are deliberately left out because they may be affecting the model result.
    pieces = []
    for example in chosen:
        pieces.append(f"# name: {example['name']}\n")
        pieces.append(f"# effect: {example['text']}\n")
        pieces.append(f"# Generate the code for {example['name']}:\n{example['code']}\n\n")

    # The test card goes last, in the same layout but with no code after the
    # "Generate" line.
    pieces.append(f"# name: {card['name']}\n")
    pieces.append(f"# effect: {card['text']}\n")
    pieces.append(f"# Generate the code for {card['name']}:\n")

    prompts.append(''.join(pieces))

In [31]:
# Send every prompt to the model as its own single-message conversation and
# collect the reply texts in the same order as `prompts`.
outputs = []
for prompt in prompts:
    chat = gpt4_conversation([{'role': 'user', 'content': prompt}])
    # chat[0] is the prompt that was sent; chat[1] is the reply that
    # gpt4_conversation appended to the log
    outputs.append(chat[1]['content'])

In [32]:
# output GPT code in file for analysis
print(len(outputs))

# The file is opened in append mode, so running this cell again adds the same
# outputs a second time; delete the file first if a clean copy is wanted.
# `with` closes the file even if a write fails, and the explicit UTF-8 encoding
# keeps non-ASCII characters in the generated text from failing on platforms
# whose default encoding cannot represent them.
with open("tfidf_output_S875-900.txt", "a", encoding="utf-8") as file1:
    for item in outputs:
        file1.write(item)
        file1.write("\n")


25
