In [9]:
import pandas as pd
import numpy as np

In [10]:
def load_dataset(path):
    """Load the card JSON file into a DataFrame with string-typed text fields.

    The JSON is read with ``pd.read_json`` and transposed, so every top-level
    key of the file becomes one row (presumably one card per key -- confirm
    against the data file).

    The columns ``text``, ``manaValue``, ``toughness``, ``power`` and ``type``
    are converted to strings. A missing value in one of those columns is
    replaced by the column's own name as a placeholder (e.g. a missing
    ``power`` becomes the string ``'power'``).

    Parameters
    ----------
    path : str, path object or file-like object
        Anything accepted by ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame
        The transposed frame; a column named ``index``, if present, is
        renamed to ``card_name``.

    Raises
    ------
    KeyError
        If one of the five expected columns is absent from the file.
    """
    df = pd.read_json(path).T

    for column in ('text', 'manaValue', 'toughness', 'power', 'type'):
        # Record which cells are missing BEFORE stringifying: on pandas
        # versions where astype(str) turns NaN into the literal 'nan', a
        # fillna() placed after the cast never finds anything to fill.
        missing = df[column].isna()
        df[column] = df[column].astype(str).mask(missing, column)

    return df.rename(columns={'index': 'card_name'})

In [11]:
# Load the raw card data from a path relative to the notebook's working
# directory. The bare `df` on the last line makes the notebook render the
# resulting frame.
filepath = 'data_download/MyDataMTGv2.json'
df = load_dataset(filepath)
df

Unnamed: 0,colorIdentity,colors,convertedManaCost,edhrecRank,isStarter,manaCost,manaValue,name,power,rarity,...,attractionLights,isFunny,asciiName,flavorName,defense,faceFlavorName,hasAlternativeDeckLimit,cardParts,isOversized,isTextless
Tukatongue Thallid,[G],[G],1.0,9980,True,{G},1.0,0263d148-6b69-552f-aa8f-0e0a828a52c7,1,common,...,,,,,,,,,,
Moriok Replica,[B],[],3.0,13335,,{3},3.0,a9a8ce3a-2d95-566c-b3d3-d7ada7cf2ad6,2,common,...,,,,,,,,,,
Faerie Mechanist,[U],[U],4.0,14051,True,{3}{U},4.0,209175f2-8a24-5b0f-92fc-dc572dd7cbca,2,common,...,,,,,,,,,,
Ilysian Caryatid,[G],[G],2.0,1393,,{1}{G},2.0,3b4223be-41a3-5adb-b9ee-5e871101eb30,1,common,...,,,,,,,,,,
Centaur Omenreader,[G],[G],4.0,11696,,{3}{G},4.0,f0cade63-ec0f-5182-b089-24d93faef530,3,uncommon,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hellfire Mongrel,[R],[R],3.0,18730,,{2}{R},3.0,96fc59f6-41ce-5b8f-b94e-d1bb8f7d79ba,2,uncommon,...,,,,,,,,,,
Corpse Explosion,"[B, R]","[B, R]",3.0,8501,,{1}{B}{R},3.0,08fe876f-cc69-5e66-8bfe-695074ba2fcc,,rare,...,,,,,,,,,,
Mogg Squad,[R],[R],2.0,25802,,{1}{R},2.0,12e22e5e-d947-5039-b519-55138cf99ba0,3,uncommon,...,,,,,,,,,,
Roots of Wisdom,[G],[G],2.0,9293,,{1}{G},2.0,251de176-9a6a-5e83-a862-b24a3ca8a4bb,,common,...,,,,,,,,,,


In [12]:
# Single-letter color codes -> color names. Also used further down when
# spelling out mana costs and rules text.
color_map = {
    "G": "green",
    "B": "black",
    "R": "red",
    "U": "blue",
    "W": "white",
    "C": "colorless"
}

# Translate every code in each colorIdentity list to its color name.
# .get(color, color) leaves anything that is not a key of color_map untouched,
# so an unexpected code no longer raises KeyError and the cell can be re-run
# safely (on a second run the values are already names such as 'green').
df['colorIdentity'] = df['colorIdentity'].apply(
    lambda colors: [color_map.get(color, color) for color in colors]
)
df

Unnamed: 0,colorIdentity,colors,convertedManaCost,edhrecRank,isStarter,manaCost,manaValue,name,power,rarity,...,attractionLights,isFunny,asciiName,flavorName,defense,faceFlavorName,hasAlternativeDeckLimit,cardParts,isOversized,isTextless
Tukatongue Thallid,[green],[G],1.0,9980,True,{G},1.0,0263d148-6b69-552f-aa8f-0e0a828a52c7,1,common,...,,,,,,,,,,
Moriok Replica,[black],[],3.0,13335,,{3},3.0,a9a8ce3a-2d95-566c-b3d3-d7ada7cf2ad6,2,common,...,,,,,,,,,,
Faerie Mechanist,[blue],[U],4.0,14051,True,{3}{U},4.0,209175f2-8a24-5b0f-92fc-dc572dd7cbca,2,common,...,,,,,,,,,,
Ilysian Caryatid,[green],[G],2.0,1393,,{1}{G},2.0,3b4223be-41a3-5adb-b9ee-5e871101eb30,1,common,...,,,,,,,,,,
Centaur Omenreader,[green],[G],4.0,11696,,{3}{G},4.0,f0cade63-ec0f-5182-b089-24d93faef530,3,uncommon,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hellfire Mongrel,[red],[R],3.0,18730,,{2}{R},3.0,96fc59f6-41ce-5b8f-b94e-d1bb8f7d79ba,2,uncommon,...,,,,,,,,,,
Corpse Explosion,"[black, red]","[B, R]",3.0,8501,,{1}{B}{R},3.0,08fe876f-cc69-5e66-8bfe-695074ba2fcc,,rare,...,,,,,,,,,,
Mogg Squad,[red],[R],2.0,25802,,{1}{R},2.0,12e22e5e-d947-5039-b519-55138cf99ba0,3,uncommon,...,,,,,,,,,,
Roots of Wisdom,[green],[G],2.0,9293,,{1}{G},2.0,251de176-9a6a-5e83-a862-b24a3ca8a4bb,,common,...,,,,,,,,,,


In [13]:
import re

# Lookup table from the digit strings "0".."25" to English number words.
# Keys are strings, not ints. The compound numbers 21-25 are written as a
# single word with no hyphen or space (e.g. "twentyone").
number_words = {
    "0": "zero",
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",
    "9": "nine",
    "10": "ten",
    "11": "eleven",
    "12": "twelve",
    "13": "thirteen",
    "14": "fourteen",
    "15": "fifteen",
    "16": "sixteen",
    "17": "seventeen",
    "18": "eighteen",
    "19": "nineteen",
    "20": "twenty",
    "21": "twentyone",
    "22": "twentytwo",
    "23": "twentythree",
    "24": "twentyfour",
    "25": "twentyfive"
}

def convert_mana_cost(mana_string):
    """Spell out a brace-delimited mana cost such as ``{3}{U}`` in words.

    Each ``{...}`` symbol is handled independently, in order:

    * an all-digit symbol becomes ``"<number word> colorless"``; a number
      without an entry in ``number_words`` is kept as digits instead of
      raising ``KeyError`` (the same fallback ``replace_symbols`` uses);
    * a symbol that is a key of ``color_map`` becomes its color name;
    * any other symbol is skipped.

    Parameters
    ----------
    mana_string : str or missing value
        The raw mana cost. A missing value (per ``pd.isna``) yields ``"zero"``.

    Returns
    -------
    str
        The converted symbols joined by single spaces; an empty string when
        no symbol was recognised.
    """
    if pd.isna(mana_string):
        return "zero"

    converted_symbols = []
    # Non-greedy match so each pair of braces is captured on its own.
    for symbol in re.findall(r'\{(.*?)\}', mana_string):
        if symbol.isdigit():
            number = number_words.get(symbol, symbol)
            converted_symbols.append(f"{number} colorless")
        elif symbol in color_map:
            converted_symbols.append(color_map[symbol])

    return " ".join(converted_symbols)

# Overwrite manaCost with its spelled-out form.
# NOTE: not safe to re-run. Once converted, the values contain no {..}
# symbols, so a second pass turns every non-missing cost into an empty string.
df['manaCost'] = df['manaCost'].apply(convert_mana_cost)
df

Unnamed: 0,colorIdentity,colors,convertedManaCost,edhrecRank,isStarter,manaCost,manaValue,name,power,rarity,...,attractionLights,isFunny,asciiName,flavorName,defense,faceFlavorName,hasAlternativeDeckLimit,cardParts,isOversized,isTextless
Tukatongue Thallid,[green],[G],1.0,9980,True,green,1.0,0263d148-6b69-552f-aa8f-0e0a828a52c7,1,common,...,,,,,,,,,,
Moriok Replica,[black],[],3.0,13335,,three colorless,3.0,a9a8ce3a-2d95-566c-b3d3-d7ada7cf2ad6,2,common,...,,,,,,,,,,
Faerie Mechanist,[blue],[U],4.0,14051,True,three colorless blue,4.0,209175f2-8a24-5b0f-92fc-dc572dd7cbca,2,common,...,,,,,,,,,,
Ilysian Caryatid,[green],[G],2.0,1393,,one colorless green,2.0,3b4223be-41a3-5adb-b9ee-5e871101eb30,1,common,...,,,,,,,,,,
Centaur Omenreader,[green],[G],4.0,11696,,three colorless green,4.0,f0cade63-ec0f-5182-b089-24d93faef530,3,uncommon,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hellfire Mongrel,[red],[R],3.0,18730,,two colorless red,3.0,96fc59f6-41ce-5b8f-b94e-d1bb8f7d79ba,2,uncommon,...,,,,,,,,,,
Corpse Explosion,"[black, red]","[B, R]",3.0,8501,,one colorless black red,3.0,08fe876f-cc69-5e66-8bfe-695074ba2fcc,,rare,...,,,,,,,,,,
Mogg Squad,[red],[R],2.0,25802,,one colorless red,2.0,12e22e5e-d947-5039-b519-55138cf99ba0,3,uncommon,...,,,,,,,,,,
Roots of Wisdom,[green],[G],2.0,9293,,one colorless green,2.0,251de176-9a6a-5e83-a862-b24a3ca8a4bb,,common,...,,,,,,,,,,


In [14]:
# Spell out power and toughness using number_words. Values that are not keys
# of number_words are passed through unchanged, so re-running this cell does
# not alter already-converted values.
df['power'] = df['power'].apply(lambda x: number_words.get(x, x))
df['toughness'] = df['toughness'].apply(lambda x: number_words.get(x, x))
df

Unnamed: 0,colorIdentity,colors,convertedManaCost,edhrecRank,isStarter,manaCost,manaValue,name,power,rarity,...,attractionLights,isFunny,asciiName,flavorName,defense,faceFlavorName,hasAlternativeDeckLimit,cardParts,isOversized,isTextless
Tukatongue Thallid,[green],[G],1.0,9980,True,green,1.0,0263d148-6b69-552f-aa8f-0e0a828a52c7,one,common,...,,,,,,,,,,
Moriok Replica,[black],[],3.0,13335,,three colorless,3.0,a9a8ce3a-2d95-566c-b3d3-d7ada7cf2ad6,two,common,...,,,,,,,,,,
Faerie Mechanist,[blue],[U],4.0,14051,True,three colorless blue,4.0,209175f2-8a24-5b0f-92fc-dc572dd7cbca,two,common,...,,,,,,,,,,
Ilysian Caryatid,[green],[G],2.0,1393,,one colorless green,2.0,3b4223be-41a3-5adb-b9ee-5e871101eb30,one,common,...,,,,,,,,,,
Centaur Omenreader,[green],[G],4.0,11696,,three colorless green,4.0,f0cade63-ec0f-5182-b089-24d93faef530,three,uncommon,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hellfire Mongrel,[red],[R],3.0,18730,,two colorless red,3.0,96fc59f6-41ce-5b8f-b94e-d1bb8f7d79ba,two,uncommon,...,,,,,,,,,,
Corpse Explosion,"[black, red]","[B, R]",3.0,8501,,one colorless black red,3.0,08fe876f-cc69-5e66-8bfe-695074ba2fcc,,rare,...,,,,,,,,,,
Mogg Squad,[red],[R],2.0,25802,,one colorless red,2.0,12e22e5e-d947-5039-b519-55138cf99ba0,three,uncommon,...,,,,,,,,,,
Roots of Wisdom,[green],[G],2.0,9293,,one colorless green,2.0,251de176-9a6a-5e83-a862-b24a3ca8a4bb,,common,...,,,,,,,,,,


In [15]:
def replace_symbols(text):
    """Rewrite the symbols and numbers in a card's rules text as plain words.

    Transformations, in order:

    1. Back-to-back symbols are separated by a space, so ``{G}{G}`` ends up
       as ``green green`` rather than ``greengreen``.
    2. ``{T}`` becomes ``tap``.
    3. Every ``{<key>}`` for a key of ``color_map`` becomes the color name.
    4. ``{<digits>}`` becomes ``"<number word> colorless"`` (digits are kept
       when the number has no entry in ``number_words``). No trailing space
       is added, so ``{2}, {T}:`` reads ``two colorless, tap:``.
    5. ``+`` becomes ``plus ``. A minus sign (ASCII ``-`` or U+2212) becomes
       ``minus `` only when it introduces a number or ``X`` and is not
       attached to a preceding word character, so hyphenated words such as
       ``non-Human`` are left intact.
    6. Remaining standalone numbers become number words.
    7. ``/`` becomes `` by `` (``1/1`` -> ``one by one``).

    Parameters
    ----------
    text : str
        The rules text to convert.

    Returns
    -------
    str
        The converted text.
    """
    # Step 1: keep adjacent symbols apart once the braces are gone.
    text = text.replace("}{", "} {")

    # Step 2: tap symbol.
    text = text.replace("{T}", "tap")

    # Step 3: color symbols.
    for symbol, color in color_map.items():
        text = text.replace(f"{{{symbol}}}", color)

    # Step 4: numeric symbols such as {1}, {2}.
    text = re.sub(
        r'\{(\d+)\}',
        lambda match: f"{number_words.get(match.group(1), match.group(1))} colorless",
        text,
    )

    # Step 5: signs. This has to run before the digits are spelled out,
    # because the minus rule looks for the digit that follows the sign.
    text = text.replace("+", "plus ")
    text = re.sub(r'(?<!\w)[-\u2212](?=\d|X\b)', 'minus ', text)

    # Step 6: standalone numbers. The lookbehind skips a number that directly
    # follows an opening brace (i.e. one inside a symbol that was not
    # converted above).
    text = re.sub(
        r'(?<!\{)(\b\d+\b)',
        lambda match: number_words.get(match.group(1), match.group(1)),
        text,
    )

    # Step 7: slash, as in power/toughness.
    text = text.replace("/", " by ")

    return text

# Overwrite the rules text with its spelled-out form, then display the
# converted column.
df['text'] = df['text'].apply(replace_symbols)
df['text']

Tukatongue Thallid    When Tukatongue Thallid dies, create a one by ...
Moriok Replica        one colorless black, Sacrifice Moriok Replica:...
Faerie Mechanist      Flying\nWhen Faerie Mechanist enters the battl...
Ilysian Caryatid      tap: Add one mana of any color. If you control...
Centaur Omenreader    As long as Centaur Omenreader is tapped, creat...
                                            ...                        
Hellfire Mongrel      At the beginning of each opponent's upkeep, if...
Corpse Explosion      As an additional cost to cast this spell, exil...
Mogg Squad            Mogg Squad gets minus one by minus one for eac...
Roots of Wisdom       Mill three cards, then return a land card or E...
Raven Guild Master    Whenever Raven Guild Master deals combat damag...
Name: text, Length: 26143, dtype: object

In [16]:
# Copy the index labels into a regular 'card_name' column before exporting:
# to_csv is called with index=False, so the index itself is not written to
# the file.
df['card_name'] = df.index
df.to_csv('data_download/clean_df.csv', index=False)
df

Unnamed: 0,colorIdentity,colors,convertedManaCost,edhrecRank,isStarter,manaCost,manaValue,name,power,rarity,...,isFunny,asciiName,flavorName,defense,faceFlavorName,hasAlternativeDeckLimit,cardParts,isOversized,isTextless,card_name
Tukatongue Thallid,[green],[G],1.0,9980,True,green,1.0,0263d148-6b69-552f-aa8f-0e0a828a52c7,one,common,...,,,,,,,,,,Tukatongue Thallid
Moriok Replica,[black],[],3.0,13335,,three colorless,3.0,a9a8ce3a-2d95-566c-b3d3-d7ada7cf2ad6,two,common,...,,,,,,,,,,Moriok Replica
Faerie Mechanist,[blue],[U],4.0,14051,True,three colorless blue,4.0,209175f2-8a24-5b0f-92fc-dc572dd7cbca,two,common,...,,,,,,,,,,Faerie Mechanist
Ilysian Caryatid,[green],[G],2.0,1393,,one colorless green,2.0,3b4223be-41a3-5adb-b9ee-5e871101eb30,one,common,...,,,,,,,,,,Ilysian Caryatid
Centaur Omenreader,[green],[G],4.0,11696,,three colorless green,4.0,f0cade63-ec0f-5182-b089-24d93faef530,three,uncommon,...,,,,,,,,,,Centaur Omenreader
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hellfire Mongrel,[red],[R],3.0,18730,,two colorless red,3.0,96fc59f6-41ce-5b8f-b94e-d1bb8f7d79ba,two,uncommon,...,,,,,,,,,,Hellfire Mongrel
Corpse Explosion,"[black, red]","[B, R]",3.0,8501,,one colorless black red,3.0,08fe876f-cc69-5e66-8bfe-695074ba2fcc,,rare,...,,,,,,,,,,Corpse Explosion
Mogg Squad,[red],[R],2.0,25802,,one colorless red,2.0,12e22e5e-d947-5039-b519-55138cf99ba0,three,uncommon,...,,,,,,,,,,Mogg Squad
Roots of Wisdom,[green],[G],2.0,9293,,one colorless green,2.0,251de176-9a6a-5e83-a862-b24a3ca8a4bb,,common,...,,,,,,,,,,Roots of Wisdom


In [17]:
# Reload the cleaned export. low_memory=False makes pandas read the file in
# one pass and infer each column's dtype from all of its values instead of
# chunk by chunk.
# NOTE(review): added because the stray echoed source line in this cell's
# saved output looks like a pandas warning about mixed column types
# (DtypeWarning) -- confirm it is gone after re-running.
df_new = pd.read_csv('data_download/clean_df.csv', low_memory=False)
df_new

  df_new = pd.read_csv('data_download/clean_df.csv')


Unnamed: 0,colorIdentity,colors,convertedManaCost,edhrecRank,isStarter,manaCost,manaValue,name,power,rarity,...,isFunny,asciiName,flavorName,defense,faceFlavorName,hasAlternativeDeckLimit,cardParts,isOversized,isTextless,card_name
0,['green'],['G'],1.0,9980.0,True,green,1.0,0263d148-6b69-552f-aa8f-0e0a828a52c7,one,common,...,,,,,,,,,,Tukatongue Thallid
1,['black'],[],3.0,13335.0,,three colorless,3.0,a9a8ce3a-2d95-566c-b3d3-d7ada7cf2ad6,two,common,...,,,,,,,,,,Moriok Replica
2,['blue'],['U'],4.0,14051.0,True,three colorless blue,4.0,209175f2-8a24-5b0f-92fc-dc572dd7cbca,two,common,...,,,,,,,,,,Faerie Mechanist
3,['green'],['G'],2.0,1393.0,,one colorless green,2.0,3b4223be-41a3-5adb-b9ee-5e871101eb30,one,common,...,,,,,,,,,,Ilysian Caryatid
4,['green'],['G'],4.0,11696.0,,three colorless green,4.0,f0cade63-ec0f-5182-b089-24d93faef530,three,uncommon,...,,,,,,,,,,Centaur Omenreader
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26138,['red'],['R'],3.0,18730.0,,two colorless red,3.0,96fc59f6-41ce-5b8f-b94e-d1bb8f7d79ba,two,uncommon,...,,,,,,,,,,Hellfire Mongrel
26139,"['black', 'red']","['B', 'R']",3.0,8501.0,,one colorless black red,3.0,08fe876f-cc69-5e66-8bfe-695074ba2fcc,,rare,...,,,,,,,,,,Corpse Explosion
26140,['red'],['R'],2.0,25802.0,,one colorless red,2.0,12e22e5e-d947-5039-b519-55138cf99ba0,three,uncommon,...,,,,,,,,,,Mogg Squad
26141,['green'],['G'],2.0,9293.0,,one colorless green,2.0,251de176-9a6a-5e83-a862-b24a3ca8a4bb,,common,...,,,,,,,,,,Roots of Wisdom


In [20]:
def get_names_descriptions(df):
    """Build one free-text description per row and return it with the names.

    The description is
    ``"<text> cost <manaCost> toughness <toughness> power <power> types <type>"``.
    A missing value in any of those columns contributes an empty string;
    plain string concatenation would otherwise turn the whole description
    into NaN as soon as a single part is missing.

    Side effect: the result is also stored on ``df`` as a new (or
    overwritten) ``description`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``card_name``, ``text``, ``manaCost``,
        ``toughness``, ``power`` and ``type``.

    Returns
    -------
    tuple
        ``(card_names, descriptions)`` -- the ``.values`` of the
        ``card_name`` and ``description`` columns, in row order.
    """
    def _as_text(column):
        # Stringify a column, mapping missing cells to '' instead of 'nan'.
        missing = df[column].isna()
        return df[column].astype(str).mask(missing, '')

    df['description'] = (
        _as_text('text')
        + ' cost ' + _as_text('manaCost')
        + ' toughness ' + _as_text('toughness')
        + ' power ' + _as_text('power')
        + ' types ' + _as_text('type')
    )
    return df['card_name'].values, df['description'].values

# Build the name and description arrays from the reloaded frame. The call
# also adds a 'description' column to df_new as a side effect.
card_names, card_descriptions = get_names_descriptions(df_new)

card_names, card_descriptions

(array(['Tukatongue Thallid', 'Moriok Replica', 'Faerie Mechanist', ...,
        'Mogg Squad', 'Roots of Wisdom', 'Raven Guild Master'],
       dtype=object),
 array(['When Tukatongue Thallid dies, create a one by one green Saproling creature token. cost green toughness one power one types Creature — Fungus',
        'one colorless black, Sacrifice Moriok Replica: You draw two cards and you lose two life. cost three colorless toughness two power two types Artifact Creature — Warrior',
        'Flying\nWhen Faerie Mechanist enters the battlefield, look at the top three cards of your library. You may reveal an artifact card from among them and put it into your hand. Put the rest on the bottom of your library in any order. cost three colorless blue toughness two power two types Artifact Creature — Faerie Artificer',
        ...,
        'Mogg Squad gets minus one by minus one for each other creature on the battlefield. cost one colorless red toughness three power three types Creature — Go