In [2]:
RANDOM_SEED=2022
import pandas as pd
import numpy as np
# !pip install fastparquet -q
# !pip install tf-nightly
from tensorflow import keras
from keras.utils import np_utils
from keras import optimizers
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import tensorflow as tf
import string
from sklearn.model_selection import train_test_split
tf.random.set_seed(RANDOM_SEED)
from numpy.random import seed
seed(RANDOM_SEED)
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kuanchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kuanchen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/kuanchen/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
#A Colab pro environment should have >20Gb of total memory.
from psutil import virtual_memory
colab_pro = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(colab_pro))

if colab_pro < 20:
  print('Not using a high-RAM runtime')
  # train model with lower settings
else:
  print('You are using a high-RAM runtime!')
  # train model with higher settings

Your runtime has 201.1 gigabytes of available RAM

You are using a high-RAM runtime!


### Data Prep

In [4]:
#read the file
game_df = pd.read_parquet('https://github.com/canunj/deconstructing_games/blob/main/ranked_df.parquet.gzip?raw=true', engine='fastparquet')

In [38]:
#filter the games from the year 2000 onwards
game_df = game_df[game_df.year>=2000]
display(game_df.shape)

display(game_df.sample(2)[['name','category','description','publisher']])
game_titles=game_df[['name']]
game_titles_train, game_titles_test=train_test_split(game_titles,train_size=0.95,random_state=2022)
print(f'train size:{game_titles_train.shape}, test size:{game_titles_test.shape},')

(17289, 33244)

Unnamed: 0,name,category,description,publisher
75641,Clash of Steel: A Tactical Card Game of Mediev...,"['Card Game', 'Fighting', 'Medieval']","A competitive, low-luck, 15-minute card game f...",[Sigil Stone Publishing]
41296,GOSU,"['Card Game', 'Fantasy']","In a fantasy world dominated by goblins, when ...",[Moonster Games]


train size:(16424, 1), test size:(865, 1),


### Preprocessing Text

In [6]:
def text_process(text):
    text=''.join([t for t in text if t not in string.punctuation])
    text=text.encode('utf-8').decode('utf-8','strict')
    return text.lower()

tokenizer=Tokenizer() # make sure to save this tokenizer to be able to load the model
def n_gram_vectors(dictionary):
  # tokenization
    tokenizer.fit_on_texts(dictionary)
    unique_words = len(tokenizer.word_index) + 1
    
  # convert data to token index 
    token_vectors = []
    for text in dictionary:
        tokens = tokenizer.texts_to_sequences([text])[0]
        for i in range(1, len(tokens)):
            n_gram_vector = tokens[:i+1]
            token_vectors.append(n_gram_vector)
    return token_vectors, unique_words


def padded_tokens(token_index):
    max_len = max([len(x) for x in token_index])
    token_index = np.array(pad_sequences(token_index, maxlen=max_len, padding='pre'))
    
    X, y = token_index[:,:-1],token_index[:,-1]
    y = keras.utils.to_categorical(y, num_classes=unique_words)
    return X, y, max_len

In [7]:
dictionary_cleaned=game_titles_train.name.apply(text_process)
token_index,unique_words=n_gram_vectors(dictionary_cleaned)
X, y, max_len = padded_tokens(token_index)

### RNN Model set up and training

#### to load the model

In [119]:
import pickle
generator_model=keras.models.load_model(r'./GeneratorAssets/generative_model1')
with open('Generator_Outputs/Models_Assets/generative_tokenizer_v1.pickle', 'rb') as f:
    tokenizer = pickle.load(f)
    
    
generator_model=keras.models.load_model(r'./GeneratorAssets/generative_model_v7')
with open('generative_tokenizer.pickle', 'rb') as f:
    tokenizer = pickle.load(f)



In [29]:
input_len = max_len - 1
dropout=0.2
output_dim=10
neurons=128*6
epochs=100
batch_size=128*4

# Neural network set up
generator_model = Sequential()
#Input Embedding Layer
generator_model.add(Embedding(unique_words, output_dim=output_dim, input_length=input_len))

#Hidden Layer 1 - LSTM
generator_model.add(LSTM(units=neurons,recurrent_dropout=dropout,return_sequences=True))

# #Hidden Layer 2 - LSTM
# generator_model.add(LSTM(units=neurons,recurrent_dropout=dropout,return_sequences=True))

#Hidden Layer 2 - LSTM
generator_model.add(LSTM(units=neurons,recurrent_dropout=dropout))

lr_schedule=tf.keras.optimizers.Adam(learning_rate=keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=100,
    decay_rate=0.01,
    staircase=True))

#Output Layer
generator_model.add(Dense(unique_words, activation='softmax'))
generator_model.compile(loss='categorical_crossentropy', 
                        optimizer='adam'
#                       lr_schedule
                       )

display(generator_model.summary())

history=generator_model.fit(X,y,epochs=epochs,validation_split=0.05,batch_size=batch_size)









Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 16, 10)            139840    
                                                                 
 lstm_39 (LSTM)              (None, 16, 768)           2393088   
                                                                 
 lstm_40 (LSTM)              (None, 768)               4721664   
                                                                 
 dense_7 (Dense)             (None, 13984)             10753696  
                                                                 
Total params: 18,008,288
Trainable params: 18,008,288
Non-trainable params: 0
_________________________________________________________________


None

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


### Model evaluation

In [150]:
import plotly.express as px
model_history=pd.DataFrame(history.history).reset_index()
model_history.columns=['TrainingRound','CategoricalCrossEntropyLoss','ValidationLoss']
fig=px.line(model_history,x='TrainingRound',y=model_history.columns[1:])
fig.update_layout(title='Model1 Training losses performance over 100 rounds <br> (recurrent) dropout=0.2, 512 nodes, 2 layers, adam optimizer',
                   xaxis_title='Training Round',
                   yaxis_title='Loss Values')
fig.show()
model_history.to_csv('evaluation/model1_performance.csv',index=False)

## Examine Title Generator Output

In [14]:
def text_generator(start_text, n_next_words, model, max_len):
    for _ in range(n_next_words):
        token_list = tokenizer.texts_to_sequences([start_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
        generated = np.argmax(model.predict(token_list), axis=-1)

        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == generated:
                output_word = word
                break
        start_text += " "+output_word
    return start_text.title()

def generate_title(start_text):
    
    def text_generator(start_text, n_next_words, model, max_len):
        for _ in range(n_next_words):
            token_list = tokenizer.texts_to_sequences([start_text])[0]
            token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
            generated = np.argmax(model.predict(token_list), axis=-1)

            generated_word = ""
            for word,index in tokenizer.word_index.items():
                if index==generated:
                    generated_word = word
                    break
            start_text+=" "+generated_word
        return start_text.title()
    
    start_text=start_text.lower()
    text_split=start_text.split()
    stop_words=set(stopwords.words('english'))
    stop_words.update(['–','vs','card','board'] + list(set(stopwords.words('spanish'))))
    if text_split[0] in stop_words:
        gen_title=text_generator(" ".join(text_split[:2]), len(text_split), generator_model, max_len)
    else:
        gen_title=text_generator(text_split[0], len(text_split), generator_model, max_len)
        
    return gen_title

In [99]:
# 2 layers and 256 units
rand_text=[9,8,9,8,8]
start_text=['board game','conan','united states','conquer','the story']

for rand,s_text in zip(rand_text,start_text):
    gen=text_generator(s_text, rand, generator_model, max_len)
    print(gen)

Board Game Cafe Frenzy The Card Game – The Age Of
Conan The Gathering – Duel Decks Merfolk Vs Inventors
United States Dice Masters Justice The Deserted Lighthouse The Gallipoli Campaign
Conquer Dice Game Rebirth Of The Righteous – The
The Story Of Life Pirates Of The Caribbean – The


In [142]:
# 2 layers 512 unis
rand_text=[9,8,9,8,8]
start_text=['board game','conan','united states','conquer','the story']

for rand,s_text in zip(rand_text,start_text):
    gen=text_generator(s_text, rand, generator_model, max_len)
    print(gen)

Board Game Cafe Frenzy 102 The Cities – The Peloponnesian War
Conan Epic Empresario The Devil Alien Game – The
United States Ultimate Romanian Campaign Game Of Magmaroth – – Signature
Conquer The Masquerade – Duel Decks Jace Vs Kiora
The Story Of The Rings The Two Towers Of The


In [161]:
# 4 layers 512 unis
rand_text=[9,8,9,8,8]
start_text=['board game','conan','united states','conquer','the story']

for rand,s_text in zip(rand_text,start_text):
    gen=text_generator(s_text, rand, generator_model, max_len)
    print(gen)

Board Game The The The The The The The The The
Conan The The The The The The The The
United States The The The The The The The The The
Conquer The The The The The The The The
The Story The The The The The The The The


In [15]:
# 4 layers 128 unis
rand_text=[9,8,9,8,8]
start_text=['board game','conan','united states','conquer','the story']

for rand,s_text in zip(rand_text,start_text):
    gen=text_generator(s_text, rand, generator_model, max_len)
    print(gen)

Board Game The Card Game – The Game Of The Bulge
Conan The Game Of The Bulge Dead – 1914
United States The Game – The Sorcerers Grenadier Game Justice Bubblegum
Conquer The Game Of The Bulge Dead – 1914
The Story Of Life The Game Of The Bulge Grenadier


In [31]:
# 2 layers loads of units
rand_text=[9,8,9,8,8]
start_text=['board game','conan','united states','conquer','the story']

for rand,s_text in zip(rand_text,start_text):
    gen=text_generator(s_text, rand, generator_model, max_len)
    print(gen)

Board Game Cafe Frenzy Card Game – Sharpshooter Vs Bruiser Goblins
Conan Battle Of The Asagiri Goddess Mayjune 1940 Edition
United States The Board Game – Revised Core Set – 100
Conquer The Strongest Explodes The Prison Street Irregulars Game
The Story Wars 18051815 Face Watch The Boogeymen – The


In [34]:
game_titles_test['generated_title']=game_titles_test.name.apply(generate_title)
game_titles_test.columns=['title_orig','title_gen']
display(game_titles_test.sample(2))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,title_orig,title_gen
78313,King of the Dice,King Of The Caribbean World
76296,Rambo: The Board Game,Rambo: The Game – The


In [17]:
game_titles_test[['title_orig','title_gen']].to_csv('evaluation/title_generation_3.CSV')

In [18]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction, modified_precision
from nltk.translate import bleu
def bleu_score_2_titles(og_title, gen_title):
    og_title=og_title.split()
    gen_title=gen_title.split()
    chencherry = SmoothingFunction()
    return bleu([gen_title],og_title,smoothing_function=chencherry.method7)

In [35]:
game_titles_test['bleu_score']=game_titles_test.apply(lambda x: bleu_score_2_titles(x.title_orig, x.title_gen), axis=1)
game_titles_test.sample(2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,title_orig,title_gen,bleu_score
61684,Billionaire Banshee,Billionaire The Great,0.101641
57168,Koi Pond: A Coy Card Game,Koi Dice Masters The Amazing Spiderman Of,0.09424


In [36]:
from gensim.models import Word2Vec
import gensim
game_titles_test['combined']=game_titles_test['title_orig']+";"+game_titles_test['title_gen']
sent=[row.split(';') for row in game_titles_test['combined']]
w2v_model=Word2Vec(sent, min_count=1,vector_size= 50,workers=3, window =2, sg = 1)
w2v_model.build_vocab(sent, progress_per=10)
w2v_model.train(sent, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
game_titles_test['w2v_similarity_score']=game_titles_test.apply(lambda x: w2v_model.wv.similarity(x.title_orig, x.title_gen), axis=1)
game_titles_test.sample(2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,title_orig,title_gen,bleu_score,combined,w2v_similarity_score
80569,EXO: Mankind Reborn,Exo: The Great Bride,0.0,EXO: Mankind Reborn;Exo: The Great Bride,-0.11
62899,Trickerion: Legends of Illusion,Trickerion: The Great Bride The,0.108621,Trickerion: Legends of Illusion;Trickerion: Th...,0.005226


In [21]:
# generator_model.save('Models_Assets/generative_model_v4')
generator_model.save('evaluation/generative_model3')

2022-10-19 11:15:34.995955: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: evaluation/generative_model3/assets




In [157]:
import pickle
with open('generative_tokenizer_more_data.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [37]:
display(game_titles_test.sort_values(by='w2v_similarity_score',ascending=False)[['title_orig','title_gen','w2v_similarity_score','bleu_score']].head(8))
display(game_titles_test.sort_values(by='bleu_score',ascending=False)[['title_orig','title_gen','bleu_score','w2v_similarity_score']].head(8))
# game_titles_test[['title_orig','title_gen','bleu_score','w2v_similarity_score']].to_csv('evaluation/generative_model1_metrics.CSV')

Unnamed: 0,title_orig,title_gen,w2v_similarity_score,bleu_score
95860,Welcome to Sysifus Corp,Welcome To The Future Corregidor,0.398499,0.108621
103564,ECO: Coral Reef,Eco: The Great Bride,0.376985,0.0
74856,Rhino Hero: Super Battle,Rhino First Battles Of The,0.368058,0.108621
104997,Awimbawé,Awimbawé The,0.329408,0.070798
88073,Sherlock: Don's Legacy,Sherlock: Holmes Consulting Detective,0.328776,0.111686
61066,A Fistful of Dinero,A Fistful Of The Scorpion Clan,0.322661,0.156772
4554,Hilarium,Hilarium The,0.311533,0.070798
90898,Jekyll vs. Hyde,Jekyll Koro Fussball Gnomes,0.309664,0.111686


Unnamed: 0,title_orig,title_gen,bleu_score,w2v_similarity_score
90618,Exit: The Game – The Enchanted Forest,Exit: The Game – The Secret Lab Card,0.605825,-0.089743
79900,The Red Dragon Inn 7: The Tavern Crew,The Red Dragon Inn 6 Villains War 17541763 Sec...,0.317183,-0.032495
51768,Smash Up,Smash Up The,0.264385,0.165223
62596,Camelot: The Court,Camelot: The Card Game,0.229319,-0.172228
14165,Carcassonne: The Discovery,Carcassonne: The Lord Of,0.229319,0.149227
69253,Star Wars Trivia Game,Star Wars The Game Of,0.227984,-0.074345
76296,Rambo: The Board Game,Rambo: The Game – The,0.227984,-0.114067
70477,The Red Dragon Inn: Battle for Greyport,The Red Dragon Inn 6 Villains War 17541763 Second,0.211067,-0.104003


In [97]:
# df=pd.read_csv(r'evaluation/title_generation_3.CSV')
df=pd.read_csv(r'Generator_Outputs/CSV_Outputs/title_generation_metrics_v4.CSV')
game_titles_test=df
from gensim.models import Word2Vec
import gensim
game_titles_test['combined']=game_titles_test['title_orig']+";"+game_titles_test['title_gen']
sent=[row.split(';') for row in game_titles_test['combined']]
w2v_model=Word2Vec(sent, min_count=1,vector_size= 50,workers=3, window =2, sg = 1)
w2v_model.build_vocab(sent, progress_per=10)
w2v_model.train(sent, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
game_titles_test['w2v_similarity_score']=game_titles_test.apply(lambda x: w2v_model.wv.similarity(x.title_orig, x.title_gen), axis=1)
game_titles_test['bleu_score']=game_titles_test.apply(lambda x: bleu_score_2_titles(x.title_orig, x.title_gen), axis=1)
game_titles_test.sample(2)



Unnamed: 0.1,Unnamed: 0,title_orig,title_gen,bleu_score,w2v_similarity_score,combined
1267,79034,Bear Went Over the Mountain,Bear War The Card Game – The Board,0.066488,0.146236,Bear Went Over the Mountain;Bear War The Card ...
1371,54034,HomeStretch,Homestretch It The Game,0.0,-0.055203,HomeStretch;Homestretch It The Game
