## Modulos

In [1]:
## Data manipulation
import numpy as np
import pandas as pd

## Procesamiento del texto
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

## Text to vect
from sklearn.feature_extraction.text import TfidfVectorizer

## Distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

## Graphs
import plotly.express as px


## Load Data

In [3]:
df_boardgames= pd.read_csv("df_boardgames.csv")

In [4]:
url_data = (r'https://raw.githubusercontent.com/bluterplay/7moSemestre/main/Pokemon/games.csv')

data_csv = pd.read_csv(url_data)

data_csv.head()

Unnamed: 0,GameId,GameName,Description
0,148494,"1,2,3! Now you see me...",The animals on the farm are playing a game of ...
1,316377,7 Wonders (Second Edition),7 Wonders<br/>The board game with more awards ...
2,155987,Abyss,"The Abyss power is once again vacant, so the t..."
3,21569,Adigma,Adigma is a party game which involves simultan...
4,31260,Agricola,Description from BoardgameNews<br/><br/>In Agr...


In [7]:
prepro_boardgames=data_csv.copy()

## Text

### Drop HTML tags

In [8]:
prepro_boardgames["Description"]=prepro_boardgames["Description"].map(lambda x: x.replace("<br/>", " "))
prepro_boardgames["Description"]=prepro_boardgames["Description"].map(lambda x: x.replace("&quot;", " "))
prepro_boardgames["Description"]=prepro_boardgames["Description"].map(lambda x: x.replace("|", " "))

### Punctation characters

In [9]:
sep = '|'
punctuation_chars = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{}~'

In [10]:
mapping_table = str.maketrans(dict.fromkeys(punctuation_chars, ''))

In [11]:
text_features = ['Description']

In [12]:
for column in text_features:
    prepro_boardgames[column] = sep \
    .join(prepro_boardgames[column].tolist()) \
    .translate(mapping_table) \
    .split(sep)

### Lower cases

In [13]:
prepro_boardgames["Description"]=prepro_boardgames["Description"].map(lambda x: x.lower())
prepro_boardgames["Description"]=prepro_boardgames["Description"].map(lambda x: re.sub(' +', ' ', x))

In [14]:
data_text= prepro_boardgames.copy()

### Stop Words

In [15]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/bluterplay/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bluterplay/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
stopword_list  = stopwords.words('english')
stopword_list[:5]

['i', 'me', 'my', 'myself', 'we']

#### Split words

In [16]:
for column in text_features:
    prepro_boardgames[column] = prepro_boardgames[column].str.split()

#### Remove stopwords

In [19]:
for column in text_features:
    prepro_boardgames[column] = prepro_boardgames[column].apply(lambda x: [i for i in x if i not in stopword_list])
    prepro_boardgames[column]

In [20]:
prepro_boardgames.head()

Unnamed: 0,GameId,GameName,Description
0,148494,"1,2,3! Now you see me...","[animals, farm, playing, game, red, lightgreen..."
1,316377,7 Wonders (Second Edition),"[7, wonders, board, game, awards, game, planet..."
2,155987,Abyss,"[abyss, power, vacant, time, come, get, hands,..."
3,21569,Adigma,"[adigma, party, game, involves, simultaneous, ..."
4,31260,Agricola,"[description, boardgamenews, agricola, youre, ..."


### Stemer

In [21]:
stemmer = SnowballStemmer('english')

In [22]:
for column in text_features:
    prepro_boardgames[column] = prepro_boardgames[column].apply(lambda x: [stemmer.stem(i) for i in x])

#### Join words

In [23]:
for column in text_features:
    prepro_boardgames[column] = prepro_boardgames[column].apply(lambda x: ' '.join(x))

In [24]:
prepro_boardgames["Description"]=prepro_boardgames["Description"].map(lambda x: re.sub(' +', ' ', x))

In [25]:
prepro_boardgames.head()

Unnamed: 0,GameId,GameName,Description
0,148494,"1,2,3! Now you see me...",anim farm play game red lightgreen light decid...
1,316377,7 Wonders (Second Edition),7 wonder board game award game planet 30 inter...
2,155987,Abyss,abyss power vacant time come get hand throne p...
3,21569,Adigma,adigma parti game involv simultan riddl puzzl ...
4,31260,Agricola,descript boardgamenew agricola your farmer woo...


In [33]:
df_text= prepro_boardgames.copy()
df_text.to_csv("df_text.csv", index=False)

## Doc Term Matrix

### Select words by Tfi df

In [27]:
vectorizerTfi= TfidfVectorizer()
vecTfi= vectorizerTfi.fit_transform(df_text["Description"])

In [28]:
td_tfi = pd.DataFrame(vecTfi.todense())
td_tfi.columns = vectorizerTfi.get_feature_names_out()

term_document_matrix_tfi = td_tfi.T
term_document_matrix_tfi.columns = [str(x) for x in df_text.GameId.unique()]

In [29]:
term_document_matrix_tfi

Unnamed: 0,148494,316377,155987,21569,31260,161970,124742,17329,25643,12005,...,180771,321720,170416,224783,247585,282216,200750,224710,230345,248065
00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
虚空に向けて銃を撃ち,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
見えないヴァンパイアを攻撃します,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
読めない文字で綴られた手紙,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
通常のカードセットに加えて,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
relevant_words= set()
for column in df_text.GameId.unique():
    ls_words=list(term_document_matrix_tfi.sort_values(by=str(column), ascending=False)[:5].index)
    for word in ls_words:
        relevant_words.add(word)

In [36]:
term_document_matrix= term_document_matrix_tfi.filter(items=list(relevant_words), axis=0).T

In [38]:
term_document_matrix.sample(5)

Unnamed: 0,swintus,persuas,unmatch,tokugawa,suburbia,seamless,plastic,state,360,bean,...,paddl,supermarket,taboo,export,pepper,concordia,maus,rogu,natur,orbit
303650,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
287941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
230244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061311,0.0
98351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
term_document_matrix.to_csv("top5.csv")

### Doc Term

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
dc_games= dict(df_boardgames["GameId"])

In [69]:
vect = CountVectorizer()  
vects = vect.fit_transform(df_text["Description"])

In [70]:
td = pd.DataFrame(vects.todense())

In [71]:
td.columns = vect.get_feature_names_out()
term_document_matrix = td.T

In [72]:
term_document_matrix.rename(columns=dc_games, inplace=True)

In [73]:
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)

In [74]:
# Top 25 words 
top = term_document_matrix.sort_values(by ='total_count',ascending=False)[:100] 

In [77]:
top.T.to_csv("Top100.csv")

In [78]:
top

Unnamed: 0,148494,316377,155987,21569,31260,161970,124742,17329,25643,12005,...,321720,170416,224783,247585,282216,200750,224710,230345,248065,total_count
player,3,4,4,0,4,6,2,4,4,4,...,0,0,2,7,3,1,3,1,3,7531
game,1,9,2,2,5,4,3,2,2,0,...,3,1,3,5,4,0,0,0,3,6161
card,0,8,3,0,7,0,4,0,1,9,...,2,0,1,0,2,4,2,3,0,5256
one,1,1,1,0,3,0,0,2,1,1,...,1,2,0,0,0,0,1,0,1,2657
play,1,3,0,0,2,5,0,0,1,2,...,0,2,2,0,1,0,0,0,1,2231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
expans,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,338
money,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,338
locat,0,0,2,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,332
around,0,0,0,0,0,0,0,0,0,2,...,0,0,0,3,0,0,1,0,0,332


## Add text

In [4]:
df_top5=pd.read_csv("top5.csv")
df_top10=pd.read_csv("top10.csv")
df_top100=pd.read_csv("Top100.csv")

In [16]:
df_top100=df_top100.iloc[:-1,:]

In [17]:
df_top100.columns

Index(['Unnamed: 0', 'player', 'game', 'card', 'one', 'play', 'point', 'new',
       'take', 'use',
       ...
       'anoth', 'combin', 'even', 'possibl', 'help', 'expans', 'money',
       'locat', 'around', 'give'],
      dtype='object', length=101)

In [20]:
dc_description= dict(df_boardgames.set_index("GameId")["Description"])

In [30]:
ls_dfs= [df_top5,df_top10,df_top100]

In [33]:
 df_top100["Description"]= df['Unnamed: 0'].apply(lambda x: dc_description[int(x)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top100["Description"]= df['Unnamed: 0'].apply(lambda x: dc_description[int(x)])


In [35]:
df_top100.to_csv("top100.csv", index=False)

In [27]:
for df in ls_dfs:
        df["Description"]= df['Unnamed: 0'].apply(lambda x: dc_description[int(x)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Description"]= df['Unnamed: 0'].apply(lambda x: dc_description[int(x)])


In [31]:
for i, name in enumerate(["5","10","100"]):
    df= ls_dfs[i]
    df.to_csv("top"+name+".csv", index=False)