In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
# Switch pyLDAvis to its notebook display mode.
pyLDAvis.enable_notebook()
# Download the NLTK resources needed by WordNetLemmatizer and stopwords.words().
nltk.download('wordnet')
nltk.download('stopwords')
# Topic model used further down: sklearn.decomposition.LatentDirichletAllocation

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the problem set and replace missing values with empty strings, so the
# regex-based cleaning in later cells always receives str values.
data = pd.read_csv('data.csv').fillna('')
data.head()

Unnamed: 0,contest,problem_name,problem_statement,problem_tags
0,325,A,You are given n rectangles. The corners of rec...,"implementation,*1500"
1,325,B,Daniel is organizing a football tournament. He...,"binarysearch,math,*1800"
2,325,C,Piegirl has found a monster and a book about m...,"dfsandsimilar,graphs,shortestpaths,*2600"
3,325,D,"In a far away land, there exists a planet shap...","dsu,*2900"
4,325,E,Piegirl found the red button. You have one las...,"combinatorics,dfsandsimilar,dsu,graphs,greedy,..."


In [3]:
# Dataset overview, computed before the duplicated statements are removed.
n_problems = data.shape[0]
n_contests = len(data['contest'].unique())
n_repeated = n_problems - len(data['problem_statement'].unique())

print(f'quantidade de problemas: {n_problems}')
print(f'quantidade de contests: {n_contests}')
print(f'problemas repetidos: {n_repeated}')

# Keep only the first occurrence of each problem statement.
data = data.drop_duplicates(subset='problem_statement')

quantidade de problemas: 8343
quantidade de contests: 1424
problemas repetidos: 1524


## Pré-processamento

In [24]:
sw = set(stopwords.words('english'))
lemma = WordNetLemmatizer()


def _clean_statement(statement):
    """Return a problem statement stripped of punctuation and stop-words.

    The listed special characters are replaced by spaces, tokens whose
    lowercase form is an English stop-word are dropped, and the remaining
    tokens are lemmatized and re-joined with single spaces.
    """
    without_punct = re.sub('[,|.|(|)|$|!|?|!]', ' ', statement)
    kept_tokens = [tok for tok in without_punct.split() if tok.lower() not in sw]
    # TODO: revisit the lemmatization step.
    return ' '.join(lemma.lemmatize(tok) for tok in kept_tokens)


def _parse_tags(raw_tags):
    """Split a comma-separated tag string into a list of non-empty tags.

    Rating markers such as ``*800`` are removed before splitting.
    """
    without_rating = re.sub('[*][0-9]+', '', raw_tags)
    return [tag for tag in without_rating.split(',') if tag != '']


# Problem statements -> cleaned, lemmatized text.
text = data.problem_statement.apply(_clean_statement)
# TODO: bigrams and trigrams.

# Problem tags -> list of topic names per problem.
topics = data.problem_tags.apply(_parse_tags)

In [5]:
# Preview the cleaned problem statements.
text

0       given n rectangle corner rectangle integer coo...
1       Daniel organizing football tournament come fol...
2       Piegirl found monster book monster pie reading...
3       far away land exists planet shaped like cylind...
4       Piegirl found red button one last chance chang...
                              ...                        
8338    n block arranged row numbered left right start...
8339    map capital Berland viewed infinite coordinate...
8340    play strategic video game yeah ran good proble...
8341    first let's define function f x follows: \begi...
8342    Recently lot student enrolled Berland State Un...
Name: problem_statement, Length: 6819, dtype: object

In [25]:
# Preview the per-problem tag lists.
topics

0                                        [implementation]
1                                    [binarysearch, math]
2                  [dfsandsimilar, graphs, shortestpaths]
3                                                   [dsu]
4       [combinatorics, dfsandsimilar, dsu, graphs, gr...
                              ...                        
8338                                       [greedy, math]
8339       [bruteforce, geometry, greedy, implementation]
8340    [datastructures, dp, greedy, implementation, s...
8341              [binarysearch, combinatorics, dp, math]
8342                                         [bruteforce]
Name: problem_tags, Length: 6819, dtype: object

## Vetorizando

In [7]:
# Document-frequency bounds for the vocabulary. As floats they are proportions
# of documents: terms in more than 80% or fewer than 5% of the statements are ignored.
max_df = 0.8
min_df = 0.05
vec = CountVectorizer(max_df=max_df, min_df=min_df)

In [8]:
# Learn the vocabulary from the cleaned statements and build the
# document-term count matrix; the shape is displayed as (documents, terms).
X = vec.fit_transform(text)
X.shape

(6819, 250)

## Agrupando

In [21]:
# Collect the distinct tags that occur across all problems.
#
# The set is built with a comprehension and `topics` is NOT reassigned, so the
# per-problem tag lists stay intact and this cell gives the same result every
# time it is run.
total_topics = {tag for tags in topics for tag in tags}

In [22]:
# Number of distinct tags; reused below as the number of LDA components.
n_total_topics = len(total_topics)
print(f'Número de Tópicos: {n_total_topics}')
# Display the tag set itself.
total_topics

Número de Tópicos: 37


{'*specialproblem',
 '2-sat',
 'binarysearch',
 'bitmasks',
 'bruteforce',
 'chineseremaindertheorem',
 'combinatorics',
 'constructivealgorithms',
 'datastructures',
 'dfsandsimilar',
 'divideandconquer',
 'dp',
 'dsu',
 'expressionparsing',
 'fft',
 'flows',
 'games',
 'geometry',
 'graphmatchings',
 'graphs',
 'greedy',
 'hashing',
 'implementation',
 'interactive',
 'math',
 'matrices',
 'meet-in-the-middle',
 'numbertheory',
 'probabilities',
 'schedules',
 'shortestpaths',
 'sortings',
 'strings',
 'stringsuffixstructures',
 'ternarysearch',
 'trees',
 'twopointers'}

In [11]:
# One LDA component per distinct tag found in the dataset.
n_components = n_total_topics
max_iter = 100
# Fixed seed so the fitted topics are reproducible across
# "Restart Kernel -> Run All" executions.
random_state = 42

lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=max_iter,
    random_state=random_state,
)

In [12]:
# Fit the LDA model on the count matrix and keep the transformed output
# (the document-topic matrix returned by fit_transform).
lda_matrix = lda.fit_transform(X)

In [13]:
# Topic-term weights of the fitted model and the vocabulary they refer to.
top_word = lda.components_
terms = vec.get_feature_names_out()

# For each topic, list the ten terms with the largest weight.
for topic_idx, weights in enumerate(top_word):
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    best_terms = [term for term, _ in ranked[:10]]
    print("Topic "+str(topic_idx)+": ", best_terms)

Topic 0:  ['square', 'answer', 'correct', 'side', 'absolute', 'length', 'size', 'exceed', '10', 'considered']
Topic 1:  ['city', 'one', 'two', 'possible', 'way', 'th', 'connected', 'next', 'print', 'numbered']
Topic 2:  ['two', 'sample', 'modulo', 'different', '109', 'answer', 'single', '000', 'one', 'second']
Topic 3:  ['print', 'without', 'yes', 'otherwise', 'quote', 'no', 'two', 'second', 'possible', 'impossible']
Topic 4:  ['query', 'type', 'answer', 'one', 'two', 'th', 'next', 'second', 'following', 'guaranteed']
Topic 5:  ['time', 'second', 'moment', 'one', 'start', 'take', 'end', 'go', 'th', 'get']
Topic 6:  ['one', 'possible', 'second', 'maximum', 'minimum', 'two', 'want', 'total', 'get', 'make']
Topic 7:  ['segment', 'good', 'length', 'one', 'given', 'example', 'end', 'even', 'call', 'exactly']
Topic 8:  ['value', 'position', 'equal', 'maximum', 'let', 'calculate', 'mean', 'initial', 'consider', 'inclusive']
Topic 9:  ['tree', 'vertex', 'edge', 'given', 'path', 'two', 'next', 

## Validando

In [14]:
# TODO: validate the fitted topics (not implemented yet)

## Visualizando

In [15]:
# Build the interactive pyLDAvis panel for the fitted model, using t-SNE for
# the topic layout. pyLDAvis and pyLDAvis.sklearn are imported, and
# enable_notebook() is called, in the first cell, so that setup is not
# repeated here.
panel = pyLDAvis.sklearn.prepare(lda, X, vec, mds='tsne')
panel
# TODO

  by='saliency', ascending=False).head(R).drop('saliency', 1)
