In [17]:
import pandas as pd
from gensim.corpora import *
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from gensim.models import CoherenceModel
from tqdm.auto import tqdm

In [18]:
df = pd.read_csv("capitalization.csv")
df = df[df.n_mean_caps > 0]
df = df[~df.activity.isna()] 
df.activity[:10]

1     Kindergarten | Fictional Character | Internet ...
2     Internet Media | Internet Media | Society | Ed...
6     Cars | Youth Organization | Society | Animatio...
8     Internet Media | Fashion, Clothing, Shoes | In...
10    Internet Media | Fictional Character | Humor |...
16    City Community | Internet Media | Internet Med...
23    Discussion Сlub | Humor | Internet Media | Gam...
27    Internet Media | Humor | Humor | Humor | Video...
33    Internet Media | Show, Program | Healthy Lifes...
43    Internet Media | Humor | Internet Media | Crea...
Name: activity, dtype: object

In [67]:
activities = df.activity.values.tolist()
activities = [i.split(" | ") for i in activities]

In [20]:
activities[0][:5]

['Kindergarten',
 'Fictional Character',
 'Internet Media',
 'Philosophy',
 'Humor']

# LDA model

In [21]:
dictionary = Dictionary(activities)
corpus = [dictionary.doc2bow(act) for act in activities]

In [22]:
n_topics = [7,8,9,10,11,12,13,14,15]

In [25]:
cohs = {}

for n in tqdm(n_topics):
    
    lda_train = gensim.models.ldamulticore.LdaMulticore(
                corpus=corpus,
                id2word=dictionary,
                num_topics = n,
                chunksize=100,
                workers = 3,
                eval_every = 1,
                per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_train, 
                                        texts=activities, 
                                        dictionary=dictionary, 
                                        coherence='c_v')
    
    coherence_lda = coherence_model_lda.get_coherence()
    cohs[n] = coherence_lda
    print(str(n), cohs[n])
    lda_train.save("lda_train_{}.model".format(str(n)))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

7 0.2976986931172914
8 0.30160144965037694
9 0.2956830818619501
10 0.30263046793503945
11 0.3118603297373
12 0.2971247314758559


In [29]:
n_topics = [13,14,15]

In [30]:
for n in tqdm(n_topics):
    
    lda_train = gensim.models.ldamulticore.LdaMulticore(
                corpus=corpus,
                id2word=dictionary,
                num_topics = n,
                chunksize=100,
                workers = 3,
                eval_every = 1,
                per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_train, 
                                        texts=activities, 
                                        dictionary=dictionary, 
                                        coherence='c_v')
    
    coherence_lda = coherence_model_lda.get_coherence()
    cohs[n] = coherence_lda
    print(str(n), cohs[n])
    lda_train.save("lda_train_{}.model".format(str(n)))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

13 0.30693996647968436
14 0.3060249535047446
15 0.2978738042804672


In [59]:
cohs

{7: 0.2976986931172914,
 8: 0.30160144965037694,
 9: 0.2956830818619501,
 10: 0.30263046793503945,
 11: 0.3118603297373,
 12: 0.2971247314758559,
 13: 0.30693996647968436,
 14: 0.3060249535047446,
 15: 0.2978738042804672}

Choose n = 11

In [32]:
lda_train = gensim.models.ldamulticore.LdaMulticore(
                corpus=corpus,
                id2word=dictionary,
                num_topics = 11,
                chunksize=100,
                workers = 3,
                eval_every = 1,
                per_word_topics=True, 
                passes = 50)
    
coherence_model_lda = CoherenceModel(model=lda_train, 
                                        texts=activities, 
                                        dictionary=dictionary, 
                                        coherence='c_v')
    
coherence_model_lda.get_coherence()

0.363537623154878

In [33]:
lda_train.save("lda_final.model")

In [60]:
lda_train.print_topics(11,num_words=15)

[(0,
  '0.252*"Football" + 0.134*"Sports Organization" + 0.070*"Sport" + 0.060*"Martial Arts" + 0.058*"Football Team" + 0.047*"Movies" + 0.036*"Athlete" + 0.028*"Fitness" + 0.027*"TV Channel" + 0.025*"Internet Media" + 0.023*"Show, Program" + 0.020*"Sports Club" + 0.019*"Business" + 0.015*"City Community" + 0.012*"Hockey"'),
 (1,
  '0.626*"Humor" + 0.022*"Youth Organization" + 0.021*"Movies" + 0.020*"Education" + 0.018*"Show, Program" + 0.018*"Science" + 0.016*"Photography" + 0.015*"Animation" + 0.014*"Creative Work" + 0.012*"Open group" + 0.012*"Literature" + 0.011*"Animals" + 0.010*"Discussion Сlub" + 0.009*"Society" + 0.009*"Public page"'),
 (2,
  '0.152*"Education" + 0.108*"Science" + 0.088*"Humor" + 0.071*"Movies" + 0.063*"Tourism, Travel" + 0.045*"Business" + 0.034*"History" + 0.027*"Society" + 0.027*"Philosophy" + 0.025*"Public page" + 0.024*"Photography" + 0.024*"Literature" + 0.021*"Creative Work" + 0.018*"Internet Media" + 0.012*"City Community"'),
 (3,
  '0.134*"Food, Recipe

In [62]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
# pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_train, corpus, dictionary)
pyLDAvis.display(LDAvis_prepared)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [36]:
activity_classes = []
activity_probs = []
for j in range(len(activities)):
    topic_dist = {v:k for k,v in lda_train.get_document_topics(corpus[j], minimum_probability=0.0)}
    max_prob = max(topic_dist.keys())
    max_class = topic_dist[max_prob]
    activity_classes.append(max_class)
    activity_probs.append(max_prob)

In [40]:
len(activity_probs)

13350

In [41]:
topics = lda_train.print_topics(11,num_words=15)
topics_d = {i[0]:i[1] for i in topics}

In [42]:
topics_d

{0: '0.252*"Football" + 0.134*"Sports Organization" + 0.070*"Sport" + 0.060*"Martial Arts" + 0.058*"Football Team" + 0.047*"Movies" + 0.036*"Athlete" + 0.028*"Fitness" + 0.027*"TV Channel" + 0.025*"Internet Media" + 0.023*"Show, Program" + 0.020*"Sports Club" + 0.019*"Business" + 0.015*"City Community" + 0.012*"Hockey"',
 1: '0.626*"Humor" + 0.022*"Youth Organization" + 0.021*"Movies" + 0.020*"Education" + 0.018*"Show, Program" + 0.018*"Science" + 0.016*"Photography" + 0.015*"Animation" + 0.014*"Creative Work" + 0.012*"Open group" + 0.012*"Literature" + 0.011*"Animals" + 0.010*"Discussion Сlub" + 0.009*"Society" + 0.009*"Public page"',
 2: '0.152*"Education" + 0.108*"Science" + 0.088*"Humor" + 0.071*"Movies" + 0.063*"Tourism, Travel" + 0.045*"Business" + 0.034*"History" + 0.027*"Society" + 0.027*"Philosophy" + 0.025*"Public page" + 0.024*"Photography" + 0.024*"Literature" + 0.021*"Creative Work" + 0.018*"Internet Media" + 0.012*"City Community"',
 3: '0.134*"Food, Recipes" + 0.078*"Per

In [43]:
class_descriptions = [topics_d[i] for i in activity_classes]

In [44]:
class_descriptions[0]

'0.196*"Creative Work" + 0.098*"Photography" + 0.053*"Culture" + 0.042*"Literature" + 0.039*"Animals" + 0.037*"Artist" + 0.032*"Design" + 0.028*"Fashion, Clothing, Shoes" + 0.028*"Movies" + 0.025*"Animation" + 0.020*"Education" + 0.018*"Fan Club" + 0.015*"Museum, Gallery, Exhibition" + 0.013*"Cultural Center" + 0.013*"Religion"'

In [45]:
df["class"] = activity_classes
df["class_probs"] = activity_probs
df["class_descr"] = class_descriptions

In [66]:
df.iloc[:, -3:]

Unnamed: 0,class,class_probs,class_descr
1,6,0.404068,"0.196*""Creative Work"" + 0.098*""Photography"" + ..."
2,1,0.342161,"0.626*""Humor"" + 0.022*""Youth Organization"" + 0..."
6,1,0.494793,"0.626*""Humor"" + 0.022*""Youth Organization"" + 0..."
8,6,0.425176,"0.196*""Creative Work"" + 0.098*""Photography"" + ..."
10,7,0.393231,"0.197*""Internet Media"" + 0.099*""City Community..."
16,7,0.417666,"0.197*""Internet Media"" + 0.099*""City Community..."
23,1,0.473378,"0.626*""Humor"" + 0.022*""Youth Organization"" + 0..."
27,9,0.555358,"0.218*""Video Games"" + 0.135*""Games"" + 0.061*""H..."
33,6,0.580152,"0.196*""Creative Work"" + 0.098*""Photography"" + ..."
43,7,0.322368,"0.197*""Internet Media"" + 0.099*""City Community..."


In [56]:
df.groupby(by="class").from_id.count()

class
0      222
1     3766
2     2442
3     1120
4      802
5      850
6     1169
7     1222
8      170
9      832
10     755
Name: from_id, dtype: int64

In [57]:
df.to_csv("capitalization-interests.csv", index=False)