In [196]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import pyLDAvis.gensim_models
import en_core_web_md
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

sns.set()
pyLDAvis.enable_notebook()# Visualise inside a notebook

In [197]:
# Language code of the articles to model. It selects the rows kept after
# language detection, the spaCy pipeline, and the output folder/file names.
lang = "en"

In [198]:
# Load the combined input data (the preview below shows `person` and `content` columns).
df_comb = pd.read_excel("../files/input/combined_data.xlsx")

In [199]:
# Preview the first rows of the loaded data.
df_comb.head()

Unnamed: 0,person,content
0,George Clooney,"George Timothy Clooney (born May 6, 1961) is a..."
1,Shah Rukh Khan,Shah Rukh Khan (pronounced [ˈʃɑːɦɾʊx xɑːn]; bo...
2,Leonardo DiCaprio,Leonardo Wilhelm DiCaprio (; Italian: [diˈkaːp...
3,Will Smith,"Willard Carroll Smith II (born September 25, ..."
4,Kamal Haasan,Kamal Haasan (born 7 November 1954) is an Indi...


In [200]:
## Detect the language of each article's text with the langdetect package

In [201]:
from langdetect import detect, DetectorFactory
# Fix langdetect's seed so that repeated runs give the same labels.
DetectorFactory.seed = 0

# Store the language code detected from each article's text in `actual_lang`.
df_comb["actual_lang"] = df_comb["content"].apply(detect)

In [202]:
## Sanity check: among the rows detected as English, list the rows whose
## `person` value already appeared in an earlier row.
## NOTE(review): in the saved output below, both listed rows contain Italian
## text labelled "en" - presumably langdetect misclassifications; confirm.

a = df_comb[df_comb["actual_lang"]=="en"]
a[a.duplicated('person')]

Unnamed: 0,person,content,actual_lang
55,Kamal Haasan,"Kamal Haasan (Ramanathapuram, 7 novembre 1954)...",en
78,James Patterson,"James Patterson (Newburgh, 22 marzo 1947) è un...",en


In [203]:
# Number of articles per detected language.
df_comb.actual_lang.value_counts()

en    53
ru    51
it    49
Name: actual_lang, dtype: int64

In [204]:
## Keep only the rows whose detected language matches `lang`

In [205]:
# Rows whose detected language equals `lang`, re-indexed from 0 so that later
# positional joins (tokens, dominant topics) line up with these rows.
df = df_comb[df_comb["actual_lang"] == str(lang)].reset_index(drop=True)

In [206]:
# Preview the language-filtered rows.
df.head()

Unnamed: 0,person,content,actual_lang
0,George Clooney,"George Timothy Clooney (born May 6, 1961) is a...",en
1,Shah Rukh Khan,Shah Rukh Khan (pronounced [ˈʃɑːɦɾʊx xɑːn]; bo...,en
2,Leonardo DiCaprio,Leonardo Wilhelm DiCaprio (; Italian: [diˈkaːp...,en
3,Will Smith,"Willard Carroll Smith II (born September 25, ...",en
4,Kamal Haasan,Kamal Haasan (born 7 November 1954) is an Indi...,en


In [None]:
import warnings
# NOTE: this silences every warning for the rest of the session, not only the
# ones raised while loading the spaCy pipelines.
warnings.filterwarnings('ignore')

# spaCy pipeline package to use for each supported language code.
SPACY_MODEL_NAMES = {"en": "en_core_web_md", "it": "it_core_news_md", "ru": "ru_core_news_md"}


class _LazyModels(dict):
    """Dict of language code -> loaded spaCy pipeline, filled on first lookup.

    Loading all three pipelines up front costs time and memory and fails when
    an unused pipeline is not installed, so a pipeline is only loaded the
    first time `lang_models[code]` is requested and is cached afterwards.
    An unsupported language code raises KeyError.
    """

    def __missing__(self, key):
        model_name = SPACY_MODEL_NAMES[key]  # KeyError for unsupported codes
        self[key] = spacy.load(model_name)
        return self[key]


lang_models = _LazyModels()

In [None]:
# spaCy pipeline matching the selected language
nlp = lang_models[str(lang)]
# Part-of-speech tags whose tokens are discarded
removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM']
# One list of lower-cased lemmas per article: only alphabetic, non-stop-word
# tokens whose part of speech is not in `removal` are kept.
tokens = [
    [
        word.lemma_.lower()
        for word in parsed_doc
        if word.is_alpha and not word.is_stop and word.pos_ not in removal
    ]
    for parsed_doc in nlp.pipe(df['content'])
]

In [None]:
# Attach the token lists to the articles (same order as df['content']) and
# show the first three as a quick check.
df['tokens'] = tokens
df['tokens'][:3]

In [None]:
# Build a gensim Dictionary from the token lists; it maps each distinct token
# to a unique integer id.
dictionary = Dictionary(df['tokens'])

In [None]:
# Print the complete token -> id mapping (one entry per vocabulary token, so
# the output can be very long).
print(dictionary.token2id)

In [None]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 5
# documents or in more than 50% of them, then keep at most the 1000 most
# frequent remaining tokens.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)

In [None]:
# Bag-of-words representation of every article, built with the pruned dictionary.
corpus = [dictionary.doc2bow(doc) for doc in df['tokens']]

In [None]:
# lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=10, workers = 4, passes=10)

In [None]:
# Train one LDA model per candidate topic count (1..9) and record its c_v
# coherence score; `topics[k]` and `score[k]` describe the same model.
topics = []
score = []
for i in range(1,10,1):
    print("Running iteration number " + str(i))
    # Fixed random_state so every candidate is trained from the same seed.
    lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=10, num_topics=i, workers = 4, passes=10, random_state=100)
    cm = CoherenceModel(model=lda_model, texts = df['tokens'], corpus=corpus, dictionary=dictionary, coherence='c_v')
    topics.append(i)
    score.append(cm.get_coherence())

# Coherence score as a function of the number of topics.
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.plot(topics, score)
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Coherence Score')
ax.set_title('Coherence Score Analysis for ' + str(lang))
fig.patch.set_facecolor('white')

# Create the per-language output folder if needed, so the training above is
# not wasted on a FileNotFoundError, and save before showing the figure.
output_dir = "../files/output/" + str(lang)
os.makedirs(output_dir, exist_ok=True)
fig.savefig(output_dir + "/coherence_score_" + str(lang) + ".png", bbox_inches='tight')
plt.show()

In [None]:
# Topic count with the highest coherence score (the first one if several tie).
num_topics = topics[max(range(len(score)), key=score.__getitem__)]
num_topics

In [189]:
# Report the selected number of topics for the current language.
print(f"Number of topics with highest coherence score for {lang} model is: {num_topics}")

Number of topics with highest coherence score for it model is: 6


In [190]:
# Final model: trained with the selected topic count and with more iterations
# and passes than the models used in the coherence search. It uses the same
# random_state as that search so that re-running the notebook starts from the
# same seed instead of producing a different set of topics each time.
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=100, num_topics=num_topics, workers = 4, passes=100, random_state=100)

In [191]:
# Show the top weighted words of every topic (-1 requests all topics).
lda_model.print_topics(-1)

[(0,
  '0.054*"album" + 0.044*"singolo" + 0.023*"brano" + 0.022*"canzone" + 0.021*"artista" + 0.020*"cantante" + 0.017*"video" + 0.015*"billboard" + 0.015*"classifica" + 0.013*"you"'),
 (1,
  '0.057*"regia" + 0.044*"attore" + 0.020*"interpretare" + 0.019*"miglior" + 0.016*"oscar" + 0.015*"tv" + 0.014*"recitare" + 0.012*"candidatura" + 0.012*"regista" + 0.011*"dirigere"'),
 (2,
  '0.064*"romanzo" + 0.048*"libro" + 0.029*"scrittore" + 0.024*"brown" + 0.024*"autore" + 0.023*"opera" + 0.019*"isbn" + 0.013*"raccolta" + 0.013*"britannico" + 0.012*"editore"'),
 (3,
  '0.018*"clinton" + 0.017*"obama" + 0.013*"partito" + 0.010*"elezione" + 0.010*"politico" + 0.008*"democratico" + 0.008*"campagna" + 0.007*"bill" + 0.007*"legge" + 0.007*"società"'),
 (4,
  '0.045*"finale" + 0.030*"stagione" + 0.022*"vittoria" + 0.021*"partita" + 0.021*"oro" + 0.021*"record" + 0.020*"m" + 0.020*"battere" + 0.019*"open" + 0.015*"set"'),
 (5,
  '0.064*"regia" + 0.037*"attore" + 0.023*"rock" + 0.019*"candidatura" + 0

In [192]:
# sort_topics=False keeps the topics in the same order as the gensim model.
# Note that gensim numbers topics from 0 while pyLDAvis numbers them from 1.
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary, sort_topics=False)

# Create the per-language output folder if it does not exist yet, so the
# export does not fail with FileNotFoundError.
output_dir = "../files/output/" + str(lang)
os.makedirs(output_dir, exist_ok=True)
pyLDAvis.save_html(lda_display, output_dir + "/lda_" + str(lang) + ".html")

In [193]:
# Preview the articles together with their token lists.
df.head()

Unnamed: 0,person,content,actual_lang,tokens
0,George Clooney,"George Timothy Clooney (Lexington, 6 maggio 19...",it,"[george, timothy, clooney, lexington, maggio, ..."
1,Shah Rukh Khan,"Shah Rukh Khan, all'anagrafe Shahrukh Khan, co...",it,"[shah, rukh, khan, anagrafe, shahrukh, khan, c..."
2,Leonardo DiCaprio,"Leonardo Wilhelm DiCaprio (Los Angeles, 11 nov...",it,"[leonardo, wilhelm, dicaprio, los, angeles, no..."
3,Will Smith,"Will Smith, all'anagrafe Willard Carroll Smith...",it,"[will, smith, anagrafe, willard, carroll, smit..."
4,Tom Cruise,"Thomas Cruise Mapother IV, noto semplicemente ...",it,"[thomas, cruise, mapother, iv, noto, tom, crui..."


In [194]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=df['tokens']):
    """Return the dominant topic of every document in `corpus`.

    Parameters
    ----------
    ldamodel : trained LDA model
        Must support `ldamodel[corpus]`, `show_topic(topic_id)` and expose
        `per_word_topics`.
    corpus : iterable of bag-of-words documents
        Documents to score. The default is the module-level `corpus` as it
        was when this function was defined.
    texts : unused
        Kept only so existing calls that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        One row per scored document, in corpus order, with columns
        'Dominant_Topic' (int topic id), 'Perc_Contribution' (topic weight
        rounded to 4 decimals) and 'Topic_Keywords' (comma-separated top
        words of that topic). Documents for which the model returns no topic
        are skipped, so the frame can then be shorter than the corpus. An
        empty corpus yields an empty frame with the three columns.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first element
        # is the document-level topic distribution.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if len(row) == 0:
            continue
        # Highest-weight topic; max() keeps the first one on ties, which is
        # what a stable descending sort followed by taking element 0 gives.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        # Convert to a Python float before rounding so the 4-decimal value is
        # stored as such even if the model returns a narrower float type.
        records.append((int(topic_num), round(float(prop_topic), 4), topic_keywords))

    return pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])


# Dominant topic of every article according to the final model
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=df['tokens'])

# Relabel the three topic columns for the report ...
topic_columns = df_topic_sents_keywords.reset_index(drop=True)
topic_columns.columns = ['Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords']
# ... and place them next to the article rows (joined on the 0..n-1 index)
df_dominant_topic = pd.concat([df, topic_columns], axis=1)
df_dominant_topic

Unnamed: 0,person,content,actual_lang,tokens,Dominant_Topic,Topic_Perc_Contrib,Keywords
0,George Clooney,"George Timothy Clooney (Lexington, 6 maggio 19...",it,"[george, timothy, clooney, lexington, maggio, ...",1,0.9992,"regia, attore, interpretare, miglior, oscar, t..."
1,Shah Rukh Khan,"Shah Rukh Khan, all'anagrafe Shahrukh Khan, co...",it,"[shah, rukh, khan, anagrafe, shahrukh, khan, c...",5,0.9974,"regia, attore, rock, candidatura, it, king, mi..."
2,Leonardo DiCaprio,"Leonardo Wilhelm DiCaprio (Los Angeles, 11 nov...",it,"[leonardo, wilhelm, dicaprio, los, angeles, no...",1,0.7735,"regia, attore, interpretare, miglior, oscar, t..."
3,Will Smith,"Will Smith, all'anagrafe Willard Carroll Smith...",it,"[will, smith, anagrafe, willard, carroll, smit...",5,0.8898,"regia, attore, rock, candidatura, it, king, mi..."
4,Tom Cruise,"Thomas Cruise Mapother IV, noto semplicemente ...",it,"[thomas, cruise, mapother, iv, noto, tom, crui...",5,0.6583,"regia, attore, rock, candidatura, it, king, mi..."
5,Dwayne Johnson,"Dwayne Douglas Johnson, conosciuto anche come ...",it,"[dwayne, douglas, johnson, conoscere, rock, ha...",5,0.6267,"regia, attore, rock, candidatura, it, king, mi..."
6,Brad Pitt,"William Bradley Pitt, detto Brad (Shawnee, 18 ...",it,"[william, bradley, pitt, brad, shawnee, dicemb...",1,0.999,"regia, attore, interpretare, miglior, oscar, t..."
7,Johnny Depp,"Johnny Depp, all'anagrafe John Christopher Dep...",it,"[johnny, depp, anagrafe, john, christopher, de...",1,0.4817,"regia, attore, interpretare, miglior, oscar, t..."
8,Morgan Freeman,"Morgan Freeman (Memphis, 1º giugno 1937) è un ...",it,"[morgan, freeman, memphis, giugno, attore, pro...",5,0.9828,"regia, attore, rock, candidatura, it, king, mi..."
9,Ed Sheeran,"Edward Christopher Sheeran (Halifax, 17 febbra...",it,"[edward, christopher, sheeran, halifax, febbra...",0,0.8005,"album, singolo, brano, canzone, artista, canta..."


In [195]:
# Write the dominant-topic table to an Excel workbook in the per-language
# output folder, creating that folder if it does not exist yet.
output_dir = '../files/output/' + str(lang)
os.makedirs(output_dir, exist_ok=True)

# The context manager saves and closes the workbook even if writing fails.
with pd.ExcelWriter(output_dir + '/topic_modelling_output_' + str(lang) + '.xlsx', engine='xlsxwriter') as writer:
    workbook = writer.book
    # Font size of the workbook's first (default) cell format.
    workbook.formats[0].set_font_size(12)
    df_dominant_topic.to_excel(writer, sheet_name = 'Output', index=False)

    # Give every column the same fixed width.
    column_width = 20
    worksheet = writer.sheets['Output']
    for idx in range(len(df_dominant_topic.columns)):
        worksheet.set_column(idx, idx, column_width)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
