# Function Goal

This Notebook aims to cluster activites of providers


In [23]:
import sys
print(sys.version)
print(sys.executable)

3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
/opt/anaconda3/bin/python


# Imports

In [24]:
# import classic libraries
# ------------------------
import pandas as pd
import sys
import os
import pickle
import re
from importlib import reload
import numpy as np
from sklearn import preprocessing
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import scipy
from scipy.sparse import coo_matrix, hstack

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE as TSNE_scikit
# pd.set_option('max_colwidth', 86)

In [25]:
# import custom library
# ------------------------
import os
current_path = os.path.abspath(os.getcwd())
path_lib = current_path.replace('src','')
path_data = current_path.replace('src','data')
sys.path.append(path_lib)

import lib_ar_src
reload(lib_ar_src)
print(lib_ar_src.__version__)

1.0.0


In [26]:
from lib_ar_src.utils.io_utils import importData
from lib_ar_src.clustering.display import plotClusterExample
from lib_ar_src.clustering.display import printCluster
from lib_ar_src.clustering.display import printClusterDistribution
from lib_ar_src.clustering.display import printClusterHomogeneityDistribution
from lib_ar_src.clustering.display import plotClusters
from lib_ar_src.clustering.display import plot_heatmap_dbscan
from lib_ar_src.clustering.display import plot_homog_hdbscan
from lib_ar_src.clustering.display import print_biggestClusters

# Function definition

In [27]:
def html2text(html):
    """
    Remove html tags
    """
    from bs4 import BeautifulSoup
    try:
        soup = BeautifulSoup(html)
        text = soup.get_text()
    except:
        text = html
        
    return text

def intConverter(x):
    ''' robust cast to int '''
    out = -1
    try:
        out = int(x)
    except:
        print('error nan?')
    return out

def floatConverter(x):
    ''' robust cast to int '''
    out = -1
    try:
        out = float(x)
    except:
        print('error nan?')
    return out


def prepare_data_4_clustering(X, df, dict_weight={}, bool_PCA=False, bool_standardization=False):
    """
    preprocess data for clustering
    - normalization by min max
    - PCE (eventual)
    - standardization (eventual)
    """
    
    from sklearn import preprocessing
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA

    # Normalisation - scaling
    # -----------------------
    scaler = preprocessing.MinMaxScaler().fit(X) # nb with_mean=False obligatoire pour sparse matrix
    Xnorm_raw = scaler.transform(X)
    plt.figure(figsize=(10,10))
    sns.heatmap(Xnorm_raw)
    plt.title('data after scaling')

    # Weighting
    # --------
    list_weights = [] 
    for col in df: # for each column
        default_weight = 1
        for key in dict_weight: # test if belonging to a category
            if (col.find(key) != -1): # if yes
                default_weight = dict_weight[key] # retrieve the appropriate weight
        list_weights.append(default_weight) # append the weight

    Xnorm_weighted = Xnorm_raw
    for col in range(Xnorm_weighted.shape[1]):
        Xnorm_weighted[:,col] = list_weights[col]*Xnorm_weighted[:,col]
    plt.figure(figsize=(10,10))
    sns.heatmap(Xnorm_weighted)
    plt.title('data after weighting')

    # PCA (to keep size under control)
    # ---------------------------------
    if bool_PCA:
        pca = PCA(n_components=100)
        Xnorm = pca.fit_transform(Xnorm_weighted) # conversion from sparse to array is necessary
        print('total explained variance of PCA is : ', sum(pca.explained_variance_ratio_))
        plt.figure(figsize=(10,10))
        sns.heatmap(abs(Xnorm))
        plt.title('data after PCA')
    else:
        Xnorm = Xnorm_weighted


    # Data standardization (eventual)
    # ------------------------------
    if bool_standardization:
        scaler = preprocessing.StandardScaler().fit(Xnorm ) # nb with_mean=False obligatoire pour sparse matrix
        X_total_scaled = scaler.transform(Xnorm)

        plt.figure(figsize=(10,10))
        sns.heatmap(X_total_scaled)
    else:
        X_total_scaled = Xnorm

    # Data used for fitting
    # ---------------------
    x_data = X_total_scaled
    
    return x_data

def create_corpus_from_text_columns(df, col_list):
    """ create a corpus for BAG of word, from specified columns of a df """
    corpus = ''
    for ir in range(df.shape[0]):
        for col in col_list:
            elem = df.loc[ir,col]
            corpus += ' '
            corpus += str(elem)

    # corpus_cleaned
    from lib_ar_src.utils.text_utils import clean_text
    corpus_cleaned = clean_text(corpus)
    corpus_cleaned = re.sub('[0-9]', '', corpus_cleaned)

    return corpus_cleaned

def create_bow(corpus, maxF=100):
    '''
    Create bow model
    '''
    from sklearn.feature_extraction.text import CountVectorizer 
    
    try: # create a list of french and english stopwords
        from nltk.corpus import stopwords    
        final_stopwords_list = stopwords.words('english') \
        + stopwords.words('french') \
        + stopwords.words('german') \
        + stopwords.words('italian')
        
    except:  # if first time, download stopwords list
        import nltk
        nltk.download('stopwords')
        from nltk.corpus import stopwords    
        final_stopwords_list = stopwords.words('english') \
        + stopwords.words('french') \
        + stopwords.words('german') \
        + stopwords.words('italian') 
        
    # create the model
    vectorizer = CountVectorizer(
        max_features=maxF,
        analyzer = 'word',
        stop_words=final_stopwords_list
    )
    bow = vectorizer.fit_transform(corpus)

    return bow, vectorizer


def enrich_with_bow(df, column_name, vectorizer):
    '''
    Enrich a dataframe with bow model applied on a column
    Adds new columns with counts of most common words
    '''
    df_enriched = df.copy()    
    df_bow_mat = vectorizer.transform(df[column_name])
    feature_names = vectorizer.get_feature_names()
    feature_names_formatted = ['bow_'+column_name+'_'+f for f in feature_names]
    df_bow = pd.DataFrame(df_bow_mat.toarray(), columns=feature_names_formatted)   
    df_enriched = pd.concat([df_enriched, df_bow], axis=1)
   
    return df_enriched


def replace_cols_with_bow(df, list_cols_bow, maxF=50):
    """" Replace specified columns by bow  """
    
    df_bow = df.copy()
    
    # force string format
    for col in list_cols_bow:
        df_bow[col] = df_bow[col].apply(lambda x : str(x))
    
    # create a corpus to train BOW    
    corpus_cleaned = create_corpus_from_text_columns(df_bow, list_cols_bow)

    # create bow features
    bow, vectorizer = create_bow([corpus_cleaned], maxF=maxF)
    print('vocabulary used for bow is : ', vectorizer.vocabulary_)

    # add bow features
    for col in list_cols_bow:
        df_bow = enrich_with_bow(df_bow, col, vectorizer)

    # remove textual columns
    from lib_ar_src.utils.df_utils import drop_columns
    df_bow = drop_columns(df_bow, list_cols_bow)
    
    return df_bow

In [28]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }   div.cell.selected {border-left-width: 1px !important;} </style>"))

# Import data

In [29]:
!ls ../data

dates.json
events activities.json
extract BK data per location activities.xlsx
extract BK data per location vendors.xlsx
extract BK data per location.xlsx
products.json
vendors.json


In [30]:
# fake or incomplete data
# -----------------------
# df_fake_dates_raw   = importData(os.path.join(path_data, 'dates.json'), 10000)
# df_fake_events_raw   = importData(os.path.join(path_data, 'events activities.json'), 10000)
# df_fake_products_raw   = importData(os.path.join(path_data, 'products.json'), 10000)
# df_fake_vendors_raw   = importData(os.path.join(path_data, 'vendors.json'), 10000)

# complete data (coming from firestore)
# -------------------------------------
df_activities_raw   = importData(os.path.join(path_data, 'extract BK data per location activities.xlsx'), 10000)
df_vendors_raw   = importData(os.path.join(path_data, 'extract BK data per location vendors.xlsx'), 10000)


number of samples : 2354


Unnamed: 0,id,active,root_event_id,title,description,type,duration,pre_time,max_participants,location,...,images/42,images/43,images/44,tags/3/values/2,tags/3/values/3,country_of_residence,address_pos,city_pos,country_pos,city
0,f9d00ddbf493f6ff340cd77d37660f7a,1,,BRAVO SPORT Fußballcamps - SV 1922 Zwiesel 28....,"<p>Zu cool, um wahr zu sein? Von wegen! Die BR...",BOOKING,720.0,0.0,20,"Rotwaldsiedlung 13, 94227 Zwiesel, Deutschland",...,,,,,,,19.0,34.0,,94227 Zwiesel
1,691d5a6b83ab3aff8561ba691621b97a,1,,ONLINE ESCAPE GAME: AUSGANGSSPERRE,<p><strong>Wir bringen Euch den R&auml;tselspa...,BOOKING,120.0,0.0,1000,@HOME,...,,,,,,,,,,@HOME


number of samples : 191


Unnamed: 0,id,name,url,service_email,service_phone,image,terms,bill_country,country_of_residence,activation_date
0,4379934535e774c3a53b956677f92c0e,Azienda Agricola Bocale di Valentini,https://www.bocale.wine/,info@bocale.it,390742399233,https://cdn.bookingkit.de/vendor_images/f7a2b2...,<p>In merito alla nostra politica di cancellaz...,IT,IT,2021-01-11T14:55:37Z
1,fe595f13d6bc9618c45f6c3dfc73fd71,Stadtführungen Kassel,https://www.stadtfuehrungen-kassel.de,panetta-moeller@t-online.de,4956167994,https://cdn.bookingkit.de/vendor_images/934d57...,<p><br />Allgemeine Gesch&auml;ftsbedingungen<...,DE,DE,2021-01-11T14:55:36Z


# Clean data

In [31]:
# for col in df_activities_raw:
#     print(col)

In [32]:
df_activities_clean = df_activities_raw.copy()
df_vendors_clean =  df_vendors_raw.copy()

# clean html into text for activities
html_columns = ['bring', 'advice', 'hint', 'participant_hint', 'title', 'terms', 'description']
# 
for col in html_columns:
    if col in df_activities_clean:
        df_activities_clean[col] = df_activities_clean[col].apply(lambda x : html2text(x))
    if col in df_vendors_clean:
        df_vendors_clean[col] = df_vendors_clean[col].apply(lambda x : html2text(x))

# Merge data

In [33]:
# Display available columns
# -------------------------
# for col in df_activities_raw:
#     print(col)

In [34]:
# merge on vendor id
df_merge = pd.merge(df_activities_clean, df_vendors_clean, left_on=['vendor_id'], right_on = 'id', how='left')
print(df_merge.shape[0])
df_merge = df_merge.fillna(-1)
df_merge=df_merge[df_merge["country_of_residence_y"]=='DE']
df_merge

2354


Unnamed: 0,id_x,active,root_event_id,title,description,type,duration,pre_time,max_participants,location,...,id_y,name,url,service_email,service_phone,image,terms,bill_country,country_of_residence_y,activation_date
0,f9d00ddbf493f6ff340cd77d37660f7a,1,-1,BRAVO SPORT Fußballcamps - SV 1922 Zwiesel 28....,"Zu cool, um wahr zu sein? Von wegen! Die BRAVO...",BOOKING,720.0,0.0,20,"Rotwaldsiedlung 13, 94227 Zwiesel, Deutschland",...,3b8bcc9344179bdfff454eaadbf03684,erlebnis-buchen.com,erlebnis-buchen.com,noreply@bookingkit.de,030609850332,https://cdn.bookingkit.de/vendor_images/edc56c...,<p>AGB der Demo Inc.</p>,DE,DE,2021-01-06T12:28:50Z
1,691d5a6b83ab3aff8561ba691621b97a,1,-1,ONLINE ESCAPE GAME: AUSGANGSSPERRE,Wir bringen Euch den Rätselspaß nach Hause!\n ...,BOOKING,120.0,0.0,1000,@HOME,...,8c5f7b595bd74eeb915ffa1ae647463a,Dresden Secrets - Escape Games & iPad Rallyes,www.dresden-secrets.de,info@cbikes.de,+4935165318888,https://cdn.bookingkit.de/vendor_images/27b7dc...,"1. Vertragsinhalt Dresden Secrets, Markus Brac...",DE,DE,2021-01-11T14:54:51Z
2,729a82be1e45148c699794fdffefe954,1,-1,ONLINE ESCAPE GAME: LOST CHRISTMAS,Wir bringen Euch den Rätselspaß nach Hause!\nK...,BOOKING,120.0,0.0,1000,@HOME,...,8c5f7b595bd74eeb915ffa1ae647463a,Dresden Secrets - Escape Games & iPad Rallyes,www.dresden-secrets.de,info@cbikes.de,+4935165318888,https://cdn.bookingkit.de/vendor_images/27b7dc...,"1. Vertragsinhalt Dresden Secrets, Markus Brac...",DE,DE,2021-01-11T14:54:51Z
3,c4488666ffceb5de00ec524a45678725,1,-1,Gutschein,"Ob zum Geburtstag, zu Weihnachten, zur Hochzei...",BOOKING,0.0,0.0,1,"Theodor-Heuss-Straße 32, 70174 Stuttgart, Deut...",...,c04d4dcf8618d01cb3ad1c01d5d34e7b,ExitGames Stuttgart,www.exitgames-stuttgart.de,info@exitgames-stuttgart.de,(+49) 0711/18424240,https://cdn.bookingkit.de/vendor_images/bf573f...,AGB,DE,DE,2021-01-11T14:54:25Z
4,98a16d521d47c2f8f09908610fcabd6f,1,-1,ONLINE ESCAPE GAME: DIAMANTENFIEBER,Wir bringen Euch den Rätselspaß nach Hause!\nK...,BOOKING,120.0,0.0,1000,@HOME,...,8c5f7b595bd74eeb915ffa1ae647463a,Dresden Secrets - Escape Games & iPad Rallyes,www.dresden-secrets.de,info@cbikes.de,+4935165318888,https://cdn.bookingkit.de/vendor_images/27b7dc...,"1. Vertragsinhalt Dresden Secrets, Markus Brac...",DE,DE,2021-01-11T14:54:51Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2330,8335bdc9973ef26681923f632b1b171c,1,-1,Mentoren Training,Lassen Sie sich zum Mentoren Spezialisten ausb...,BOOKING,4320.0,15.0,5,"Hof, Deutschland",...,4cb7bc2b927ffc1018c491dfc34071e7,Prealize GmbH,www.prealize.de,cornelia.jeschek@prealize.de,+491715320755,https://cdn.bookingkit.de/vendor_images/b8543b...,Allgemeine Geschäftsbedingungen (AGB) \nfür On...,DE,DE,2021-01-11T14:55:36Z
2331,55290f6cecd9129b47d15f57c0cc28e2,1,-1,Live Seminar zum Thema - Erfolgreicher Vertrie...,In 4 ausgewählten und erfolgsorientierten Modu...,BOOKING,180.0,600.0,10,"Berlin, Deutschland",...,4cb7bc2b927ffc1018c491dfc34071e7,Prealize GmbH,www.prealize.de,cornelia.jeschek@prealize.de,+491715320755,https://cdn.bookingkit.de/vendor_images/b8543b...,Allgemeine Geschäftsbedingungen (AGB) \nfür On...,DE,DE,2021-01-11T14:55:36Z
2332,596beda6badc3e62973bfa2f762c827c,1,-1,Live Seminar zum Thema - Erfolgreicher Vertrie...,In 4 ausgewählten und erfolgsorientierten Modu...,BOOKING,180.0,600.0,10,"Berlin, Deutschland",...,4cb7bc2b927ffc1018c491dfc34071e7,Prealize GmbH,www.prealize.de,cornelia.jeschek@prealize.de,+491715320755,https://cdn.bookingkit.de/vendor_images/b8543b...,Allgemeine Geschäftsbedingungen (AGB) \nfür On...,DE,DE,2021-01-11T14:55:36Z
2333,858e3a3137fc7b4ad1fa967fafd8dd2e,1,-1,Live Seminar zum Thema - Erfolgreicher Vertrie...,In 4 ausgewählten und erfolgsorientierten Modu...,BOOKING,180.0,600.0,10,"Berlin, Deutschland",...,4cb7bc2b927ffc1018c491dfc34071e7,Prealize GmbH,www.prealize.de,cornelia.jeschek@prealize.de,+491715320755,https://cdn.bookingkit.de/vendor_images/b8543b...,Allgemeine Geschäftsbedingungen (AGB) \nfür On...,DE,DE,2021-01-11T14:55:36Z


# Remarks and insights
- will all those fiels be available with other datasets
- duplicates event id? or because of merge?
- what is pretime?

Features useful :
- title
- location (adress, lon, lat, meeting location)
- adress pos / city pos / country pos
- name or url or service email or phone
- description
- vendor id
- vendor name
- categories/i/title or id

maybe useful
- duration
- bring
- advice
- hint
- participant hint
- highlights/0
- prices




# Encoding

In [35]:
df_merge['description']

0       Zu cool, um wahr zu sein? Von wegen! Die BRAVO...
1       Wir bringen Euch den Rätselspaß nach Hause!\n ...
2       Wir bringen Euch den Rätselspaß nach Hause!\nK...
3       Ob zum Geburtstag, zu Weihnachten, zur Hochzei...
4       Wir bringen Euch den Rätselspaß nach Hause!\nK...
                              ...                        
2330    Lassen Sie sich zum Mentoren Spezialisten ausb...
2331    In 4 ausgewählten und erfolgsorientierten Modu...
2332    In 4 ausgewählten und erfolgsorientierten Modu...
2333    In 4 ausgewählten und erfolgsorientierten Modu...
2334    In 4 ausgewählten und erfolgsorientierten Modu...
Name: description, Length: 1638, dtype: object

In [36]:
# Load the regular expression library
import re

# Remove punctuation
df_merge['description_processed'] = \
df_merge['description'].map(lambda x: re.sub('[,\.!?\n]', '', str(x)))

# Convert the titles to lowercase
df_merge['description_processed'] = \
df_merge['description_processed'].map(lambda x: x.lower())

# Print out the first rows of papers
df_merge['description_processed'].head()

0    zu cool um wahr zu sein von wegen die bravo sp...
1    wir bringen euch den rätselspaß nach hause kli...
2    wir bringen euch den rätselspaß nach hauseklic...
3    ob zum geburtstag zu weihnachten zur hochzeit ...
4    wir bringen euch den rätselspaß nach hauseklic...
Name: description_processed, dtype: object

In [37]:
import gensim
from gensim.utils import simple_preprocess
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
df_merge = df_merge.description_processed.values.tolist()
data_words = list(sent_to_words(df_merge))
print(data_words[:1][0][:30])

['zu', 'cool', 'um', 'wahr', 'zu', 'sein', 'von', 'wegen', 'die', 'bravo', 'sport', 'fußballcamps', 'bieten', 'alles', 'wir', 'sind', 'per', 'du', 'mit', 'den', 'superstars', 'des', 'fußballs', 'hochste', 'zeit', 'unser', 'know', 'how', 'weiterzugeben', 'trainieren']


In [38]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

### Remove Stopwords, Make Bigrams and Lemmatize


In [39]:
# NLTK Stop words
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('german')
#stop_words.extend(['from', 'subject', 're', 'edu', 'use','zu','und','kurs','tour','gut','mögen','uber','kommen','heute','grundlagen',
#                   'konnen','nehmen','informationen','direkt','moglichkeit','verschieden','erhalten','hand','perfekt','immer','lernen'])
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [40]:
import spacy
# Remove Stop Words
import de_core_news_md
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("de_core_news_md", disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])


[['cool', 'wahren', 'bieten', 'hochste', 'zeit', 'weitergeben', 'trainieren', 'profis', 'lauten', 'motto', 'naturlich', 'geben', 'ja', 'schließlich', 'spaß', 'verbinden', 'mega', 'perfekte', 'morgen', 'madchen', 'altern', 'jahren']]


In [41]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]]


### Base lda model 

In [42]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [43]:
from pprint import pprint
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.012*"massage" + 0.006*"hotel" + 0.006*"erlesen" + 0.006*"mittagessen" + '
  '0.006*"fleisch" + 0.006*"abend" + 0.005*"rucken" + 0.005*"zubereiten" + '
  '0.005*"fisch" + 0.005*"frisch"'),
 (1,
  '0.012*"mitarbeitend" + 0.009*"mitarbeiter_innen" + 0.007*"kommen" + '
  '0.007*"spiel" + 0.006*"erhalten" + 0.006*"gutschein" + 0.006*"vision" + '
  '0.005*"ziel" + 0.005*"kunden" + 0.004*"freude"'),
 (2,
  '0.006*"arabisch" + 0.006*"beruhmten" + 0.006*"kommen" + 0.005*"immer" + '
  '0.005*"welt" + 0.005*"zeigen" + 0.004*"zeit" + 0.004*"bord" + 0.004*"groß" '
  '+ 0.004*"abend"'),
 (3,
  '0.011*"burj_al" + 0.011*"uber" + 0.010*"gehen" + 0.010*"arab" + '
  '0.007*"dubai" + 0.007*"genießen" + 0.007*"tour" + 0.007*"geben" + '
  '0.006*"aussicht" + 0.006*"fur"'),
 (4,
  '0.017*"kurs" + 0.009*"sonnenuntergang" + 0.008*"nehmen" + 0.007*"bestehen" '
  '+ 0.007*"sonderrabatt" + 0.007*"moglichkeit" + 0.007*"shop_einzukaufen" + '
  '0.006*"erhalten" + 0.005*"torte" + 0.005*"hause_hierfur"'),
 

What is topic coherence?
Topic Coherence measures score a single topic by measuring the degree of semantic similarity between high scoring words in the topic. These measurements help distinguish between topics that are semantically interpretable topics and topics that are artifacts of statistical inference. But

In [44]:
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4205471150684338


In [49]:
# supporting function
def compute_coherence_values(corpus, dictionary, k):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [50]:
import numpy as np
import tqdm
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 10
max_topics = 41
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Coherence': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=30)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # get the coherence score for the given parameters
            cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k)
         # Save the model results
            model_results['Validation_Set'].append(corpus_title[i])
            model_results['Topics'].append(k)
            model_results['Coherence'].append(cv)
                    
            pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()



  0%|          | 0/30 [00:46<?, ?it/s][A[A
  0%|          | 0/30 [00:26<?, ?it/s]


  3%|▎         | 1/30 [00:09<04:38,  9.60s/it][A[A

  7%|▋         | 2/30 [00:17<04:16,  9.17s/it][A[A

 10%|█         | 3/30 [00:28<04:17,  9.54s/it][A[A

 13%|█▎        | 4/30 [00:37<04:06,  9.50s/it][A[A

 17%|█▋        | 5/30 [00:45<03:48,  9.15s/it][A[A

 20%|██        | 6/30 [00:54<03:33,  8.89s/it][A[A

 23%|██▎       | 7/30 [01:02<03:20,  8.70s/it][A[A

 27%|██▋       | 8/30 [01:10<03:09,  8.60s/it][A[A

 30%|███       | 9/30 [01:19<02:59,  8.56s/it][A[A

 33%|███▎      | 10/30 [01:27<02:51,  8.55s/it][A[A

 37%|███▋      | 11/30 [01:36<02:44,  8.66s/it][A[A

 40%|████      | 12/30 [01:45<02:37,  8.77s/it][A[A

 43%|████▎     | 13/30 [01:54<02:30,  8.84s/it][A[A

 47%|████▋     | 14/30 [02:03<02:22,  8.93s/it][A[A

 50%|█████     | 15/30 [02:13<02:15,  9.05s/it][A[A

 53%|█████▎    | 16/30 [02:22<02:08,  9.19s/it][A[A

 57%|█████▋    | 17/30 [02:32<02:01,  9.3

KeyboardInterrupt: 

In [51]:
res = pd.DataFrame(model_results)
res

Unnamed: 0,Validation_Set,Topics,Coherence
0,75% Corpus,10,0.410487
1,75% Corpus,11,0.463718
2,75% Corpus,12,0.377786
3,75% Corpus,13,0.470641
4,75% Corpus,14,0.442653
5,75% Corpus,15,0.438493
6,75% Corpus,16,0.435545
7,75% Corpus,17,0.459359
8,75% Corpus,18,0.508864
9,75% Corpus,19,0.470359


In [52]:
res[res['Coherence']==res['Coherence'].max()]

Unnamed: 0,Validation_Set,Topics,Coherence
8,75% Corpus,18,0.508864


In [53]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=18, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.01,
                                           eta=0.91)

### Let's train the lda with optimal params

In [54]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [None]:
stop words cities 1st category>

Choisir chaque category dyna et ensuite lda 

