In [1]:
import os

INPUT_DIR = "../data/nat-ai/text/"
filenames = os.listdir(INPUT_DIR)

texts = []  # list of documents
for filename in filenames:
    with open(INPUT_DIR+filename) as f:
        texts.append(f.read())  # read the whole document

len(texts)

19

In [2]:
from stopwordsiso import stopwords

stopwords = stopwords("en")

more_stopwords = {
    'affect', 'change', 'doe', 'greeting', 'ha', 'herse', 'himse', 'itse', 'key', 'le', 'myse', 'regard', 'result', 'sha', 'ti', 'wa', 'wo', 'word'
}
stopwords = stopwords.union(more_stopwords)

must_include_words = {
    'ai', 'eu'
}
stopwords = stopwords - must_include_words

len(stopwords)

1315

In [3]:
"ai" in stopwords

False

In [4]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/arthit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/arthit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/arthit/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer

compounds = [
    "artificial intelligence",
    "european union",
    "european parliament",
    "national strategy",
    "government data",
    "public data",
    "public sector",
    "action plan",
]
# compound_tokens = [w.split() for w in compounds]

abbreviations = {
    "ai": "artificial intelligence",
    "eu": "european union",
}

exclude_words = {
    "artificial intelligence",
}

def merge_tokens(tokens, compounds):
    tokens_len = len(tokens)
    i = 0
    while i < tokens_len-1:
        if tokens[i] + " " + tokens[i+1] in compounds:
            new_token = tokens[i] + " " + tokens[i+1]
            new_tokens = tokens[:i] + [new_token] + tokens[i+2:]
            tokens = new_tokens
            tokens_len = len(tokens)
            i = i + 1
        i = i + 1
    return tokens

def replace_tokens(tokens, abbreviations):
    return [abbreviations[t] if t in abbreviations else t for t in tokens]

def exclude_tokens(tokens, exclude_words):
    return [t for t in tokens if t not in exclude_words]

class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        tokens = [self.wnl.lemmatize(t) for t in word_tokenize(doc) if (t.isalpha() and len(t) >= 2)]
        tokens = merge_tokens(tokens, compounds)
        tokens = replace_tokens(tokens, abbreviations)
        tokens = exclude_tokens(tokens, exclude_words)
        return tokens

bow_vectorizer = CountVectorizer(stop_words=stopwords, tokenizer=LemmaTokenizer(), lowercase=True, min_df=10)
bow_matrix = bow_vectorizer.fit_transform(texts)

bow_matrix.shape

(19, 785)

In [6]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda_bow  = LDA(n_components=10, random_state=42) 
lda_bow.fit(bow_matrix)

LatentDirichletAllocation(random_state=42)

In [7]:
lda_bow.transform(bow_matrix[:2])

array([[1.43844445e-05, 3.96437387e-03, 1.43828786e-05, 8.69820043e-01,
        1.43838037e-05, 1.15621691e-01, 1.47196509e-03, 1.43843620e-05,
        1.43802141e-05, 9.05001140e-03],
       [5.19903692e-05, 5.19893818e-05, 5.19827735e-05, 1.30193483e-01,
        5.19881678e-05, 1.41779764e-01, 1.35881863e-02, 6.28401008e-01,
        5.19750572e-05, 8.57776334e-02]])

In [8]:
for idx, topic in enumerate(lda_bow.components_):
    print(f"Top 5 words in Topic #{idx+1}:")
    print([bow_vectorizer.get_feature_names()[i] for i in topic.argsort()[-5:]]) 
    print("")

Top 5 words in Topic #1:
['public', 'technology', 'programme', 'government', 'data']

Top 5 words in Topic #2:
['public', 'data', 'support', 'measure', 'digital']

Top 5 words in Topic #3:
['deep', 'programme', 'learning', 'european union', 'oecd']

Top 5 words in Topic #4:
['support', 'technology', 'strategy', 'data', 'development']

Top 5 words in Topic #5:
['develop', 'benefit', 'approach', 'national', 'data']

Top 5 words in Topic #6:
['sector', 'development', 'digital', 'public', 'data']

Top 5 words in Topic #7:
['development', 'business', 'technology', 'government', 'data']

Top 5 words in Topic #8:
['national', 'data', 'vision', 'technology', 'strategy']

Top 5 words in Topic #9:
['procedure', 'base', 'monitor', 'automotive', 'coordinate']

Top 5 words in Topic #10:
['company', 'data', 'development', 'application', 'ministry']





In [9]:
import pyLDAvis 
import pyLDAvis.sklearn 

pyLDAvis.enable_notebook()

  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):


In [10]:
pyLDAvis.sklearn.prepare(lda_bow, bow_matrix, bow_vectorizer)

  and should_run_async(code)
