In [4]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models, similarities
import glob
import re

stopwords_en = stopwords.words('english')
stopwords_en.extend(['also','say'])
lemmatizer = WordNetLemmatizer()

def preprocessing(file):
    content = " ".join(file.readlines())
    articles = content.split('\n __ARTICLE__\n')[:-1]
    articles = [item.lower() for item in articles]
    articles = [re.sub('[^a-z]', ' ', item) for item in articles]
    articles = [item.split() for item in articles]
    articles = [[lemmatizer.lemmatize(t, pos='v') for t in item if t not in stopwords_en and len(t) > 2] for item in articles]
    return articles

In [5]:
dataset = glob.glob('articles\\*.txt')
print(dataset)

['articles\\2018_1.txt', 'articles\\2018_10.txt', 'articles\\2018_11.txt', 'articles\\2018_12.txt', 'articles\\2018_2.txt', 'articles\\2018_3.txt', 'articles\\2018_4.txt', 'articles\\2018_5.txt', 'articles\\2018_6.txt', 'articles\\2018_7.txt', 'articles\\2018_8.txt', 'articles\\2018_9.txt', 'articles\\2019_1.txt', 'articles\\2019_10.txt', 'articles\\2019_11.txt', 'articles\\2019_12.txt', 'articles\\2019_2.txt', 'articles\\2019_3.txt', 'articles\\2019_4.txt', 'articles\\2019_5.txt', 'articles\\2019_6.txt', 'articles\\2019_7.txt', 'articles\\2019_8.txt', 'articles\\2019_9.txt']


In [6]:
documents = []
for item in dataset:
    f = open(item, mode='r', encoding='utf-8')
    tokens = preprocessing(f)
    documents.extend(tokens)

In [7]:
dictionary = corpora.Dictionary(documents)
corpus_doc2bow_vectors = [dictionary.doc2bow(doc) for doc in documents]

tfidf_model = models.TfidfModel(corpus_doc2bow_vectors, id2word=dictionary, normalize=False)
corpus_tfidf_vectors = tfidf_model[corpus_doc2bow_vectors]

# run LDA model

In [8]:
lda_model = models.LdaMulticore(corpus_doc2bow_vectors, num_topics=10, id2word=dictionary, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"morocco" + 0.007*"moroccan" + 0.007*"women" + 0.006*"say" + 0.005*"rabat" + 0.005*"year" + 0.005*"world" + 0.004*"president" + 0.004*"people" + 0.004*"make"
Topic: 1 
Words: 0.020*"morocco" + 0.010*"say" + 0.007*"moroccan" + 0.006*"sahara" + 0.006*"rabat" + 0.005*"polisario" + 0.005*"western" + 0.005*"algeria" + 0.005*"political" + 0.004*"government"
Topic: 2 
Words: 0.015*"morocco" + 0.009*"percent" + 0.007*"moroccan" + 0.006*"increase" + 0.006*"say" + 0.005*"year" + 0.005*"development" + 0.005*"rabat" + 0.004*"report" + 0.004*"mad"
Topic: 3 
Words: 0.011*"morocco" + 0.010*"moroccan" + 0.007*"team" + 0.006*"festival" + 0.006*"world" + 0.005*"say" + 0.005*"make" + 0.004*"first" + 0.004*"year" + 0.004*"rabat"
Topic: 4 
Words: 0.020*"morocco" + 0.006*"say" + 0.006*"african" + 0.005*"security" + 0.005*"africa" + 0.005*"polisario" + 0.004*"moroccan" + 0.004*"countries" + 0.004*"iran" + 0.004*"rabat"
Topic: 5 
Words: 0.016*"morocco" + 0.010*"moroccan" + 0.008*"say" +

# LDA TF-IDF

In [11]:
lda_model_tfidf = models.LdaMulticore(corpus_tfidf_vectors, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}\n'.format(idx, topic))

Topic: 0 Word: 0.004*"polisario" + 0.004*"sahara" + 0.003*"western" + 0.003*"algeria" + 0.003*"conflict" + 0.002*"morocco" + 0.002*"king" + 0.002*"african" + 0.002*"political" + 0.002*"africa"

Topic: 1 Word: 0.004*"police" + 0.004*"suspect" + 0.004*"migrants" + 0.003*"arrest" + 0.003*"spanish" + 0.003*"bouteflika" + 0.003*"drug" + 0.003*"dgsn" + 0.002*"migration" + 0.002*"traffic"

Topic: 2 Word: 0.002*"energy" + 0.002*"iran" + 0.002*"company" + 0.002*"african" + 0.002*"countries" + 0.002*"project" + 0.002*"morocco" + 0.002*"price" + 0.002*"africa" + 0.002*"bourita"

Topic: 3 Word: 0.002*"percent" + 0.002*"raissouni" + 0.002*"women" + 0.002*"abortion" + 0.002*"morocco" + 0.002*"mad" + 0.001*"increase" + 0.001*"company" + 0.001*"couscous" + 0.001*"world"

Topic: 4 Word: 0.004*"teachers" + 0.004*"football" + 0.004*"team" + 0.003*"club" + 0.003*"referee" + 0.003*"game" + 0.003*"caf" + 0.002*"match" + 0.002*"play" + 0.002*"wydad"

Topic: 5 Word: 0.003*"flight" + 0.002*"women" + 0.002*"att