In [1]:
import nltk; 
import ssl
from pprint import pprint

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('stopwords')

! pip install spacy
#python -m spacy download en_core_web_sm
#import spacy
#nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])




[nltk_data] Downloading package stopwords to /Users/avtk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
import re
import numpy as np
import pandas as pd


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
#import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [4]:
df_uk = pd.read_csv("../../data/processed/UK.csv")
df_us = pd.read_csv("../../data/processed/US.csv")
df_combined = pd.concat([df_uk,df_us])
print(df_combined.head(), df_combined.columns)
print(df_combined.columns)

                                         description  \
0  The strategy goes well beyond defeating an opp...   
1  All the conflict in Gaza is achieving is civil...   
2  Mexico has launched its army-run airline, with...   
3                           This blog is now closed.   
4  Palestinian Islamic Jihad posts videos of two ...   

                                            maintext  \
0  How to make sense of the sheer intensity of Is...   
1  All the conflict in Gaza is achieving is civil...   
2  For free real time breaking news alerts sent s...   
3  From 23 Dec 2023 18.49 CET UNRWA: 'People in G...   
4  The BBC will not be broadcasting the clip itse...   

               source_domain  \
0        www.theguardian.com   
1  www.unitedkingdomnews.net   
2      www.independent.co.uk   
3        www.theguardian.com   
4              www.bbc.co.uk   

                                               title  \
0  Israel’s use of disproportionate force is a lo...   
1  Israel cant defeat

In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

words = list(sent_to_words(df_combined.title.values))

In [6]:
words

[['israel',
  'use',
  'of',
  'force',
  'is',
  'long',
  'established',
  'tactic',
  'with',
  'clear',
  'aim'],
 ['israel', 'cant', 'defeat', 'hamas', 'in', 'battle', 'so', 'what', 'next'],
 ['mexico',
  'army',
  'run',
  'airline',
  'takes',
  'to',
  'the',
  'skies',
  'with',
  'first',
  'flight',
  'to',
  'the',
  'resort',
  'of',
  'tulum'],
 ['death',
  'toll',
  'from',
  'israeli',
  'attacks',
  'tops',
  'as',
  'it',
  'happened'],
 ['israel', 'sees', 'sign', 'of', 'life', 'in', 'gaza', 'hostage', 'video'],
 ['putin',
  'left',
  'scrambling',
  'as',
  'russia',
  'monthly',
  'losses',
  'comparable',
  'to',
  'wwi'],
 ['us',
  'launches',
  'airstrikes',
  'on',
  'iran',
  'backed',
  'terrorists',
  'in',
  'iraq',
  'in',
  'retaliation',
  'for',
  'missile',
  'attack',
  'on',
  'american',
  'airbase',
  'that',
  'injured',
  'troops'],
 ['chinese',
  'performers',
  'bring',
  'cultural',
  'diversity',
  'to',
  'london',
  'lord',
  'mayor',
  'sho

In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
#print(trigram_mod)

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [8]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [9]:
data_words_nostops = remove_stopwords(words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [10]:
data_words_bigrams

[['israel', 'use', 'force', 'long', 'established', 'tactic', 'clear', 'aim'],
 ['israel', 'cant', 'defeat', 'hamas', 'battle', 'next'],
 ['mexico',
  'army',
  'run',
  'airline',
  'takes',
  'skies',
  'first',
  'flight',
  'resort',
  'tulum'],
 ['death_toll', 'israeli', 'attacks', 'tops', 'happened'],
 ['israel', 'sees', 'sign', 'life', 'gaza', 'hostage', 'video'],
 ['putin',
  'left',
  'scrambling',
  'russia',
  'monthly',
  'losses',
  'comparable',
  'wwi'],
 ['us',
  'launches',
  'airstrikes',
  'iran_backed',
  'terrorists',
  'iraq',
  'retaliation',
  'missile',
  'attack',
  'american',
  'airbase',
  'injured',
  'troops'],
 ['chinese',
  'performers',
  'bring',
  'cultural',
  'diversity',
  'london',
  'lord',
  'mayor',
  'show'],
 ['israel',
  'responds',
  'force',
  'support',
  'hamas',
  'soars',
  'west_bank',
  'october',
  'attack'],
 ['china',
  'issues',
  'furious',
  'rebuke',
  'biden',
  'branded',
  'president',
  'xi',
  'dictator',
  'hours',
  'pa

In [11]:
id2word = corpora.Dictionary(data_words_bigrams)
texts = data_words_bigrams
corpus = [id2word.doc2bow(text) for text in texts]

In [12]:
# human readable format
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('aim', 1),
  ('clear', 1),
  ('established', 1),
  ('force', 1),
  ('israel', 1),
  ('long', 1),
  ('tactic', 1),
  ('use', 1)]]

In [13]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [14]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.049*"hostages" + 0.042*"china" + 0.036*"hospital" + 0.020*"pentagon" + '
  '0.016*"south" + 0.016*"un" + 0.015*"official" + 0.015*"time" + 0.014*"amid" '
  '+ 0.014*"talks"'),
 (1,
  '0.022*"day" + 0.020*"conflict" + 0.017*"kills" + 0.016*"christmas" + '
  '0.014*"ceasefire" + 0.014*"keep" + 0.014*"al_shifa" + 0.013*"launch" + '
  '0.012*"special" + 0.012*"calls"'),
 (2,
  '0.078*"us" + 0.062*"says" + 0.047*"military" + 0.029*"attack" + '
  '0.024*"troops" + 0.023*"new" + 0.021*"forces" + 0.020*"attacks" + '
  '0.017*"houthi" + 0.015*"iraq"'),
 (3,
  '0.078*"yemen" + 0.059*"strikes" + 0.025*"houthi_rebels" + 0.024*"red_sea" + '
  '0.023*"drones" + 0.021*"led" + 0.017*"linked" + 0.013*"announces" + '
  '0.013*"shipping" + 0.012*"coast"'),
 (4,
  '0.132*"israel" + 0.109*"gaza" + 0.095*"hamas" + 0.078*"war" + '
  '0.047*"israeli" + 0.014*"hostage" + 0.012*"fighting" + 0.010*"cease_fire" + '
  '0.009*"dead" + 0.007*"video"'),
 (5,
  '0.059*"iran" + 0.030*"netanyahu" + 0.030*"coul

In [15]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.090413809355207

Coherence Score:  0.4756395679789014


In [16]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')   
#vis






In [17]:
data_words_nostops = remove_stopwords(words)

# Form Trigrams
data_words_trigrams = make_trigrams(data_words_nostops)

id2word = corpora.Dictionary(data_words_trigrams)
texts = data_words_trigrams
corpus = [id2word.doc2bow(text) for text in texts]

In [18]:
# human readable format
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('aim', 1),
  ('clear', 1),
  ('established', 1),
  ('force', 1),
  ('israel', 1),
  ('long', 1),
  ('tactic', 1),
  ('use', 1)]]

In [19]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.047*"yemen" + 0.024*"pentagon" + 0.023*"weapons" + 0.021*"group" + '
  '0.020*"two" + 0.019*"shot" + 0.018*"week" + 0.018*"near" + 0.017*"drones" + '
  '0.016*"one"'),
 (1,
  '0.049*"houthi" + 0.040*"red_sea" + 0.032*"ship" + 0.030*"missiles" + '
  '0.027*"navy" + 0.020*"uk" + 0.020*"official" + 0.017*"zelenskyy" + '
  '0.016*"zelensky" + 0.014*"officials"'),
 (2,
  '0.137*"us" + 0.068*"biden" + 0.051*"attack" + 0.042*"troops" + '
  '0.034*"attacks" + 0.027*"iraq" + 0.019*"trump" + 0.012*"cease_fire" + '
  '0.011*"middle_east" + 0.011*"rocket"'),
 (3,
  '0.093*"russian" + 0.032*"ukrainian" + 0.030*"putin" + 0.024*"border" + '
  '0.021*"hit" + 0.020*"christmas" + 0.017*"market" + 0.014*"claims" + '
  '0.014*"kill" + 0.014*"billion"'),
 (4,
  '0.096*"ukraine" + 0.066*"russia" + 0.063*"military" + 0.029*"china" + '
  '0.018*"defense" + 0.017*"aid" + 0.012*"kyiv" + 0.012*"live_updates" + '
  '0.011*"report" + 0.008*"allies"'),
 (5,
  '0.082*"hostages" + 0.033*"palestinian" + 0.03

In [21]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_trigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.131267170247964

Coherence Score:  0.4820479950852216


In [22]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')   



