In [6]:
! pip install gensim 
! pip install pyldavis
! pip install matplotlib






In [7]:
import matplotlib.pyplot as plt 
import logging 
import pandas as pd
import json 
import glob 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import spmatrix 
import pyLDAvis.gensim_models
import pyLDAvis
import pyLDAvis.lda_model
# Put pyLDAvis into notebook mode (presumably so prepared visualisations are
# displayed inline as cell output -- confirm against the pyLDAvis docs).
pyLDAvis.enable_notebook()
# getLogger() without a name returns the root logger.
logger = logging.getLogger()



In [8]:
import glob 
def data_compactor(data_dir: str) -> None:
    """Combine the raw JSON articles of one sub-folder into a single CSV file.

    Every ``*.json`` file under ``../../data/raw/<data_dir>/`` is parsed and
    the required article fields are written to
    ``../../data/processed/<data_dir>.csv``.

    Args:
        data_dir (str): name of the sub-folder of ``../../data/raw`` to read;
            it is also used as the stem of the output CSV file.

    Returns:
        None: the dataframe is written to a csv file. When no article could
        be loaded the problem is logged and no file is written.

    Raises:
        KeyError: if one of the required columns is absent from every article.
    """
    req_cols = ['description', 'maintext', 'source_domain', 'title', 'url', 'language', 'date_publish']

    data_list = []
    try:
        # Sorted so the row order of the CSV does not depend on the arbitrary
        # order in which glob lists the directory.
        for name in sorted(glob.glob(f"../../data/raw/{data_dir}/*.json")):
            # The context manager closes the handle even when parsing fails;
            # JSON interchange files are UTF-8, so do not rely on the
            # platform's default encoding.
            with open(name, 'r', encoding='utf-8') as handle:
                data_list.append(json.load(handle))
    except FileNotFoundError as e:
        logging.error(f"File not found: {e}")
        return

    if not data_list:
        # glob yields an empty list (it does not raise) for a missing or empty
        # folder; selecting the required columns from an empty frame would
        # otherwise fail with an unhelpful KeyError.
        logging.error(f"File not found: no JSON articles in ../../data/raw/{data_dir}")
        return

    df = pd.DataFrame(data_list)
    df = df[req_cols]
    df.to_csv(f"../../data/processed/{data_dir}.csv", index=False)
    return

In [9]:
# Build the processed CSV for each region analysed below
# ("MiddleEast" is currently left out).
for region in ("UK", "US"):
    data_compactor(region)

In [11]:
# Load the per-region CSVs written by data_compactor back into dataframes.
df_UK, df_US = (
    pd.read_csv("../../data/processed/UK.csv"),
    pd.read_csv("../../data/processed/US.csv"),
)

In [12]:
# The topic model below is fitted on the UK article titles only.
df_UK_titles = df_UK.loc[:, "title"]

In [20]:
def get_tfidf_scores(raw_df: pd.Series) -> tuple:
    """Vectorise a collection of texts as raw term counts and as TF-IDF weights.

    Both vectorisers are created from the same settings and fitted on the same
    documents: English stop words removed, accents stripped, lower-cased,
    tokens of at least three ASCII letters, and terms kept only when they
    occur in at least 1% and at most 50% of the documents.

    Args:
        raw_df (pd.Series): the documents to vectorise, one string per entry
            (e.g. a single text column of a dataframe). It is passed straight
            to ``fit_transform``.

    Returns:
        tuple: ``(tf, tfidf, tfidf_vectorizer)`` -- the term-count matrix, the
        TF-IDF matrix and the fitted ``TfidfVectorizer``.
    """
    shared_params = dict(
        strip_accents='unicode',
        stop_words='english',
        lowercase=True,
        token_pattern=r'\b[a-zA-Z]{3,}\b',
        max_df=0.5,
        min_df=0.01,
    )

    tf_vectorizer = CountVectorizer(**shared_params)
    tf = tf_vectorizer.fit_transform(raw_df)

    # Built from the shared settings rather than tf_vectorizer.get_params():
    # get_params() also carries CountVectorizer's integer ``dtype`` over, which
    # TfidfVectorizer rejects with a UserWarning before converting to float64.
    tfidf_vectorizer = TfidfVectorizer(**shared_params)
    tfidf = tfidf_vectorizer.fit_transform(raw_df)

    return tf, tfidf, tfidf_vectorizer

In [27]:
def lda_topic_modelling(tf: spmatrix, tfidf: spmatrix,
                        n_components: int = 10, random_state: int = 0) -> tuple:
    """Fit one LDA topic model on the term-frequency matrix and one on the TF-IDF matrix.

    Args:
        tf (spmatrix): Term Frequency Matrix
        tfidf (spmatrix): TFIDF Matrix
        n_components (int, optional): number of topics each model extracts.
            Defaults to 10.
        random_state (int, optional): seed passed to both models so repeated
            runs give the same topics. Defaults to 0.

    Returns:
        tuple: ``(lda_tf, lda_tfidf)`` -- the ``LatentDirichletAllocation``
        model fitted on ``tf`` and the one fitted on ``tfidf``.
    """
    # initialise and fit the model for the term frequency matrix
    lda_tf = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    lda_tf.fit(tf)

    # initialise and fit the model for the tfidf matrix
    lda_tfidf = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    lda_tfidf.fit(tfidf)

    return lda_tf, lda_tfidf

In [28]:
# Vectorise the UK titles. NOTE: the third value returned by get_tfidf_scores
# is the fitted TfidfVectorizer, even though it is bound to `tf_vectorizer` here.
tf, tf_idf, tf_vectorizer = get_tfidf_scores(df_UK_titles)
# Fit one LDA model per matrix (term counts and TF-IDF).
lda_tf, lda_tfidf = lda_topic_modelling(tf, tf_idf)



In [29]:
# Print the highest-weighted terms of every topic of the TF-IDF based model.
n_top_words = 10
# Fetch the vocabulary once instead of once per printed word.
feature_names = tf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda_tfidf.components_):
    print("Topic %d:" % (topic_idx))
    # argsort is ascending, so walk it backwards to take the largest weights.
    top_term_ids = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join(feature_names[i] for i in top_term_ids))

Topic 0:
happened aid gaza israel war says biden say hostages president
Topic 1:
ukraine russia attack putin soldiers war american nato ukrainian troops
Topic 2:
israel hamas idf new war gaza missile leader terror terrorists
Topic 3:
market research billion reach forecast stratview says japan aircraft korea
Topic 4:
gaza israeli hospital israel forces palestinians killed shifa hostage says
Topic 5:
russian army west people hezbollah dead bank border israeli plane
Topic 6:
gaza ceasefire calls netanyahu palestinian israel minister hamas shows civilians
Topic 7:
day pro conflict palestine london world group video know state
Topic 8:
military china support east talks kyiv navy ukraine weapons middle
Topic 9:
yemen sea red british houthi iran attacks drone ship houthis


In [30]:
# Prepare the pyLDAvis view of the term-frequency LDA model. Left as the bare
# last expression of the cell so its result is shown as the cell output.
# NOTE(review): `tf_vectorizer` is the TfidfVectorizer returned by
# get_tfidf_scores; it is used here only as the source of the vocabulary --
# confirm it matches the columns of `tf`.
pyLDAvis.lda_model.prepare(lda_tf, tf, tf_vectorizer)


