### Get Data (News articles) from GDELT

In [None]:
import gdelt

# Version 1 queries
gd1 = gdelt.gdelt(version=1)

# pull single day, gkg table
df1= gd1.Search(['2024 March 25','2024 March 27'],coverage=True,table='gkg')
df1.shape

In [None]:
df2.to_excel ("gdelt_March28-30_2024.xlsx")

In [15]:
import pandas as pd
df2=pd.read_excel("gdelt_March28-30_2024.xlsx",index_col=0)

In [16]:
df2.shape

(213934, 5)

In [17]:
df2.columns

Index(['DATE', 'THEMES', 'LOCATIONS', 'TONE', 'SOURCEURLS'], dtype='object')

In [18]:
df2.head()

Unnamed: 0,DATE,THEMES,LOCATIONS,TONE,SOURCEURLS
1,20240329,TAX_FNCACT;TAX_FNCACT_NOMINEES;MEDIA_SOCIAL;TA...,"1#Ghana#GH#GH#8#-2#GH;4#Accra, Greater Accra, ...","1.52905198776758,5.19877675840979,3.6697247706...",https://www.modernghana.com/news/1302487/boga-...
2,20240329,WB_1920_FINANCIAL_SECTOR_DEVELOPMENT;WB_363_FI...,1#Poland#PL#PL#52#20#PL;1#Luxembourg#LU#LU#49....,"-2.85306704707561,1.85449358059914,4.707560627...",https://www.theregister.com/2024/03/29/amazon_...
3,20240329,TAX_ETHNICITY;TAX_ETHNICITY_BLACK;NATURAL_DISA...,"4#Waterloo, Ontario, Canada#CA#CA08#43.4667#-8...","-1.24792013311148,1.99667221297837,3.244592346...",https://www.salon.com/2024/03/28/orange-is-the...
4,20240329,LEADER;TAX_FNCACT;TAX_FNCACT_PRESIDENT;USPEC_P...,1#United States#US#US#39.828175#-98.5795#US;1#...,"0.835073068893529,4.38413361169102,3.549060542...",https://www.thedailybeast.com/newsmax-helps-do...
5,20240329,CRISISLEX_CRISISLEXREC;RECRUITMENT;TAX_FNCACT;...,"2#Connecticut, United States#US#USCT#41.5834#-...","0.504032258064516,3.32661290322581,2.822580645...",https://www.12news.com/article/news/nation-wor...


In [None]:
# Select 15k samples, 5k from each date value
selected_rows = pd.concat([df2[df2['DATE'] == '20240327'].head(5000),
                           df2[df2['DATE'] == '20240326'].head(5000),
                           df2[df2['DATE'] == '20240325'].head(5000)])


print(selected_rows.shape)
selected_rows.head()

### Extract news texts and titles  from SOURCEURLS

In [None]:
import pandas as pd
import newspaper
from newspaper import news_pool
from tqdm import tqdm

df = pd.read_excel('gdelt_March28-30_2024_15k.xlsx')

def extract_text(url):
    try:
        article = newspaper.Article(url)
        article.download()
        article.parse()
        text = article.text.strip()
        return text, 'Success'
    except Exception as e:
        print(f"Error fetching URL: {url}. Error: {str(e)}")
        return None, None

texts = []
for url in tqdm(df['SOURCEURLS'], desc="Extracting texts"):
    text, status = extract_text(url)
    texts.append(text)

df['Text'] = texts

# Drop the "Unnamed: " column from the DataFrame
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

df = df.dropna(subset=['Text'])

print("Total number of articles extracted:", len(texts))
print("DataFrame shape:", df.shape)

df.head()

In [None]:
#f.to_excel("gdelt_extracted_MR_08_2024.xlsx")
#df.to_csv ("GDELT_March_2024_10k_dataset.csv")

In [34]:
df3=pd.read_csv("GDELT_March_2024_10k_dataset.csv", index_col=0)

#pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_colwidth', 30)
#print(df3.shape)
df3.head()

Unnamed: 0,DATE,SOURCEURLS,Title,Text
0,20240328,https://www.yahoo.com/news...,Rain and snow in Wisconsin...,"In the last week, heavy sn..."
1,20240328,https://www.washingtontime...,Youngkin vetoes Virginia b...,"RICHMOND, Va. — Republican..."
2,20240328,https://www.journal-news.c...,GDPR Support,logo_ddn_tag_Site JN with ...
4,20240328,https://www.winnipegfreepr...,Nigerian parents finally g...,"KADUNA, Nigeria (AP) — Par..."
5,20240328,https://godanriver.com/lif...,Taking the kids: Get ready...,Mark your calendars! There...


### Clustering Newspaper Articles from GDELT

### Import packages 

In [1]:
#pip install datasets sentence-transformers umap-learn hdbscan keybert
# manage data
from datasets import load_dataset
import pandas as pd

# embeddings
from sentence_transformers import SentenceTransformer

# dimensionality reduction
import umap

# clustering
import hdbscan

# extract keywords from texts
# used to assign meaningful names to clusters
from keybert import KeyBERT

# visualization
import plotly.express as px

  from pandas.core import (


In [35]:
# download data
import pandas as pd
df=pd.read_csv("GDELT_March_2024_10k_dataset.csv",index_col=0) #Web scrapping articles 
#print(df.shape)
df.head()

Unnamed: 0,DATE,SOURCEURLS,Title,Text
0,20240328,https://www.yahoo.com/news...,Rain and snow in Wisconsin...,"In the last week, heavy sn..."
1,20240328,https://www.washingtontime...,Youngkin vetoes Virginia b...,"RICHMOND, Va. — Republican..."
2,20240328,https://www.journal-news.c...,GDPR Support,logo_ddn_tag_Site JN with ...
4,20240328,https://www.winnipegfreepr...,Nigerian parents finally g...,"KADUNA, Nigeria (AP) — Par..."
5,20240328,https://godanriver.com/lif...,Taking the kids: Get ready...,Mark your calendars! There...


The dataset contains 120k articles. Let’s keep only 3k of them to make computations faster in this project.

## Data Preprocesing

In [3]:
import re

# Convert text to lowercase, handling NaN values
df['Text'] = df.Text.apply(lambda x: x.lower() if pd.notna(x) else x)
# Remove numbers from the text, handling NaN values
#df['Text'] = df.Text.apply(lambda x: re.sub(r'\d+', '', x) if pd.notna(x) else x)
# Remove punctuation from the text, handling NaN values
#df['Text'] = df.Text.apply(lambda x: re.sub(r'[^\w\s]', ' ', x) if pd.notna(x) else x)
# Remove leading and trailing whitespaces, handling NaN values
df['Text'] = df.Text.apply(lambda x: x.strip() if pd.notna(x) else x)

In [4]:
def clean(raw):
    """ Remove hyperlinks and markup """
    # Check if text is not NaN
    if pd.notna(raw):
        result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
        #result= re.sub("[^a-zA-Z]","",str(result)) # Search for all non-letters  " ",   # Replace all non-letters with spaces
        result = re.sub('&gt;', "", result)
        result = re.sub('&#x27;', "'", result)
        result = re.sub('&quot;', '"', result)
        result = re.sub('&#x2F;', ' ', result)
        result = re.sub('<p>', ' ', result)
        result = re.sub('</i>', '', result)
        result = re.sub('&#62;', '', result)
        result = re.sub('<i>', ' ', result)
        result = re.sub("\n", '', result)
        result = re.sub("\t", '', result)
        return result
    else:
        return raw  # Return NaN value as is

# Apply the clean function to the 'Text' column
df['Text'] = df['Text'].apply(clean)

#### Remove non-English words

In [5]:
from langdetect import detect

# Assuming df is your DataFrame and 'Text' is the column containing text data

# Replace NaN values in the 'Text' column with an empty string
df["Text"] = df["Text"].fillna("")

# Filter rows with English text
def detect_language(text):
    try:
        if len(str(text)) < 3:  # Adjust the threshold as needed
            return "Too Short"
        return detect(str(text))
    except:
        return "Unknown"

df["Language"] = df["Text"].apply(detect_language)
df_filtered = df[df["Language"] == "en"]

In [37]:
#print(df_filtered.shape)
#df_filtered.head()

In [None]:
from keybert import KeyBERT

kw_model = KeyBERT()
keyword_list = df_filtered.Text.tolist()
keyword_keys = kw_model.extract_keywords(keyword_list)
df_filtered["keyword_keys"] = keyword_keys

In [12]:
df_filtered.columns

Index(['DATE', 'SOURCEURLS', 'Title', 'Text', 'Language', 'keyword_keys'], dtype='object')

In [14]:
df_filtered=df_filtered[['DATE', 'SOURCEURLS', 'Title', 'Text', 'keyword_keys']]
df_filtered.head(3)

Unnamed: 0,DATE,SOURCEURLS,Title,Text,keyword_keys
0,20240328,https://www.yahoo.com/news/rain-snow-wisconsin...,Rain and snow in Wisconsin offer some relief f...,"in the last week, heavy snow and rain througho...","[(drought, 0.5093), (droughts, 0.4863), (wisco..."
1,20240328,https://www.washingtontimes.com/news/2024/mar/...,Youngkin vetoes Virginia bills mandating minim...,"richmond, va. — republican virginia gov. glenn...","[(marijuana, 0.4246), (cannabis, 0.3914), (leg..."
2,20240328,https://www.journal-news.com/nation-world/a-ti...,GDPR Support,logo_ddn_tag_site jn with tagline logo-sns_tag...,"[(logo_ddn_tag_site, 0.7676), (sns_tag_siteour..."


### LLM for Keyword extraction

In [None]:
import openai
from keybert.llm import OpenAI
from keybert import KeyLLM

In [None]:
# Create your LLM
openai.api_key = "key"

prompt = """
 find  keywords  that best describe  text.
"""
llm = OpenAI(model="gpt-4", prompt=prompt, chat=True)

# Load it in KeyLLM
kw_model = KeyLLM(llm)

titles_list = df_filtered.Text.tolist()
# Extract keywords
keywords = kw_model.extract_keywords(titles_list, check_vocab=True); keywords

# Add the results to df
df_filtered["kewords"] = keywords
time.sleep(30)

In [None]:
# Save the data and the results to a file
df_filtered.to_csv("extracted_keys_file_keyLLM.csv", index=False)

In [15]:
# Take a look at the output
pd.set_option('display.max_colwidth', 20)
df_filtered.head()     

Unnamed: 0,DATE,SOURCEURLS,Title,Text,keyword_keys
0,20240328,https://www.yaho...,Rain and snow in...,in the last week...,"[(drought, 0.509..."
1,20240328,https://www.wash...,Youngkin vetoes ...,"richmond, va. — ...","[(marijuana, 0.4..."
2,20240328,https://www.jour...,GDPR Support,logo_ddn_tag_sit...,[(logo_ddn_tag_s...
4,20240328,https://www.winn...,Nigerian parents...,"kaduna, nigeria ...","[(kaduna, 0.5333..."
5,20240328,https://godanriv...,Taking the kids:...,mark your calend...,"[(eclipse, 0.470..."


In [38]:
import pandas as pd
import ast

# Create a column that records the number of keywords/keyphrases
df_filtered['keys_length'] = df_filtered['keyword_keys'].apply(lambda x: len(x))
#df_filtered.head() 

In [None]:
# Keep those titles that have 5 keywords/keyphrases only
df5 = df_filtered[df_filtered.keys_length == 5]
df5.drop('keys_length', axis=1, inplace=True)
# Save the data to a csv file
df5.to_csv("parsed_5_keys_file.csv", index=False)
df5.head()

In [20]:
# Create a list of all sublists of keywords and keyphrases
df5_keys = df5.keyword_keys.tolist()
flat_keys = [item for sublist in df5_keys for item in sublist]

# Create a list of unique keywords
flat_keys = list(set(flat_keys))
keys_df = pd.DataFrame(flat_keys, columns = ['key','score'])
keys_df.shape     

(38020, 2)

In [21]:
# Display a sample of keywords or keyphrases
keys_df.head()     

Unnamed: 0,key,score
0,hengshui,0.4632
1,miller,0.4315
2,oscars,0.3332
3,sewage,0.3001
4,fares,0.3618


### LLM embedding: using text-embedding-ada-002

In [13]:
import os
import openai

# Set your OpenAI API key
openai.api_key = "key"
#  access the API key through os.environ
os.environ['OPENAI_API_KEY'] = openai.api_key


In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

keys_df['LLM_embedding'] = keys_df.key.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
#keys_df.to_csv('gdelt_title_text_with_LLM_embedding.csv', index=0) # save the embedding

In [29]:
import pandas as pd
keys_df=pd.read_csv("gdelt_title_text_with_LLM_embedding.csv") #Web scrapping articles 
keys_df.columns=["Key","Score","LLM_embeddings"]
print(keys_df.shape)
keys_df.head()

(18514, 3)


Unnamed: 0,Key,Score,LLM_embeddings
0,hyundai,0.4551,[-0.012657809071...
1,jailing,0.3627,[-0.023275019600...
2,nicknames,0.3657,[-0.032408613711...
3,currencies,0.3676,[-0.004608497489...
4,politician,0.3417,[-0.019050309434...


### Reduce Embeddings Size
For visualization purposes, we will reduce the dimensionality of the embeddings to 2 using UMAP. It's important to note that UMAP's performance can be sensitive to its hyperparameters. 

In [None]:
import ast

# Convert string representations of lists to actual lists
keys_df['LLM_embeddings'] = keys_df['LLM_embeddings'].apply(ast.literal_eval)

# Use UMAP algorithm
embeddings = umap.UMAP(n_neighbors=15,
                       n_components=10,
                       metric='cosine').fit_transform(keys_df['LLM_embeddings'].tolist())
print(embeddings)

# Add the reduced embeddings to the dataframe
keys_df['key_umap'] = embeddings.tolist()

# Check the output
keys_df.head(2)

In [33]:
print(keys_df.columns)
keys_df.head(2)

Index(['Key', 'Score', 'LLM_embeddings', 'key_umap'], dtype='object')


Unnamed: 0,Key,Score,LLM_embeddings,key_umap
0,hyundai,0.4551,[-0.012657809071...,[3.7995724678039...
1,jailing,0.3627,[-0.023275019600...,[6.0693130493164...


### Elbow Method

In [20]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import umap.umap_ as umap
from sklearn.cluster import KMeans
import numpy as np

In [21]:
df=keys_df.loc[:,['key', 'score', 'ada_embedding', 'key_umap']]

In [None]:
# Elbow Method
from yellowbrick.cluster import KElbowVisualizer

embedding_values_array = np.array(df["key_umap"].tolist())
    
umap_embedding = umap.UMAP().fit_transform(embedding_values_array)

model = KMeans()
visualizer = KElbowVisualizer(model, k=(2,15), timings= True)
visualizer.fit(umap_embedding)        # Fit data to visualizer
visualizer.show()       # Finalize and render figure