###  GDELT dataset

In [5]:
# Load the MN-DS news-classification articles (collected by web scraping) from a local CSV.
# The first column of the file is used as the row index.
import pandas as pd

DATA_PATH = "MN-DS-news-classification.csv"
df = pd.read_csv(DATA_PATH, index_col=0)
print(df.shape)
df.head(3)

(10917, 12)


Unnamed: 0_level_0,id,date,source,title,content,author,url,published,published_utc,collection_utc,category_level_1,category_level_2
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1809,abcnews--2019-10-31--Virginia mom charged with...,2019-10-31,abcnews,Virginia mom charged with murder in 2-year-old...,The Virginia woman whose 2-year-old son was fo...,,https://abcnews.go.com/US/wireStory/virginia-m...,"Thu, 31 Oct 2019 16:49:56 -0400",1572554996,1572559512,"crime, law and justice",crime
1980,abcnews--2019-11-07--2 escaped murder suspects...,2019-11-07,abcnews,2 escaped murder suspects arrested at US-Mexic...,Authorities are trying to determine if anyone ...,,https://abcnews.go.com/US/wireStory/escaped-mu...,"Thu, 07 Nov 2019 00:13:12 -0500",1573103592,1573131986,"crime, law and justice",crime
1995,abcnews--2019-11-07--Family turns in escaped b...,2019-11-07,abcnews,"Family turns in escaped boy, 13, suspected in ...",A 13-year-old suspect in a double homicide who...,,https://abcnews.go.com/US/wireStory/family-tur...,"Thu, 07 Nov 2019 07:39:54 -0500",1573130394,1573131982,"crime, law and justice",crime


In [6]:
from sentence_transformers import SentenceTransformer
import umap
from keybert import KeyBERT
import plotly.express as px
from sklearn.cluster import KMeans

### Data Preprocessing

In [7]:
import re

def _normalize_text(text):
    """Lowercase, remove digits, blank out punctuation and trim a single text value.

    Missing values (NaN/None) are returned unchanged.
    """
    if not pd.notna(text):
        return text
    text = text.lower()
    text = re.sub(r'\d+', '', text)        # delete every run of digits
    text = re.sub(r'[^\w\s]', ' ', text)   # any non-word, non-whitespace character becomes a space
    return text.strip()                    # drop leading/trailing whitespace


# NOTE(review): this pass already turns '<', '>', '&', '#', ';' and '/' into spaces, so the
# tag/entity substitutions performed by clean() (applied after this cell) can no longer
# match anything -- confirm whether clean() was meant to run first.
df['content'] = df.content.apply(_normalize_text)

In [8]:
def clean(raw):
    """Remove hyperlinks, a few HTML tags/entities and line breaks from a text.

    Parameters
    ----------
    raw : str or missing value
        Text to clean. Missing values (NaN/None) are passed through untouched.

    Returns
    -------
    str or missing value
        The cleaned text: every ``<a ...>...</a>`` element is replaced by
        ``'Link.'``, the listed entities/tags are replaced or removed, and
        newlines/tabs are turned into single spaces.
    """
    if not pd.notna(raw):
        return raw  # Return NaN value as is

    # `<a\b` only matches genuine anchor tags; a plain `<a` prefix would also
    # swallow elements such as <abbr> or <article>.
    result = re.sub(r"<a\b[^>]*>(.+?)</a>", 'Link.', raw)

    # Literal substitutions, applied in this order.
    replacements = (
        ('&gt;', ""),
        ('&#x27;', "'"),
        ('&quot;', '"'),
        ('&#x2F;', ' '),
        ('<p>', ' '),
        ('</i>', ''),
        ('&#62;', ''),
        ('<i>', ' '),
        # Line breaks and tabs become a space (not an empty string) so that the
        # last word of one line is not glued to the first word of the next.
        ("\n", ' '),
        ("\t", ' '),
    )
    for old, new in replacements:
        result = result.replace(old, new)
    return result

# Apply clean() to every value of the 'content' column
df['content'] = df['content'].apply(clean)

In [9]:
from langdetect import detect

# df is the articles DataFrame; its 'content' column holds the article text

# Replace missing values in the 'content' column with an empty string so that every
# entry is a str before language detection
df["content"] = df["content"].fillna("")

# Filter rows with English text
def detect_language(text):
    """Return the language label that ``detect`` assigns to ``text``.

    Parameters
    ----------
    text : object
        Value to classify; it is converted with ``str()`` first.

    Returns
    -------
    str
        ``"Too Short"`` when the text has fewer than 3 characters,
        ``"Unknown"`` when detection raises an error, otherwise whatever
        ``detect`` returns.
    """
    try:
        text = str(text)
        if len(text) < 3:  # Adjust the threshold as needed
            return "Too Short"
        return detect(text)
    except Exception:
        # Best effort: a text that cannot be classified is labelled "Unknown".
        # Catching Exception (instead of a bare `except:`) still lets
        # KeyboardInterrupt / SystemExit propagate, so a long-running
        # apply() over the whole column can be interrupted.
        return "Unknown"

# Tag every row with its detected language, then keep only the English rows.
df["Language"] = df["content"].apply(detect_language)
# .copy() makes df_filtered an independent frame: later cells add columns to it, which
# on a plain boolean-mask slice triggers SettingWithCopyWarning.
df_filtered = df[df["Language"] == "en"].copy()

In [10]:
# Row/column count and a two-row preview of the English-only subset
print(df_filtered.shape)
df_filtered.head(2)

(10909, 13)


Unnamed: 0_level_0,id,date,source,title,content,author,url,published,published_utc,collection_utc,category_level_1,category_level_2,Language
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1809,abcnews--2019-10-31--Virginia mom charged with...,2019-10-31,abcnews,Virginia mom charged with murder in 2-year-old...,the virginia woman whose year old son was fou...,,https://abcnews.go.com/US/wireStory/virginia-m...,"Thu, 31 Oct 2019 16:49:56 -0400",1572554996,1572559512,"crime, law and justice",crime,en
1980,abcnews--2019-11-07--2 escaped murder suspects...,2019-11-07,abcnews,2 escaped murder suspects arrested at US-Mexic...,authorities are trying to determine if anyone ...,,https://abcnews.go.com/US/wireStory/escaped-mu...,"Thu, 07 Nov 2019 00:13:12 -0500",1573103592,1573131986,"crime, law and justice",crime,en


##### KeyBERT

In [None]:
from keybert import KeyBERT

# Keep wide text columns short in the previews below.
pd.set_option('display.max_colwidth', 30)

# Extract keywords from every article with KeyBERT (default constructor arguments).
kw_model = KeyBERT()
# NB: despite the `titles_*` names, the keywords come from the article *content*.
titles_list = df_filtered.content.tolist()
titles_keys = kw_model.extract_keywords(titles_list)

# One entry per article; later cells read each entry as a list of (keyword, score) pairs.
# The column name 'kewords' (sic) is kept because those cells refer to it.
df_filtered["kewords"] = titles_keys
# Number of keywords extracted for each article.
df_filtered['keys_length'] = df_filtered['kewords'].apply(len)

# Only the last expression of a cell is displayed, so a single preview is enough.
df_filtered.head(2)

In [None]:
# Keep only the articles for which exactly 5 keywords were extracted and drop the helper
# length column. Building the frame in one chain (instead of an in-place drop on a filtered
# slice) yields an independent DataFrame, so later column assignments on df5 are safe.
df5 = df_filtered.loc[df_filtered["keys_length"] == 5].drop(columns="keys_length")
df5.to_csv("parsed_5_keys_file.csv", index=False)  # index=False: the row index is not written
print(df5.shape)
df5.head(1)

In [None]:
import pandas as pd

def extract_keywords(keyword_list):
    """Return the first element (the keyword) of every entry in ``keyword_list``."""
    terms = []
    for pair in keyword_list:
        terms.append(pair[0])
    return terms

def extract_scores(keyword_list):
    """Return the second element (the score) of every entry in ``keyword_list`` as a float."""
    values = []
    for pair in keyword_list:
        values.append(float(pair[1]))
    return values

# Split each row's (keyword, score) pairs into two parallel list columns and drop the raw
# 'kewords' column. assign/drop build a new frame instead of mutating df5 in place, which
# avoids SettingWithCopyWarning when df5 originates from a filtered slice.
df5 = (
    df5
    .assign(
        Keywords=df5['kewords'].apply(extract_keywords),
        Scores=df5['kewords'].apply(extract_scores),
    )
    .drop(columns=['kewords'])
)
df5.head(3)


In [17]:
# List the column labels of df5
df5.columns 

Index(['id', 'date', 'source', 'title', 'content', 'author', 'url',
       'published', 'published_utc', 'collection_utc', 'category_level_1',
       'category_level_2', 'Language', 'Keywords', 'Scores'],
      dtype='object')

In [None]:
# Collect the per-article keyword lists
df5_keys = df5["Keywords"].tolist()
# Concatenate them into one flat list (repeated keywords are kept; nothing is de-duplicated)
flat_keys = []
for article_keys in df5_keys:
    flat_keys.extend(article_keys)
# One row per extracted keyword, in a single 'key' column
keys_df = pd.DataFrame(flat_keys, columns=['key'])
# Total number of extracted keywords (rows) and a short preview
print(keys_df.shape)
keys_df.head()


In [None]:
# NOTE(review): this rebinds keys_df to df5 itself (no copy), replacing whatever keys_df
# held before; any column added to keys_df from here on is therefore added to df5 as well.
keys_df=df5
keys_df.head(3)

#### Embeddings

In [21]:
# Load the 'all-mpnet-base-v2' sentence-embedding model
# (presumably fetched on first use if it is not cached locally -- confirm)
embedder = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Function to embed keywords
import numpy as np
def embed_keywords(keywords):
    """Encode ``keywords`` with the module-level ``embedder`` and return its output."""
    vectors = embedder.encode(keywords)
    return vectors
# Encode every row's keyword list; each cell of 'Embeddings' holds the encoder output for that row
keys_df['Embeddings'] = keys_df['Keywords'].apply(embed_keywords)
# Shape of the 'Embeddings' column itself (one entry per row), not of an individual embedding
print(keys_df['Embeddings'].shape)

# NOTE(review): to_csv stores each embedding as its text representation; large arrays may be
# abbreviated with "..." in that form, so this CSV may not be a lossless store -- verify
# before reloading embeddings from it.
keys_df.to_csv("MNDS_ST_embedding.csv")
keys_df.head(3)

##### Reduce Embeddings Size: UMAP

In [None]:
import numpy as np
import umap

# Stack the per-row embeddings into a single NumPy array
embeddings_array = np.array([np.array(embedding) for embedding in keys_df['Embeddings']])
# A 3-D result (presumably rows x keywords x embedding dim -- confirm) is flattened so that
# every row becomes one long feature vector
if embeddings_array.ndim == 3:
    embeddings_array = embeddings_array.reshape(embeddings_array.shape[0], -1)

# Reduce to 10 dimensions with UMAP (cosine metric). UMAP is stochastic: a fixed
# random_state makes the projection reproducible across kernel restarts (at the cost of
# UMAP's parallelism).
reducer = umap.UMAP(n_neighbors=15,
                    n_components=10,
                    metric='cosine',
                    random_state=42)
reduced_embeddings = reducer.fit_transform(embeddings_array)
# Add the reduced embeddings to the dataframe (one list of 10 floats per row)
keys_df['key_umap'] = reduced_embeddings.tolist()

# Check the output
keys_df.head(3)

##### Determine the number of clusters with the elbow method

In [None]:
# Elbow method for K-means, using yellowbrick's KElbowVisualizer
from yellowbrick.cluster import KElbowVisualizer

# Convert the per-row 'key_umap' lists to a 2-D NumPy array
embedding_values_array = np.array(keys_df["key_umap"].tolist())

# NOTE(review): 'key_umap' already is a UMAP reduction; this applies a second UMAP pass with
# default settings on top of it -- confirm that the double reduction is intended.
# random_state is fixed so that the elbow plot is reproducible between runs.
umap_embedding = umap.UMAP(random_state=42).fit_transform(embedding_values_array)

# Seeded as well: K-means initialisation is random.
model = KMeans(random_state=42)
# k is range of number of clusters.
visualizer = KElbowVisualizer(model, k=(2, 25), timings=True)
visualizer.fit(umap_embedding)        # Fit data to visualizer
visualizer.show()