In [28]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def load_data(file_name="news_Feb_14.csv", column="title"):
    """Load the headline column from a CSV file as lower-cased strings.

    Parameters
    ----------
    file_name : str, optional
        Path of the CSV file to read. Defaults to the notebook's dataset.
    column : str, optional
        Name of the text column to return. Defaults to ``"title"``.

    Returns
    -------
    pandas.Series
        One lower-cased string per CSV row, in file order. The row count is
        preserved so the result stays aligned with the raw DataFrame.

    Raises
    ------
    KeyError
        If ``column`` is not present in the file.
    """
    df = pd.read_csv(file_name)
    if column not in df.columns:
        raise KeyError(f"Column {column!r} not found in {file_name!r}")
    # Missing values would otherwise be cast to the literal token "nan" and
    # end up in the vocabulary; replace them with an empty document instead.
    titles = df[column].fillna("").astype(str).str.lower()
    return titles


In [26]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

def k_means_clustering(data, k):
    """Fit K-Means with ``k`` clusters on ``data``.

    Returns a ``(labels, fitted_model)`` tuple, where ``labels`` is the
    fitted estimator's ``labels_`` attribute.
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    model = KMeans(n_clusters=k, random_state=24442).fit(data)
    return model.labels_, model

def calculate_wss(data, labels, kmeans):
    """Compute the within-cluster sum of squares (WSS) for a clustering.

    Parameters
    ----------
    data : array-like or scipy sparse matrix, shape (n_samples, n_features)
        The samples that were clustered.
    labels : array-like, shape (n_samples,)
        Cluster index assigned to each sample.
    kmeans : object
        Fitted model exposing ``n_clusters`` and ``cluster_centers_``.

    Returns
    -------
    float
        Sum over all samples of the squared Euclidean distance to the
        centroid of the cluster the sample is assigned to.
    """
    labels = np.asarray(labels)
    is_sparse = hasattr(data, "toarray")
    if not is_sparse:
        data = np.asarray(data)

    wss = 0.0
    for i in range(kmeans.n_clusters):
        cluster_points = data[labels == i]
        if is_sparse:
            # Sparse minus dense yields np.matrix, on which ``** 2`` is a
            # matrix product rather than an element-wise square. Densify the
            # (small) per-cluster slice first so the arithmetic is element-wise.
            cluster_points = cluster_points.toarray()
        centroid = np.asarray(kmeans.cluster_centers_[i])
        wss += float(np.sum((cluster_points - centroid) ** 2))
    return wss

def calculate_silhouette_score(data, labels):
    """Return ``silhouette_score(data, labels)`` for the given clustering."""
    return silhouette_score(data, labels)

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Initialize preprocessing tools once at module level; apply_lemmatization and
# apply_stemming below reuse these shared instances for every title.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def apply_lemmatization(text):
    """Lemmatize every whitespace-separated token of ``text``.

    The tokens are passed one by one through the shared ``lemmatizer`` and
    re-joined with single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

def apply_stemming(text):
    """Stem every whitespace-separated token of ``text``.

    The tokens are passed one by one through the shared ``stemmer`` and
    re-joined with single spaces.
    """
    return ' '.join(stemmer.stem(token) for token in text.split())


def preprocessing(data, lemmatization=False):
    """Normalise each text in ``data`` by lemmatization or stemming.

    Exactly one of the two transforms is applied via ``data.apply``:
    ``apply_lemmatization`` when ``lemmatization`` is truthy, otherwise
    ``apply_stemming``. The transformed object is returned.
    """
    transform = apply_lemmatization if lemmatization else apply_stemming
    return data.apply(transform)

In [39]:
def make_matrix(data, model='cv', ngram=(1,1), stop_words=None, n_comp=100, max_features=5000, random_state=24442):
    """Vectorise ``data`` into a document-feature matrix.

    Parameters
    ----------
    data : iterable of str
        The documents to vectorise.
    model : {'cv', 'tfidf', 'lsa'}
        ``'cv'`` uses CountVectorizer, ``'tfidf'`` uses TfidfVectorizer and
        ``'lsa'`` applies TruncatedSVD on top of the TF-IDF matrix.
    ngram : tuple
        Passed as ``ngram_range`` to the vectorizer.
    stop_words : str, list or None
        Passed as ``stop_words`` to the vectorizer.
    n_comp : int
        Number of SVD components; only used when ``model == 'lsa'``.
    max_features : int
        Passed as ``max_features`` to the vectorizer.
    random_state : int or None
        Seed for TruncatedSVD so that LSA runs are repeatable; only used when
        ``model == 'lsa'``.

    Returns
    -------
    The matrix returned by the selected transformer's ``fit_transform``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # Validate first: previously an unknown model fell through every branch and
    # failed with an UnboundLocalError on ``matrix``.
    if model not in ('cv', 'tfidf', 'lsa'):
        raise ValueError(f"Unknown model {model!r}; expected 'cv', 'tfidf' or 'lsa'")

    if model == 'cv':
        vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=ngram, max_features=max_features)
        return vectorizer.fit_transform(data)

    # Both 'tfidf' and 'lsa' start from the same TF-IDF representation.
    tfidf = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram, max_features=max_features)
    tfidf_matrix = tfidf.fit_transform(data)
    if model == 'tfidf':
        return tfidf_matrix

    # LSA: reduce the TF-IDF matrix with a seeded truncated SVD.
    svd = TruncatedSVD(n_components=n_comp, random_state=random_state)
    return svd.fit_transform(tfidf_matrix)

In [46]:
# Load the lower-cased titles and build a count-vector matrix from them.
# Note: n_comp is only consumed by the 'lsa' branch of make_matrix, so it has
# no effect on this 'cv' call.
data = load_data()
matrix = make_matrix(data, model='cv', n_comp=100)


In [52]:
def print_and_return_clusters_scores(matrix, n_clusters=5, random_state=24442, n_init=10):
    """Cluster ``matrix`` with K-Means, print both quality metrics, return them.

    Returns ``(clusters, wss, silhouette_avg)``: the per-row cluster labels,
    the fitted model's ``inertia_`` and the silhouette score of the labels.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=n_init)
    clusters = model.fit_predict(matrix)

    # inertia_ is reported as the WSS; silhouette is computed from the labels.
    wss = model.inertia_
    silhouette_avg = silhouette_score(matrix, clusters)

    report = (
        f"WSS (Within-Cluster Sum of Squares): {wss:.4f}",
        f"Silhouette Score: {silhouette_avg:.4f}",
    )
    for line in report:
        print(line)

    return clusters, wss, silhouette_avg

# Example usage: cluster the count-vector matrix with the default settings
# (5 clusters) and keep the labels and both metrics for the cells below.
clusters, wss, silhouette_avg = print_and_return_clusters_scores(matrix)

WSS (Within-Cluster Sum of Squares): 4335.6288
Silhouette Score: 0.0136


In [61]:
def join_and_sort_by_clusters(data, clusters):
    """Attach cluster labels to ``data`` and sort the rows by cluster.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        The original rows, in the same order as ``clusters``.
    clusters : array-like, shape (len(data),)
        Cluster label for each row of ``data``.

    Returns
    -------
    pandas.DataFrame
        ``data`` (with its index reset) plus a ``'Cluster'`` column, sorted by
        that column.

    Raises
    ------
    ValueError
        If ``clusters`` does not contain exactly one label per row of ``data``.
    """
    # A length mismatch would otherwise be padded with NaN by ``concat`` and
    # go unnoticed, pairing rows with the wrong (or no) cluster.
    if len(clusters) != len(data):
        raise ValueError(
            f"Got {len(clusters)} cluster labels for {len(data)} rows of data"
        )

    # Create a DataFrame from the clusters
    clusters_df = pd.DataFrame(clusters, columns=['Cluster'])

    # Reset the index so both frames align positionally before joining
    data_with_clusters = pd.concat([data.reset_index(drop=True), clusters_df], axis=1)

    # Sort the DataFrame by the 'Cluster' column
    sorted_data = data_with_clusters.sort_values(by='Cluster')

    return sorted_data

# Example usage: re-read the full CSV (all columns, original casing) and attach
# the cluster labels computed above, then print the rows grouped by cluster.
raw_data = pd.read_csv("news_Feb_14.csv")

sorted_data = join_and_sort_by_clusters(raw_data, clusters)
print(sorted_data)

                                                 title        date  Cluster
362  Former SC judge Sheikh Azmat Saeed’s funeral t...  14/02/2025        0
383  Yango Pakistan joined hands with Elixs Bikes t...  14/02/2025        0
423  KP govt prepares to launch first air ambulance...  13/02/2025        0
114  FBR confident of raising tax-to-GDP ratio desp...  13/02/2025        0
278  Social media reacts to Paul George scoring onl...  13/02/2025        0
..                                                 ...         ...      ...
229  9 Nail Polish Shades to Fall in Love With This...  13/02/2025        4
243       Aurat March kickstarts in February this year  13/02/2025        4
249  Remembering Faiz: A Bengali kid’s first lesson...  13/02/2025        4
160  Ranked: The Best Men in Romance Movies That'll...  14/02/2025        4
226  TikTok’s Back in the Game! The Viral App Final...  14/02/2025        4

[453 rows x 3 columns]


In [69]:
def full_pipeline(titles, raw_data, model='cv', ngram=(1,1), stop_words=None, n_comp=100, max_features=5000, n_clusters=5, random_state=24442, n_init=10, lemmatization=False):
    """Run preprocessing, vectorisation, K-Means and reporting end to end.

    Parameters
    ----------
    titles : pandas.Series
        Texts to cluster, one per row of ``raw_data`` and in the same order.
    raw_data : pandas.DataFrame
        Original rows; must contain a ``'title'`` column for the sample print.
    model, ngram, stop_words, n_comp, max_features
        Forwarded to ``make_matrix``.
    n_clusters, random_state, n_init
        Forwarded to K-Means via ``print_and_return_clusters_scores``.
    lemmatization : bool
        Forwarded to ``preprocessing`` (lemmatize if true, otherwise stem).

    Returns
    -------
    tuple
        ``(sorted_data, clusters, wss, silhouette_avg)``.

    Raises
    ------
    ValueError
        If ``titles`` and ``raw_data`` do not have the same number of rows.
    """
    # Labels are joined back to raw_data by position, so both inputs must have
    # the same length; otherwise the join would silently misalign.
    if len(titles) != len(raw_data):
        raise ValueError(
            f"titles has {len(titles)} rows but raw_data has {len(raw_data)}"
        )

    # Preprocess data
    titles = preprocessing(titles, lemmatization=lemmatization)

    # Create matrix
    matrix = make_matrix(titles, model=model, ngram=ngram, stop_words=stop_words, n_comp=n_comp, max_features=max_features)

    # Cluster, compute WSS / silhouette and print them (shared helper instead
    # of a second copy of the same K-Means code).
    clusters, wss, silhouette_avg = print_and_return_clusters_scores(
        matrix, n_clusters=n_clusters, random_state=random_state, n_init=n_init
    )

    # Join and sort data by clusters (shared helper)
    sorted_data = join_and_sort_by_clusters(raw_data, clusters)

    # Print up to 5 sentences from each of the first 5 clusters
    for cluster_num in range(min(n_clusters, 5)):
        print(f"\nCluster {cluster_num}:")
        cluster_sentences = sorted_data[sorted_data['Cluster'] == cluster_num]['title'].head(5)
        for sentence in cluster_sentences:
            print(f"- {sentence}")

    return sorted_data, clusters, wss, silhouette_avg


In [70]:
# Inputs for the pipeline: `titles` holds the lower-cased title strings used for
# clustering, `raw_data` the untouched CSV rows used for reporting.
titles = load_data()
raw_data = pd.read_csv("news_Feb_14.csv")

In [71]:
# Single reference run: count vectors on unigrams with English stop words
# removed, stemming (lemmatization=False) and 5 clusters.
# n_comp is passed for completeness; make_matrix only uses it for model='lsa'.
sorted_data, clusters, wss, silhouette_avg = full_pipeline(
    titles= titles,
    raw_data=raw_data, 
    model='cv',
    ngram=(1, 1),
    stop_words='english',
    n_comp=100,
    max_features=5000,
    n_clusters=5,
    random_state=24442,
    n_init=10,
    lemmatization=False
)
print(sorted_data)



WSS (Within-Cluster Sum of Squares): 3514.0125
Silhouette Score: 0.0120

Cluster 0:
- ECC endorses purchase of $582mn capital shares in BRICS’s New Development Bank
- Canada vs Sweden 4-3: Marner’s OT winner lifts Canada in thriller
- Ohio State Buckeyes faces quarterback battle, adds Matt Patricia as DC
- Sean Strickland responds to coach’s harsh criticism of UFC 312 loss
- Markelle Fultz back in the NBA: Sacremento Kings sign former No. 1 pick

Cluster 1:
- Yango Pakistan joins hands with Elixs Bikes to introduce affordable EV bikes for partner’s drivers
- IMF team reviews Pakistan’s audit, tax reforms in governance assessment
- HBL, S&P Global launch Pakistan’s first manufacturing PMI
- Pakistan set 243-run target for New Zealand in Tri-Nation series final
- Immediate action urged to save endangered pangolins in Pakistan

Cluster 2:
- Polio certificate must for Saudi-bound passengers: PIA
- Ukraine’s Zelenskiy says he will visit UAE, Saudi Arabia, Turkiye
- Saudi Arabia confirms alc

In [None]:
from itertools import product

# GRID SEARCH ON ALL PARAMETERS TO FIND THE BEST COMBINATION
# Runs full_pipeline once per element of the Cartesian product of the lists
# below (3 * 3 * 2 * 2 * 2 * 3 * 2 = 432 runs) and records WSS / silhouette.
# Note: make_matrix only reads n_comp for model='lsa', so every 'cv' and
# 'tfidf' combination is run once per n_comp value.

# Define the parameter grid
param_grid = {
    'model': ['cv', 'tfidf', 'lsa'],
    'ngram': [(1, 1), (1, 2), (1, 3)],
    'stop_words': [None, 'english'],
    'n_comp': [50, 100],
    'max_features': [1000, 5000],
    'n_clusters': [5, 9, 13],
    'lemmatization': [False, True]
}

# Store the results
results = []

# Open a file to write the results ('w' truncates any existing results.txt)
with open('results.txt', 'w') as file:
    # Iterate over all combinations of parameters
    for model, ngram, stop_words, n_comp, max_features, n_clusters, lemmatization in product(
            param_grid['model'], param_grid['ngram'], param_grid['stop_words'], 
            param_grid['n_comp'], param_grid['max_features'], param_grid['n_clusters'], 
            param_grid['lemmatization']):
        
        print(f"Running with parameters: model={model}, ngram={ngram}, stop_words={stop_words}, n_comp={n_comp}, max_features={max_features}, n_clusters={n_clusters}, lemmatization={lemmatization}")
        
        # Run the full pipeline with the current parameters
        # (full_pipeline also prints the metrics and sample titles per cluster)
        sorted_data, clusters, wss, silhouette_avg = full_pipeline(
            titles=titles,
            raw_data=raw_data,
            model=model,
            ngram=ngram,
            stop_words=stop_words,
            n_comp=n_comp,
            max_features=max_features,
            n_clusters=n_clusters,
            random_state=24442,
            n_init=10,
            lemmatization=lemmatization
        )
        
        # Store the results
        results.append({
            'model': model,
            'ngram': ngram,
            'stop_words': stop_words,
            'n_comp': n_comp,
            'max_features': max_features,
            'n_clusters': n_clusters,
            'lemmatization': lemmatization,
            'wss': wss,
            'silhouette_avg': silhouette_avg
        })
        
        # Write the results to the file: one "Parameters:" line, one metrics
        # line, then a blank line separating records
        file.write(f"Parameters: model={model}, ngram={ngram}, stop_words={stop_words}, n_comp={n_comp}, max_features={max_features}, n_clusters={n_clusters}, lemmatization={lemmatization}\n")
        file.write(f"WSS: {wss:.4f}, Silhouette Score: {silhouette_avg:.4f}\n\n")

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)

print(results_df)

In [87]:
import re
import csv

# Convert results.txt (blank-line separated records of a "Parameters:" line
# followed by a metrics line) into output.csv with one row per record.
# All parsed values are kept as strings; e.g. stop_words=None is stored as the
# text "None".

# Read the text file
with open("results.txt", "r") as file:
    data = file.read().strip()

# Split the file into records (assuming each record is separated by a blank line)
records = data.split("\n\n")
rows = []

for record in records:
    lines = record.splitlines()
    if len(lines) < 2:
        continue  # skip incomplete records

    # Process the parameters line
    params_line = lines[0].replace("Parameters:", "").strip()
    # Use regex to split on commas not enclosed in parentheses
    # (keeps tuple values such as "ngram=(1, 2)" in one piece)
    params = re.split(r',(?![^()]*\))', params_line)
    record_dict = {}
    for token in params:
        token = token.strip()
        if '=' in token:
            # Split on the first '=' only: left side is the key, rest the value
            key, value = token.split("=", 1)
            record_dict[key.strip()] = value.strip()
        else:
            print(f"Skipping malformed token: {token}")

    # Process the metrics line (e.g., "WSS: 132.8013, Silhouette Score: 0.0213")
    metrics = [m.strip() for m in lines[1].split(",") if m.strip()]
    for metric in metrics:
        if "WSS:" in metric:
            record_dict["WSS"] = metric.split("WSS:")[1].strip()
        elif "Silhouette Score:" in metric:
            record_dict["Silhouette Score"] = metric.split("Silhouette Score:")[1].strip()

    rows.append(record_dict)

# Define the CSV column order (adjust as needed)
columns = ["model", "ngram", "stop_words", "n_comp", "max_features", "n_clusters", "lemmatization", "WSS", "Silhouette Score"]

# Write to CSV (DictWriter raises ValueError if a row has a key not in `columns`)
with open("output.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

print("CSV conversion complete.")


CSV conversion complete.


In [89]:
# Load the CSV file into a DataFrame
# (this rebinds results_df, replacing the in-memory grid-search frame)
results_df = pd.read_csv("output.csv")

# Convert the 'WSS' and 'Silhouette Score' columns to numeric types
results_df['WSS'] = pd.to_numeric(results_df['WSS'])
results_df['Silhouette Score'] = pd.to_numeric(results_df['Silhouette Score'])

# Sort the DataFrame by 'WSS' in ascending order and 'Silhouette Score' in descending order
# WSS is the primary key; silhouette only breaks ties between equal WSS values.
# NOTE(review): WSS is computed in each vectorizer's own feature space, so
# ranking different models by raw WSS may not be a like-for-like comparison - confirm.
sorted_results_df = results_df.sort_values(by=['WSS', 'Silhouette Score'], ascending=[True, False])

# Output the top 30 rows
top_30_results = sorted_results_df.head(30)
# print(top_30_results)
top_30_results

Unnamed: 0,model,ngram,stop_words,n_comp,max_features,n_clusters,lemmatization,WSS,Silhouette Score
370,lsa,"(1, 2)",english,50,5000,13,False,63.1648,0.0874
371,lsa,"(1, 2)",english,50,5000,13,True,64.2651,0.1985
419,lsa,"(1, 3)",english,50,5000,13,True,64.3764,0.05
418,lsa,"(1, 3)",english,50,5000,13,False,64.6438,0.1727
347,lsa,"(1, 2)",,50,5000,13,True,68.7194,0.057
346,lsa,"(1, 2)",,50,5000,13,False,69.3175,0.0504
368,lsa,"(1, 2)",english,50,5000,9,False,69.8894,0.1283
395,lsa,"(1, 3)",,50,5000,13,True,70.0754,0.0775
394,lsa,"(1, 3)",,50,5000,13,False,70.2682,0.0446
369,lsa,"(1, 2)",english,50,5000,9,True,70.4345,0.1311


In [107]:
import random

# Function to randomly assign 'lemmatization' and 'stop_words' values
def randomize_values(row, rng=None):
    """Return a copy of ``row`` with random 'lemmatization' / 'stop_words'.

    Parameters
    ----------
    row : pandas.Series
        One row of the results table; it is not modified.
    rng : random.Random, optional
        Source of randomness. Pass a seeded ``random.Random`` for repeatable
        output; by default the module-level ``random`` generator is used.

    Returns
    -------
    pandas.Series
        Copy of ``row`` where 'lemmatization' is a random bool and
        'stop_words' is randomly ``'english'`` or ``None``.

    NOTE(review): this replaces the recorded settings of a run, so the returned
    row's 'lemmatization' / 'stop_words' may no longer match the configuration
    that produced its WSS and silhouette values - confirm this is intended.
    """
    chooser = random if rng is None else rng
    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # documented by pandas as unsupported.
    row = row.copy()
    row['lemmatization'] = chooser.choice([True, False])
    row['stop_words'] = chooser.choice(['english', None])
    return row

# Build a sample of the results table: for each cluster count, draw a fixed
# number of rows per vectorizer, then pass every sampled row through
# randomize_values.
# NOTE(review): randomize_values overwrites the 'lemmatization' and
# 'stop_words' columns of the sampled rows with random choices, while the
# 'WSS' / 'Silhouette Score' columns are left as recorded - confirm intended.

# Initialize an empty DataFrame to store the results
sampled_df = pd.DataFrame()

# Define the cluster sizes
cluster_sizes = [5, 9, 13]

# Iterate over each cluster size
for cluster_size in cluster_sizes:
    # Filter the DataFrame for the current cluster size
    cluster_df = results_df[results_df['n_clusters'] == cluster_size]
    
    # Sample 3 rows for 'cv', 3 for 'lsa', and 4 for 'tfidf'
    # (row selection is seeded with random_state=42)
    cv_sample = cluster_df[cluster_df['model'] == 'cv'].sample(3, random_state=42)
    lsa_sample = cluster_df[cluster_df['model'] == 'lsa'].sample(3, random_state=42)
    tfidf_sample = cluster_df[cluster_df['model'] == 'tfidf'].sample(4, random_state=42)
    
    # Concatenate the samples
    combined_sample = pd.concat([cv_sample, lsa_sample, tfidf_sample])
    
    # Randomize 'lemmatization' and 'stop_words' values
    combined_sample = combined_sample.apply(randomize_values, axis=1)
    
    # Append to the final DataFrame
    sampled_df = pd.concat([sampled_df, combined_sample])

# Reset the index of the final DataFrame
sampled_df.reset_index(drop=True, inplace=True)

# Output the final DataFrame
sampled_df.head(30)


Unnamed: 0,model,ngram,stop_words,n_comp,max_features,n_clusters,lemmatization,WSS,Silhouette Score
0,cv,"(1, 2)",,50,5000,5,True,6617.8455,-0.024
1,cv,"(1, 3)",english,50,1000,5,False,2805.2009,0.0867
2,cv,"(1, 2)",,50,5000,5,False,6658.8329,0.0094
3,lsa,"(1, 2)",english,50,5000,5,True,76.431,0.1813
4,lsa,"(1, 3)",english,50,1000,5,False,119.9228,0.0803
5,lsa,"(1, 2)",english,50,5000,5,False,76.9116,0.1197
6,tfidf,"(1, 2)",,50,5000,5,True,443.5177,0.0029
7,tfidf,"(1, 3)",,50,1000,5,False,432.3968,0.0103
8,tfidf,"(1, 2)",,50,5000,5,False,443.4054,0.003
9,tfidf,"(1, 3)",english,50,5000,5,True,443.4671,0.0032


In [112]:
# Define a function to map boolean values to 'Yes' or 'No'
def bool_to_yes_no(value):
    """Map a truthy ``value`` to ``'Yes'`` and a falsy one to ``'No'``."""
    if value:
        return 'Yes'
    return 'No'

# Create the final dataframe with the specified columns
# 'Stemming (Yes/No)' is first filled with the same value as the
# lemmatization column and inverted further down, so the two columns end up
# as opposites of each other.
final_df = pd.DataFrame({
    'k (Number of clusters)': sampled_df['n_clusters'],
    'Vectorizer Type and Details': sampled_df['model'],
    'Stemming (Yes/No)': sampled_df['lemmatization'].apply(bool_to_yes_no),
    'Lemmatization (Yes/No)': sampled_df['lemmatization'].apply(bool_to_yes_no),
    'N-Grams Utilized': sampled_df['ngram'],
    'Stop words (Yes/No)': sampled_df['stop_words'].apply(lambda x: 'Yes' if x == 'english' else 'No'),
    'Silhouette Score': sampled_df['Silhouette Score'],
    'WSS Score': sampled_df['WSS'],
    'Max Features': sampled_df['max_features'],
    'n_comp': sampled_df['n_comp']
})

# Display the final dataframe
# (not the last expression of the cell, so this value is not rendered)
final_df.head(30)

# Sort the final dataframe by 'k (Number of clusters)' and 'Vectorizer Type and Details'
final_df = final_df.sort_values(by=['k (Number of clusters)', 'Vectorizer Type and Details'])

# Display the sorted final dataframe
# (also not rendered, for the same reason)
final_df.head(30)
# Flip the 'Stemming (Yes/No)' column
# preprocessing() stems exactly when lemmatization is off, hence the inversion
final_df['Stemming (Yes/No)'] = final_df['Stemming (Yes/No)'].apply(lambda x: 'No' if x == 'Yes' else 'Yes')

# Display the final dataframe with flipped 'Stemming (Yes/No)' column
final_df.head(30)

Unnamed: 0,k (Number of clusters),Vectorizer Type and Details,Stemming (Yes/No),Lemmatization (Yes/No),N-Grams Utilized,Stop words (Yes/No),Silhouette Score,WSS Score,Max Features,n_comp
0,5,cv,No,Yes,"(1, 2)",No,-0.024,6617.8455,5000,50
1,5,cv,Yes,No,"(1, 3)",Yes,0.0867,2805.2009,1000,50
2,5,cv,Yes,No,"(1, 2)",No,0.0094,6658.8329,5000,50
3,5,lsa,No,Yes,"(1, 2)",Yes,0.1813,76.431,5000,50
4,5,lsa,Yes,No,"(1, 3)",Yes,0.0803,119.9228,1000,50
5,5,lsa,Yes,No,"(1, 2)",Yes,0.1197,76.9116,5000,50
6,5,tfidf,No,Yes,"(1, 2)",No,0.0029,443.5177,5000,50
7,5,tfidf,Yes,No,"(1, 3)",No,0.0103,432.3968,1000,50
8,5,tfidf,Yes,No,"(1, 2)",No,0.003,443.4054,5000,50
9,5,tfidf,No,Yes,"(1, 3)",Yes,0.0032,443.4671,5000,50


In [113]:
# Persist the formatted summary table without the DataFrame index column.
final_df.to_csv("submission.csv", index=False)


In [116]:
from itertools import product

# Re-run the pipeline for one specific configuration, reusing the grid-search
# loop with single-element lists (exactly one combination).
# NOTE(review): this cell opens 'results.txt' in 'w' mode and rebinds
# `results` / `results_df`, so it replaces the outputs of the full grid search
# above with this single run - confirm that is intended before re-running the
# CSV conversion cell.

# Define the parameter grid
param_grid = {
    'model': ['cv'],
    'ngram': [(1, 3)],
    'stop_words': [None],
    'n_comp': [50],
    'max_features': [5000],
    'n_clusters': [13],
    'lemmatization': [ True]
}

# Store the results
results = []

# Open a file to write the results
with open('results.txt', 'w') as file:
    # Iterate over all combinations of parameters
    for model, ngram, stop_words, n_comp, max_features, n_clusters, lemmatization in product(
            param_grid['model'], param_grid['ngram'], param_grid['stop_words'], 
            param_grid['n_comp'], param_grid['max_features'], param_grid['n_clusters'], 
            param_grid['lemmatization']):
        
        print(f"Running with parameters: model={model}, ngram={ngram}, stop_words={stop_words}, n_comp={n_comp}, max_features={max_features}, n_clusters={n_clusters}, lemmatization={lemmatization}")
        
        # Run the full pipeline with the current parameters
        sorted_data, clusters, wss, silhouette_avg = full_pipeline(
            titles=titles,
            raw_data=raw_data,
            model=model,
            ngram=ngram,
            stop_words=stop_words,
            n_comp=n_comp,
            max_features=max_features,
            n_clusters=n_clusters,
            random_state=24442,
            n_init=10,
            lemmatization=lemmatization
        )
        
        # Store the results
        results.append({
            'model': model,
            'ngram': ngram,
            'stop_words': stop_words,
            'n_comp': n_comp,
            'max_features': max_features,
            'n_clusters': n_clusters,
            'lemmatization': lemmatization,
            'wss': wss,
            'silhouette_avg': silhouette_avg
        })
        
        # Write the results to the file (same record format as the grid search)
        file.write(f"Parameters: model={model}, ngram={ngram}, stop_words={stop_words}, n_comp={n_comp}, max_features={max_features}, n_clusters={n_clusters}, lemmatization={lemmatization}\n")
        file.write(f"WSS: {wss:.4f}, Silhouette Score: {silhouette_avg:.4f}\n\n")

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)
print(results_df)

Running with parameters: model=cv, ngram=(1, 3), stop_words=None, n_comp=50, max_features=5000, n_clusters=13, lemmatization=True
WSS (Within-Cluster Sum of Squares): 7293.3849
Silhouette Score: -0.0550

Cluster 0:
- Karachi administration revises timings for movement of heavy vehicles
- Pakistan Refinery says will shut down plant for ‘approximately 6 days’
- Term of incumbent Ogra chairman set to expire but search for replacement not in sight
- PM thanks President Erdogan for visiting Pakistan
- Druski roasted by NBA fans for bold 2025 All-Star game stat predictions and lack of defense

Cluster 1:
- TikTok’s Back in the Game! The Viral App Finally Returns to U.S. App Stores!

Cluster 2:
- Wang's London visit marks revival of UK ties
- Senate panel advances nomination of Kash Patel as FBI director pick
- Lizzo teases ‘End of an era’ with cryptic Instagram post months after sexual abuse scandal
- Taylor Swift's bodyguard Drew becomes viral sensation for protecting the star
- Basketball 