In [6]:
# Importing libraries

import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score

In [7]:
# Loading data

df = pd.read_csv('habibi_cleaned_trainset_.csv')
df = df.dropna()
df

Unnamed: 0,id,local_id,link,title,content,gold_label
0,1,1,https://urdu.arynews.tv/car-sales-in-pakistan/,پاکستان میں گاڑیوں کی فروخت میں بڑا اضافہ,ملکی آٹو سیکٹر س زبردست خبر آگئی پاکستان میں ...,Business
1,2,2,https://urdu.arynews.tv/gold-rates-in-pakistan-3/,پاکستان میں سون کی قیمت آج کتنی کم ہوئی,کراچی کاروباری ہفت ک پہل روز سون کی قیمت میں ک...,Business
2,3,5,https://urdu.arynews.tv/cotton-production-cott...,امریکا س معیاری روئی کی درآمد بڑھ گئی,کراچی پاکستان میں کپاس کی پیداوار میں کمی ک با...,Business
3,4,3,https://urdu.arynews.tv/psx-today-11-nov/,پاکستان اسٹاک ایکسچینج میں نئی تاریخ رقم,پاکستان اسٹاک ایکسچینج ن ایک اور سنگ میل عبور ...,Business
4,5,4,https://urdu.arynews.tv/ghee-and-cooking-oil-p...,عوام ک لی نئی مشکل گھی اور کوکنگ آئل کی قیمتوں...,لاہور گھی اور کوکنگ آئل کی قیمتوں میں ایک بار ...,Business
...,...,...,...,...,...,...
5836,5939,492,https://jang.com.pk/news/1015053,کراچی یونیورسٹی ن ہاکی فائنل جیت لیا,ہائرایجوکیشن کمیشن ایچ ای سی زون جی ہاکی چیمپئ...,Sports
5837,5940,495,https://jang.com.pk/news/1012389,قوم کو ایک بار پھر کرکٹ ن متحد کردیا,کئی سالوں ک بعد پہلی بار ایسا ہوا ہ کہ پاکستان...,Sports
5838,5941,496,https://jang.com.pk/news/1012388,جامعہ این ای ڈی میں فلڈ لائٹ کرکٹ گراؤنڈ کا اف...,رواں ماہ ہر جانب ورلڈ کپ کی گہما گہمی جاری ہ ب...,Sports
5839,5942,498,https://jang.com.pk/news/1012386,اسپورٹس مقابل,الفا ایجوکیشن نیٹ ورک پر جہاں نصابی سرگرمیاں ب...,Sports


In [8]:
# Preparing the Urdu strings

# Loading Urdu stopwords from the json file
with open('k_sw.json', 'r', encoding='utf-8') as file:
    urdu_stopwords = set(json.load(file).keys())

# Function to clean our Urdu sentences
def clean_content(text, stopwords):
    # Remove punctuation and numbers
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stopwords)
    # Convert to lowercase
    text = text.lower()
    return text

df['content'] = df['content'].apply(lambda x: clean_content(x, urdu_stopwords))

df.head()

Unnamed: 0,id,local_id,link,title,content,gold_label
0,1,1,https://urdu.arynews.tv/car-sales-in-pakistan/,پاکستان میں گاڑیوں کی فروخت میں بڑا اضافہ,ملکی آٹو سیکٹر س زبردست خبر آگئی پاکستان میں ...,Business
1,2,2,https://urdu.arynews.tv/gold-rates-in-pakistan-3/,پاکستان میں سون کی قیمت آج کتنی کم ہوئی,کراچی کاروباری ہفت پہل روز سون قیمت میں کمی کا...,Business
2,3,5,https://urdu.arynews.tv/cotton-production-cott...,امریکا س معیاری روئی کی درآمد بڑھ گئی,کراچی پاکستان میں کپاس پیداوار میں کمی باعث اس...,Business
3,4,3,https://urdu.arynews.tv/psx-today-11-nov/,پاکستان اسٹاک ایکسچینج میں نئی تاریخ رقم,پاکستان اسٹاک ایکسچینج ن سنگ میل عبور کر لیا ک...,Business
4,5,4,https://urdu.arynews.tv/ghee-and-cooking-oil-p...,عوام ک لی نئی مشکل گھی اور کوکنگ آئل کی قیمتوں...,لاہور گھی کوکنگ آئل قیمتوں میں بار اضافہ جس بع...,Business


In [9]:
"""
# 80/20 train test split
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42, stratify=df['gold_label'])

# 80/20 train validation split
train_data, validation_data = train_test_split(train_data, test_size=0.2, random_state=42, stratify=train_data['gold_label'])


print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(validation_data)}")
print(f"Test set size: {len(test_data)}")
"""

'\n# 80/20 train test split\ntrain_data, test_data = train_test_split(df, test_size=0.2, random_state=42, stratify=df[\'gold_label\'])\n\n# 80/20 train validation split\ntrain_data, validation_data = train_test_split(train_data, test_size=0.2, random_state=42, stratify=train_data[\'gold_label\'])\n\n\nprint(f"Training set size: {len(train_data)}")\nprint(f"Validation set size: {len(validation_data)}")\nprint(f"Test set size: {len(test_data)}")\n'

In [10]:
from collections import Counter
from scipy.sparse import csr_matrix
import numpy as np

def tokenize(text):
    """
    Tokenize the text into words, splitting by whitespace.
    """
    return text.split()

def generate_ngrams(text, n=1):
    """
    Generate n-grams from a given text.
    """
    tokens = tokenize(text)
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

def manual_count_vectorizer(corpus, ngram_range=(1, 1)):
    """
    Manually implements CountVectorizer for a given corpus.
    Returns a sparse matrix of counts and the feature names.
    """
    ngram_min, ngram_max = ngram_range
    vocab = Counter()

    # Generate n-grams for each document and update the vocabulary
    ngram_list_per_doc = []
    for doc in corpus:
        doc_ngrams = []
        for n in range(ngram_min, ngram_max + 1):
            doc_ngrams.extend(generate_ngrams(doc, n))
        ngram_list_per_doc.append(doc_ngrams)
        vocab.update(doc_ngrams)

    # Create a mapping from n-grams to indices
    feature_names = sorted(vocab.keys())
    feature_indices = {ngram: idx for idx, ngram in enumerate(feature_names)}

    # Build the sparse matrix
    rows, cols, data = [], [], []
    for row_idx, doc_ngrams in enumerate(ngram_list_per_doc):
        ngram_counts = Counter(doc_ngrams)
        for ngram, count in ngram_counts.items():
            if ngram in feature_indices:  # Ensure the n-gram exists in the vocabulary
                rows.append(row_idx)
                cols.append(feature_indices[ngram])
                data.append(count)

    # Create a sparse matrix
    X = csr_matrix((data, (rows, cols)), shape=(len(corpus), len(feature_names)), dtype=int)
    return X, feature_names

# Example Usage
corpus = df['content'].fillna('').tolist()  # Ensure no NaN values
X, feature_names = manual_count_vectorizer(corpus, ngram_range=(1, 1))  # Unigram example

print(f"Vocabulary size: {len(feature_names)}")
print(f"Sparse matrix shape: {X.shape}")


Vocabulary size: 50921
Sparse matrix shape: (5836, 50921)


In [11]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import csr_matrix
from collections import Counter

class SparseMultiLabelKNN:
    def __init__(self, k=3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """
        Store the training data and labels.
        """
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """
        Predict labels for the given test data using sparse matrix operations.
        """
        # Compute distances using sparse matrix operations
        distances = np.sqrt((-2 * X_test @ self.X_train.T).toarray() +
                            (self.X_train.multiply(self.X_train).sum(axis=1).T).A +
                            (X_test.multiply(X_test).sum(axis=1)).A)

        # Find indices of k nearest neighbors
        k_indices = np.argsort(distances, axis=1)[:, :self.k]

        # Gather labels of k nearest neighbors
        k_labels = np.array([self.y_train[indices] for indices in k_indices])

        # Perform majority voting for each test sample
        predictions = np.array([self._majority_vote(neighbors) for neighbors in k_labels])
        return predictions

    def _majority_vote(self, k_labels):
        """
        Perform majority voting for multi-label classification.
        """
        label_sums = np.sum(k_labels, axis=0)  # Sum binary values across k neighbors
        return (label_sums >= (self.k / 2)).astype(int)  # Threshold for majority vote

# Data Preparation
df['gold_label'] = df['gold_label'].str.split(',')  # Convert to list of labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['gold_label'])


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and Evaluate the Optimized KNN Classifier
knn = SparseMultiLabelKNN(k=3)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Evaluate Performance
print("KNN Classifier Accuracy with k=3:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_))


KNN Classifier Accuracy with k=3: 0.761986301369863

Classification Report:
                    precision    recall  f1-score   support

          Business       0.87      0.71      0.78       189
     Entertainment       0.92      0.76      0.83       235
     International       0.88      0.55      0.68       215
Science-Technology       0.72      0.85      0.78       250
            Sports       0.95      0.88      0.91       279

         micro avg       0.86      0.76      0.81      1168
         macro avg       0.87      0.75      0.80      1168
      weighted avg       0.87      0.76      0.80      1168
       samples avg       0.76      0.76      0.76      1168



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
