In [17]:
import pandas as pd
df = pd.read_csv('../data/khabaronline-recrawl.csv')
df = df.dropna()

In [18]:
from sentence_transformers import SentenceTransformer
import torch

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device=torch.device('mps'))

In [19]:
import concurrent.futures
import multiprocessing
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np

def calculate_similarity(labels, texts, label_embeddings=None, text_embeddings=None):
    if label_embeddings is None:
        label_embeddings = model.encode(labels,show_progress_bar=True)
    if text_embeddings is None:
        text_embeddings = model.encode(texts,True)
    scores = {}
    # return a dict with each label and its probability score
    # Something like this: [{'label1': 0.8, 'label2': 0.2, ...}, ...]
    with tqdm(total=len(texts), desc="Calculating similarity") as pbar:
        for i, text in enumerate(texts):
            text_embedding = text_embeddings[i].reshape(1, -1)
            similarity = cosine_similarity(text_embedding, label_embeddings)
            scores[text] = {}
            for j, label in enumerate(labels):
                scores[text][label] = similarity[0][j]
            pbar.update(1)
    return scores

def classify(labels, texts, label_embeddings=None, text_embeddings=None):
    scores = calculate_similarity(labels, texts, label_embeddings, text_embeddings)
    highest_labels = []
    for text in texts:
        highest_label = max(scores[text], key=scores[text].get)
        highest_labels.append(highest_label)
    return highest_labels
    

Topics

In [4]:
services = ['سیاسی','اقتصادی','اجتماعی']
# filter service column
df = df[df['service'].isin(['اخبار سیاسی','اخبار اقتصادی','اخبار اجتماعی'])]
# remove row if body column doesn't have persian characters
df = df[df['abstract'].str.contains(r'[\u0600-\u06FF]+')]
# Get rid of a certain term from all of the values of service column
df = df.replace({'service': {'اخبار ': ''}}, regex=True)

In [5]:
labels = ['سیاسی','اقتصادی','اجتماعی']
labels_sentence = ['خبری درباره سیاست','خبری درباره اقتصاد','خبری درباره اجتماع']
labels_embeddings = model.encode(labels, show_progress_bar=True)
labels_sentence_embeddings = model.encode(labels_sentence, show_progress_bar=True)
body_embeddings = model.encode(df['body'].values, show_progress_bar=True)
df['service_category'] = classify(labels, df['body'].values, labels_embeddings, body_embeddings)
df['service_category_sent'] = classify(labels_sentence, df['body'].values, labels_sentence_embeddings, body_embeddings)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/430 [00:00<?, ?it/s]

In [10]:
service_map = {'خبری درباره سیاست': 'سیاسی', 'خبری درباره اقتصاد': 'اقتصادی', 'خبری درباره اجتماع': 'اجتماعی'}
df['service_category_sent'] = df['service_category_sent'].map(service_map)

In [16]:
# confusion matrix plot
y_test = df['service']

from sklearn.metrics import classification_report

print(classification_report(y_test, df['service_category_sent'], target_names=labels))
print(classification_report(y_test, df['service_category'], target_names=labels))

              precision    recall  f1-score   support

       سیاسی       0.35      0.58      0.44      2343
     اقتصادی       0.82      0.83      0.83      4462
     اجتماعی       0.81      0.63      0.71      6943

    accuracy                           0.69     13748
   macro avg       0.66      0.68      0.66     13748
weighted avg       0.74      0.69      0.70     13748

              precision    recall  f1-score   support

       سیاسی       0.44      0.31      0.37      2343
     اقتصادی       0.87      0.78      0.82      4462
     اجتماعی       0.74      0.86      0.79      6943

    accuracy                           0.74     13748
   macro avg       0.68      0.65      0.66     13748
weighted avg       0.73      0.74      0.73     13748

