In [1]:
!pip install datasets matplotlib pandas numpy seaborn scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting numpy
  Downloading numpy-2.2.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=

In [2]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
newspapers = ['lf_all', 'aal_all', 'od_all_clean', 'thi_all', 'vib_all']
path_root = '../../../DATA/NEWSPAPERS/article_embs/embeddings_e5/'

## Part I: finding the best classifier, predicting unlabeled data for manual evaluation

In [5]:
ds = Dataset.load_from_disk(f'{path_root}{newspapers[0]}')
df = ds.to_pandas()

df.columns

Index(['article_id', 'date', 'embedding', 'n_chunks_orig', 'clean_category',
       'nøgle', 'text', 'category', 'article_length', 'characters'],
      dtype='object')

In [7]:
df.shape

(85489, 10)

In [6]:
# Check the shape of each embedding
df['embedding_shape'] = df['embedding'].apply(lambda x: np.array(x).shape)
expected_dim = df['embedding_shape'].max()[0]
df = df[df['embedding'].apply(lambda x: np.array(x).shape == (expected_dim,))].copy()
df.shape

(85092, 11)

In [9]:
df.groupby('clean_category')['clean_category'].count()

clean_category
Bekjendtgjørelser    35817
Jndenlandsk          25184
Udenlandsk           24091
Name: clean_category, dtype: int64

In [10]:
# Define the number of samples per class (adjust based on dataset size)
n_samples_per_class = 24000  # Change as needed

# Create a balanced dataset by sampling an equal number of instances per class
df_balanced = df.groupby('clean_category', group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), n_samples_per_class), random_state=42)
)

# Split the balanced data into train and test sets with stratification
train_df, test_df = train_test_split(
    df_balanced, 
    test_size=0.2, 
    random_state=42, 
    stratify=df_balanced['clean_category']
)

  df_balanced = df.groupby('clean_category', group_keys=False).apply(


### Embeddings as features

In [11]:
# Prepare training and test features/labels
X_train = np.vstack(train_df['embedding'].values)
y_train = train_df['clean_category'].values

X_test = np.vstack(test_df['embedding'].values)
y_test = test_df['clean_category'].values

# Instantiate the Logistic Regression classifier
clf_embs = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)

# Train the classifier on the labeled training data
print(f'Train classifier on embeddings')
clf_embs.fit(X_train, y_train)

# Evaluate on the test set
predictions = clf_embs.predict(X_test)
print(classification_report(y_test, predictions))

Train classifier on embeddings
                   precision    recall  f1-score   support

Bekjendtgjørelser       0.91      0.94      0.93      4800
      Jndenlandsk       0.84      0.85      0.84      4800
       Udenlandsk       0.91      0.88      0.89      4800

         accuracy                           0.89     14400
        macro avg       0.89      0.89      0.89     14400
     weighted avg       0.89      0.89      0.89     14400



In [None]:
# Prepare training and test features/labels
X_train = np.vstack(train_df['embedding'].values)
y_train = train_df['clean_category'].values

X_test = np.vstack(test_df['embedding'].values)
y_test = test_df['clean_category'].values

# Instantiate the Logistic Regression classifier
clf_embs = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)

# Train the classifier on the labeled training data
print(f'Train classifier on embeddings')
clf_embs.fit(X_train, y_train)

# Evaluate on the test set
predictions = clf_embs.predict(X_test)
print(classification_report(y_test, predictions))

Train classifier on embeddings
                   precision    recall  f1-score   support

Bekjendtgjørelser       0.92      0.95      0.94      4800
      Jndenlandsk       0.85      0.85      0.85      4800
       Udenlandsk       0.90      0.87      0.88      4800

         accuracy                           0.89     14400
        macro avg       0.89      0.89      0.89     14400
     weighted avg       0.89      0.89      0.89     14400



### TF-IDF

In [38]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

# Fit on training data and transform both train and test sets
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

# Prepare labels
y_train = train_df['clean_category'].values
y_test = test_df['clean_category'].values

# Instantiate the Logistic Regression classifier
clf_tfidf = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)

# Train the classifier on the TF-IDF features
print(f'Train classifier on TF-IDF features')
clf_tfidf.fit(X_train, y_train)

# Evaluate on the test set
predictions = clf_tfidf.predict(X_test)
print(classification_report(y_test, predictions))

Train classifier on TF-IDF features
                   precision    recall  f1-score   support

Bekjendtgjørelser       0.94      0.96      0.95      4800
      Jndenlandsk       0.88      0.86      0.87      4800
       Udenlandsk       0.90      0.91      0.91      4800

         accuracy                           0.91     14400
        macro avg       0.91      0.91      0.91     14400
     weighted avg       0.91      0.91      0.91     14400



### Get unlabeled articles

In [None]:
# Create final_df with all unlabeled articles
dfs = []

for i in range(1,5):
    ds = Dataset.load_from_disk(f'{path_root}{newspapers[i]}')
    df_np = ds.to_pandas()
    dfs.append(df_np)

final_df = pd.concat(dfs, ignore_index=True)

# Check the shape of each embedding
final_df['embedding_shape'] = final_df['embedding'].apply(lambda x: np.array(x).shape)
expected_dim = final_df['embedding_shape'].max()[0]
final_df = final_df[final_df['embedding'].apply(lambda x: np.array(x).shape == (expected_dim,))].copy()

# Add column with name newspaper
final_df['newspaper'] = final_df['article_id'].str.extract(r'^(.*?)_')

final_df.shape

(599929, 11)

In [None]:
# Create a balanced sample for testing and manually evaluating
df_sampled = final_df.groupby('newspaper', group_keys=False).apply(lambda x: x.sample(n=200, random_state=42))

  df_sampled = final_df.groupby('newspaper', group_keys=False).apply(lambda x: x.sample(n=200, random_state=42))


In [45]:
# Transform the new unlabeled data using the trained TF-IDF vectorizer
X_test_tfidf = vectorizer.transform(df_sampled['text'])

# Predict categories for the new data based on tf_idf
df_sampled['predicted_category_tf_idf'] = clf_tfidf.predict(X_test_tfidf)

X_test_embs = np.vstack(df_sampled['embedding'].values)

# Predict categories for the new data based on embeddings
df_sampled['predicted_category_embs'] = clf_embs.predict(X_test_embs)

# Save results
df_sampled[['article_id', 'date', 'predicted_category_tf_idf', 'predicted_category_embs', 'category', 'text']].to_csv('../results/predicted_sample_tfidf_embeddings.csv')

In [46]:
df_disagree = df_sampled[df_sampled['predicted_category_embs'] != df_sampled['predicted_category_tf_idf']]

In [47]:
df_disagree.shape

(117, 14)

## Part II: using the best classifier to predict all unlabeled articles

In [4]:
# Create final_df with all unlabeled articles
dfs = []

for i in range(0,5):
    ds = Dataset.load_from_disk(f'{path_root}{newspapers[i]}')
    df_np = ds.to_pandas()
    dfs.append(df_np)

final_df = pd.concat(dfs, ignore_index=True)

# Check the shape of each embedding
final_df['embedding_shape'] = final_df['embedding'].apply(lambda x: np.array(x).shape)
expected_dim = final_df['embedding_shape'].max()[0]
final_df = final_df[final_df['embedding'].apply(lambda x: np.array(x).shape == (expected_dim,))].copy()

# Add column with name newspaper
final_df['newspaper'] = final_df['article_id'].str.extract(r'^(.*?)_')

final_df.shape

(685021, 12)

In [15]:
final_df.head()

Unnamed: 0,article_id,date,embedding,n_chunks_orig,clean_category,nøgle,text,category,article_length,characters,embedding_shape,newspaper
0,lol_000001,1809-03-07,"[0.015812713303603232, 0.0059686582535505295, ...",4,Jndenlandsk,1809-03-07_5,Jndenlandsk. Helsingøer den 26de Februar. J Fo...,Jndenlandsk,298,1592,"(1024,)",lol
1,lol_000002,1809-03-07,"[0.019437216222286224, 0.013088210485875607, -...",1,Jndenlandsk,1809-03-07_8,Kjøbenhavn den 27 Februarii. Kornpriserne i Kb...,Jndenlandsk,81,513,"(1024,)",lol
2,lol_000003,1809-03-07,"[0.006957924459129572, 0.006066289730370045, -...",1,Jndenlandsk,1809-03-07_9,De danske Officerer af Linieskibet Prinds Chri...,Jndenlandsk,31,205,"(1024,)",lol
3,lol_000004,1809-03-07,"[0.044964227825403214, 0.006043573841452599, -...",1,Jndenlandsk,1809-03-07_10,Caffe koster nu paa det Vestindiske Compagnie ...,Jndenlandsk,48,257,"(1024,)",lol
4,lol_000005,1809-03-07,"[0.031310804188251495, 0.01853332109749317, -0...",1,Jndenlandsk,1809-03-07_11,"For kort Tid siden bleve de, der boe ved Veste...",Jndenlandsk,80,445,"(1024,)",lol


In [5]:
final_df.groupby('clean_category')['clean_category'].count()

clean_category
-1                   599929
Bekjendtgjørelser     35817
Jndenlandsk           25184
Udenlandsk            24091
Name: clean_category, dtype: int64

In [6]:
gold_set = pd.read_csv('../results/predicted_sample_tfidf_embeddings(1).csv', index_col=0, sep=';')
gold_set.head()

Unnamed: 0,article_id,date,predicted_category_tf_idf,evaluation_tf_idf,predicted_category_embs,evaluation_embs,category,true_label,bog_teater,text
55097,aal_055098,1826-08-17,Jndenlandsk,t,Jndenlandsk,t,Kjøbenhavn.,,,"J Nærheden heraf er opkommet en Jordbrand, som..."
142457,aal_142458,1840-12-05,Jndenlandsk,t,Bekjendtgjørelser,t,Bekjendtgjørelser,,,"Ved Auctionen, Torsdagen den 10de dennes, i St..."
70491,aal_070492,1829-03-04,Udenlandsk,t,Udenlandsk,t,Blandinger,,,"En Kok i Paris har taget Livet af sig, fordi m..."
146810,aal_146811,1841-08-10,Udenlandsk,t,Udenlandsk,t,Nyeste Post-Efterretninger,,,"Det hollandske Blad, Tolk der Vryheid, med hvi..."
155768,aal_155769,1842-12-07,Jndenlandsk,t,Jndenlandsk,t,Fædrelandet,,,"J Sagen: Postholder Prahl, for, imod ProviHarm..."


In [7]:
gold_set = gold_set.applymap(lambda x: x.strip() if isinstance(x, str) else x)

gold_set["true_label"] = gold_set.apply(
    lambda row: row["predicted_category_tf_idf"] if row["evaluation_tf_idf"] == "t" else
                (row["predicted_category_embs"] if row["evaluation_embs"] == "t" else row["true_label"]),
    axis=1
)

  gold_set = gold_set.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [8]:
# Merge final_df with df_gold_set on 'article_id' to get updated values
final_df = final_df.merge(gold_set[['article_id', 'true_label']], on='article_id', how='left')

# Overwrite 'clean_category' with 'true_label' where available
final_df['clean_category'] = final_df['true_label'].fillna(final_df['clean_category'])

# Drop the temporary 'true_label' column
final_df.drop(columns=['true_label'], inplace=True)

In [9]:
final_df['label_type'] = final_df['clean_category'].apply(lambda x: 'predicted' if x == -1 else 'gold')

In [10]:
training_df = final_df[final_df['label_type'] == 'gold']
training_df.shape

(85882, 13)

In [11]:
# Identify the subset where 'newspaper' == 'lol'
lol_df = training_df[training_df['newspaper'] == 'lol']

# Sample 200 random examples for each value of 'clean_category'
lol_balanced = lol_df.groupby('clean_category', group_keys=False).apply(
    lambda x: x.sample(n=400, replace=True) if len(x) >= 400 else x
).reset_index(drop=True)

# Keep the other newspapers as they are
other_newspapers_df = training_df[training_df['newspaper'] != 'lol']

# Combine the balanced 'lol' subset with the other newspapers
filtered_df = pd.concat([lol_balanced, other_newspapers_df], ignore_index=True)

  lol_balanced = lol_df.groupby('clean_category', group_keys=False).apply(


### Train final classifier

In [12]:
# Split the balanced data into train and test sets with stratification
train_df, test_df = train_test_split(
    filtered_df, 
    test_size=0.2, 
    random_state=42, 
    stratify=filtered_df['clean_category']
)

In [13]:
# Prepare training and test features/labels
X_train = np.vstack(train_df['embedding'].values)
y_train = train_df['clean_category'].values

X_test = np.vstack(test_df['embedding'].values)
y_test = test_df['clean_category'].values

# Instantiate the Logistic Regression classifier
clf_embs = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)

# Train the classifier on the labeled training data
print(f'Train classifier on embeddings')
clf_embs.fit(X_train, y_train)

# Evaluate on the test set
predictions = clf_embs.predict(X_test)
print(classification_report(y_test, predictions))

Train classifier on embeddings


                   precision    recall  f1-score   support

Bekjendtgjørelser       0.86      0.88      0.87       136
      Jndenlandsk       0.75      0.76      0.75       136
       Udenlandsk       0.88      0.85      0.86       126

         accuracy                           0.83       398
        macro avg       0.83      0.83      0.83       398
     weighted avg       0.83      0.83      0.83       398



In [14]:
pred_df = final_df[final_df['label_type'] == 'predicted']
pred_df.shape

(599139, 13)

In [15]:
X_test_embs = np.vstack(pred_df['embedding'].values)

# Predict categories for the unlabeled articles based on embeddings
pred_df['predicted_category_embs'] = clf_embs.predict(X_test_embs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df['predicted_category_embs'] = clf_embs.predict(X_test_embs)


In [16]:
# Merge final_df with pred_df on 'article_id' to get updated values
final_df = final_df.merge(pred_df[['article_id', 'predicted_category_embs']], on='article_id', how='left')

# Overwrite 'clean_category' with 'predicted_category_embs' where available
final_df['clean_category'] = final_df['predicted_category_embs'].fillna(final_df['clean_category'])

# Drop the temporary 'true_label' column
final_df.drop(columns=['predicted_category_embs'], inplace=True)

In [17]:
final_df.shape

(685021, 13)

### Save articles and their category as parquet

In [18]:
final_df.to_parquet("../../../DATA/NEWSPAPERS/final_df_e5.parquet", engine="pyarrow", index=False)

In [None]:
# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(final_df, preserve_index=False)

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67e12a5d-0752c99e1acd1f3e2dae440f;78524486-b840-42f6-9082-e49be6cfbc3d)

Invalid username or password.

In [20]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
# Push to Hugging Face as a dataset
dataset.push_to_hub("awlassche/periphery-aviser-e5", private=True)

Uploading the dataset shards:   0%|          | 0/13 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/awlassche/periphery-aviser-e5/commit/6e8115f1251e777d3d9bed3967834ec3171aed4c', commit_message='Upload dataset', commit_description='', oid='6e8115f1251e777d3d9bed3967834ec3171aed4c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/awlassche/periphery-aviser-e5', endpoint='https://huggingface.co', repo_type='dataset', repo_id='awlassche/periphery-aviser-e5'), pr_revision=None, pr_num=None)