In [1]:
!pip install pandas numpy matplotlib seaborn nltk tqdm datasets scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.57.0-cp312

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from tqdm import tqdm
from datasets import load_dataset

In [3]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Load dataset

In [4]:
# Authenticate against the Hugging Face Hub before the dataset is loaded.
# Called without arguments, login() is interactive: in this notebook it rendered a widget.
# NOTE(review): presumably needed because the dataset repository is gated or private — confirm.
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Load the train split of the newspaper corpus from the Hugging Face Hub.
# The first run downloads 16 parquet shards (roughly 0.4 GB each according to the progress log).
dataset = load_dataset("chcaa/periphery-aviser-e5", split='train')

# Convert to a pandas DataFrame for the rest of the notebook; the bare last expression
# displays (rows, columns) — (866977, 13) in the recorded run.
df = dataset.to_pandas()
df.shape

README.md:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading data:   0%|          | 0/16 [00:00<?, ?files/s]

train-00000-of-00016.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00001-of-00016.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00002-of-00016.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00003-of-00016.parquet:   0%|          | 0.00/400M [00:00<?, ?B/s]

train-00004-of-00016.parquet:   0%|          | 0.00/403M [00:00<?, ?B/s]

train-00005-of-00016.parquet:   0%|          | 0.00/396M [00:00<?, ?B/s]

train-00006-of-00016.parquet:   0%|          | 0.00/407M [00:00<?, ?B/s]

train-00007-of-00016.parquet:   0%|          | 0.00/405M [00:00<?, ?B/s]

train-00008-of-00016.parquet:   0%|          | 0.00/398M [00:00<?, ?B/s]

train-00009-of-00016.parquet:   0%|          | 0.00/388M [00:00<?, ?B/s]

train-00010-of-00016.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00011-of-00016.parquet:   0%|          | 0.00/387M [00:00<?, ?B/s]

train-00012-of-00016.parquet:   0%|          | 0.00/395M [00:00<?, ?B/s]

train-00013-of-00016.parquet:   0%|          | 0.00/389M [00:00<?, ?B/s]

train-00014-of-00016.parquet:   0%|          | 0.00/387M [00:00<?, ?B/s]

train-00015-of-00016.parquet:   0%|          | 0.00/395M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/866977 [00:00<?, ? examples/s]

(866977, 13)

### Load annotated books subset

In [6]:
# Manually annotated subset: one row per article_id with a y/n 'book_announce' label and an
# optional free-text 'comment'. The first CSV column is used as the index.
gold_csv_path = '../../newspaper_temp_files/annotated_books_gold.csv'
book_gold = pd.read_csv(gold_csv_path, index_col=0)

# Preview the first annotations.
book_gold.head()

Unnamed: 0,article_id,book_announce,comment
264863,ode_023787,y,
151177,aal_066100,n,mixed
379019,ode_138229,n,
483748,thi_040569,n,mixed
369240,ode_128401,n,


In [7]:
# Attach the gold annotations to the full corpus (a left join keeps every article).
# Columns left over from a previous run of this cell are dropped first; otherwise a second run
# would create suffixed duplicates (book_announce_x / book_announce_y) and the fillna below
# would fail with a KeyError.
gold_columns = [col for col in book_gold.columns if col != 'article_id']
n_articles = len(df)
df = df.drop(columns=gold_columns, errors='ignore').merge(book_gold, on='article_id', how='left')

# A left join can only grow when article_id is duplicated in book_gold; fail loudly instead of
# silently duplicating articles.
assert len(df) == n_articles, 'book_gold contains duplicate article_id values'

# Articles without a gold annotation get the placeholder label 'unknown'.
df['book_announce'] = df['book_announce'].fillna('unknown')

In [8]:
# Sanity check: number of articles per label ('y' / 'n' from the gold file, 'unknown' for the rest).
df.groupby('book_announce')['book_announce'].count()

book_announce
n             546
unknown    866177
y             254
Name: book_announce, dtype: int64

### Classifier with unlabeled announcements and book announcements

In [None]:
# Build a two-class sample of equal size: unlabeled announcements versus gold book announcements.
sample_size = 230

# "Negative" class: articles in the announcements category that carry no gold label.
# These are unlabeled rather than verified negatives, so some may in fact be book announcements.
in_announcements = df['clean_category'] == 'Bekjendtgjørelser'
is_unlabeled = df['book_announce'] == 'unknown'
unlabeled_df = df[is_unlabeled & in_announcements]
random_sample_unlabeled = unlabeled_df.sample(n=sample_size, random_state=42)

# Positive class: articles annotated as book announcements.
random_sample_books = df[df['book_announce'] == 'y'].sample(n=sample_size, random_state=42)

# Stack both samples; the bare last expression displays the resulting (rows, columns).
merged_sample = pd.concat([random_sample_unlabeled, random_sample_books])
merged_sample.shape

In [23]:
# Hold out 30% of the sampled articles; stratifying on the label keeps the class
# proportions the same in the train and test parts.
train_df, test_df = train_test_split(
    merged_sample,
    stratify=merged_sample['book_announce'],
    test_size=0.3,
    random_state=42,
)

# Stack the per-article embedding vectors into one 2-D feature matrix per split.
X_train, X_test = (np.vstack(part['embedding'].values) for part in (train_df, test_df))

# Matching label arrays ('unknown' vs 'y').
y_train, y_test = (part['book_announce'].values for part in (train_df, test_df))

# Logistic regression on the embeddings.
clf_embs = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)

print('Train classifier on embeddings')
clf_embs.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out part.
predictions = clf_embs.predict(X_test)
print(classification_report(y_test, predictions))

Train classifier on embeddings
              precision    recall  f1-score   support

     unknown       0.98      0.93      0.96        69
           y       0.93      0.99      0.96        69

    accuracy                           0.96       138
   macro avg       0.96      0.96      0.96       138
weighted avg       0.96      0.96      0.96       138



### Classifier with non-book announcements and book announcements

In [9]:
# Upper bound on the number of articles drawn per class (adjust based on dataset size).
n_samples_per_class = 254  # Change as needed

# Keep only gold-annotated articles ('y' / 'n'); 'unknown' marks articles without annotation.
df_books = df[df['book_announce'] != 'unknown']

# Draw up to n_samples_per_class rows from every class; a class smaller than the cap
# contributes all of its rows. Each group is sampled explicitly and the parts concatenated
# instead of using groupby(...).apply(...): pandas deprecates apply operating on the grouping
# column, and once that column is excluded from the groups the stratified split below would
# no longer find 'book_announce'. The per-class seed is unchanged, so the same rows are drawn.
df_balanced = pd.concat([
    class_rows.sample(n=min(len(class_rows), n_samples_per_class), random_state=42)
    for _, class_rows in df_books.groupby('book_announce')
])

# Split the balanced data into train and test sets with stratification
train_df, test_df = train_test_split(
    df_balanced,
    test_size=0.3,
    random_state=42,
    stratify=df_balanced['book_announce']
)

  df_balanced = df_books.groupby('book_announce', group_keys=False).apply(


In [10]:
# Labels for both splits ('n' vs 'y').
y_train = train_df['book_announce'].values
y_test = test_df['book_announce'].values

# Features: the per-article embedding vectors stacked into 2-D matrices.
X_train = np.vstack(train_df['embedding'].values)
X_test = np.vstack(test_df['embedding'].values)

# Logistic regression on the embeddings. In top-to-bottom order this is the last assignment
# to clf_embs, i.e. the model the labelling step further down uses.
clf_embs = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)

print('Train classifier on embeddings')
clf_embs.fit(X_train, y_train)

# Report per-class precision / recall / F1 on the held-out gold articles.
predictions = clf_embs.predict(X_test)
print(classification_report(y_test, predictions))

Train classifier on embeddings
              precision    recall  f1-score   support

           n       0.85      0.87      0.86        77
           y       0.86      0.84      0.85        76

    accuracy                           0.86       153
   macro avg       0.86      0.86      0.86       153
weighted avg       0.86      0.86      0.86       153



In [11]:
# Lexical baseline on the same train/test split: TF-IDF features instead of embeddings.
# The vocabulary is capped at 5000 terms (adjust max_features as needed).
vectorizer = TfidfVectorizer(max_features=5000)

# Labels are the same as for the embedding model.
y_train, y_test = train_df['book_announce'].values, test_df['book_announce'].values

# Fit the vocabulary and IDF weights on the training texts only, then reuse them for the
# test texts so no information from the test set leaks into the features.
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

# Same classifier configuration as the embedding model, for a like-for-like comparison.
clf_tfidf = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)

print('Train classifier on TF-IDF features')
clf_tfidf.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out gold articles.
predictions = clf_tfidf.predict(X_test)
print(classification_report(y_test, predictions))

Train classifier on TF-IDF features
              precision    recall  f1-score   support

           n       0.89      0.81      0.84        77
           y       0.82      0.89      0.86        76

    accuracy                           0.85       153
   macro avg       0.85      0.85      0.85       153
weighted avg       0.85      0.85      0.85       153



### Label unlabeled articles as book announcement

In [13]:
# Candidates for automatic labelling: articles in the announcements category that have
# no gold book-announcement label yet.
candidate_mask = (df['book_announce'] == 'unknown') & (df['clean_category'] == 'Bekjendtgjørelser')
pred_df = df[candidate_mask]
print(pred_df.shape)

# Drop boilerplate: the 188 most frequently repeated texts. The cut-off of 188 is hard-coded;
# per the original note these are the texts that occur five times or more.
text_frequencies = pred_df.groupby('text')['text'].count().sort_values(ascending=False)
top_188_texts = text_frequencies.head(188).index
is_boilerplate = pred_df['text'].isin(top_188_texts)
pred_df = pred_df[~is_boilerplate]

# Drop lottery results, recognised by their fixed opening phrase.
is_lottery_result = pred_df['text'].str.contains(r'Ved Tallotteriets \S+ Trækning i', regex=True)
pred_df = pred_df[~is_lottery_result]

# Keep only articles of 70 to 500 characters (both bounds inclusive).
long_enough = pred_df['characters'] >= 70
short_enough = pred_df['characters'] <= 500
pred_df = pred_df[long_enough & short_enough]
print(pred_df.shape)

(336210, 15)
(182237, 15)


In [14]:
# Stack the candidate articles' embedding vectors into one 2-D feature matrix.
X_test_embs = np.vstack(pred_df['embedding'].values)

# Label every candidate with the embeddings classifier and store the result next to the article.
predicted_labels = clf_embs.predict(X_test_embs)
pred_df['predicted_book_announce'] = predicted_labels

In [15]:
# How many candidate articles the classifier assigned to each predicted label.
pred_df['predicted_book_announce'].value_counts()

predicted_book_announce
n    167891
y     14346
Name: count, dtype: int64

In [16]:
# For comparison: label distribution in the full corpus (gold 'y' / 'n' versus 'unknown').
df['book_announce'].value_counts()

book_announce
unknown    866177
n             546
y             254
Name: count, dtype: int64

In [17]:
# Final set of book announcements: classifier-predicted positives followed by the gold positives.
# The gold rows come from df, which has no 'predicted_book_announce' column, so that column is
# missing (NaN) for them after the concat.
predicted_positive = pred_df[pred_df['predicted_book_announce'] == 'y']
gold_positive = df[df['book_announce'] == 'y']
book_announces = pd.concat([predicted_positive, gold_positive])
book_announces.shape

(14600, 16)

In [18]:
# Persist the combined (predicted + gold) book announcements; the file name carries a date stamp.
# NOTE(review): the frame still contains the 'embedding' column, whose cells are array-like
# (they are stacked with np.vstack earlier). CSV stores such cells as their string form, which
# may be abbreviated for long vectors — confirm downstream readers can parse it, or drop the
# column / use a typed format such as parquet.
book_announces.to_csv('../data/book_announces_250422.csv')