In [None]:
import os
import re

import numpy as np
import pandas as pd
from lxml import etree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed, to_categorical
from tqdm import tqdm

# === Paths ===
# Every location below is derived from the dataset root, so pointing the
# notebook at another copy of the data only requires changing DATA_DIR.
DATA_DIR = "/kaggle/input/make-data-count-finding-data-references"
TRAIN_LABELS = DATA_DIR + "/train_labels.csv"
TRAIN_DIR = DATA_DIR + "/train"
TEST_DIR = DATA_DIR + "/test"

# === Load Labels ===
# Label table; later code reads its `article_id` and `type` columns.
labels = pd.read_csv(filepath_or_buffer=TRAIN_LABELS)

# === Parse XML ===
def parse_xml(xml_path):
    """Return the text content of an XML file as a single string.

    Parameters
    ----------
    xml_path : str or path-like
        Location of the XML document to read.

    Returns
    -------
    str
        All text nodes selected by the XPath ``//text()``, joined with a
        single space. An empty string is returned when the file cannot be
        read or lxml fails to parse/query it, so callers can treat
        "no usable text" uniformly.
    """
    try:
        tree = etree.parse(xml_path)
        return ' '.join(tree.xpath('//text()'))
    except (OSError, etree.LxmlError):
        # Best-effort extraction: unreadable or malformed documents are
        # skipped rather than aborting the run. Only I/O and lxml errors are
        # caught, so KeyboardInterrupt and genuine programming errors still
        # surface instead of being silently turned into "".
        return ""

# === Extract Text ===
# Lazily built lookup tables used by extract_text:
#   base_dir -> {XML file name: full path of its first occurrence under base_dir}
_XML_PATHS_BY_DIR = {}


def extract_text(row, base_dir):
    """Return the text of the XML file that belongs to ``row['article_id']``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable with ``'article_id'`` (a DataFrame row or a dict).
    base_dir : str
        Directory searched recursively for ``<article_id>.xml``.

    Returns
    -------
    str
        The parsed text of the first matching file in ``os.walk`` order, or
        ``""`` when no file matches or the parsed text is empty/whitespace.

    Notes
    -----
    The directory tree is walked once per ``base_dir`` and the resulting
    file index is reused on later calls, instead of re-walking the whole tree
    for every row. Files added to ``base_dir`` after the first call are
    therefore not seen until this definition is re-executed.
    """
    index = _XML_PATHS_BY_DIR.get(base_dir)
    if index is None:
        index = {}
        for root, _, files in os.walk(base_dir):
            for file in files:
                if file.endswith(".xml"):
                    # setdefault keeps the first hit, matching the original
                    # "return on first match" behaviour for duplicate names.
                    index.setdefault(file, os.path.join(root, file))
        _XML_PATHS_BY_DIR[base_dir] = index

    xml_path = index.get(f"{row['article_id']}.xml")
    if xml_path is None:
        return ""

    text = parse_xml(xml_path)
    return text if isinstance(text, str) and text.strip() else ""

# === Filter for Available XMLs ===
# Ids of every article that has an XML file somewhere under TRAIN_DIR.
# splitext removes only the trailing extension; str.replace would also strip
# any ".xml" occurring earlier in the file name and produce an id that no
# longer matches the file on disk.
available_xmls = {
    os.path.splitext(file)[0]
    for _, _, files in os.walk(TRAIN_DIR)
    for file in files
    if file.endswith(".xml")
}

print(f"Available XML files: {len(available_xmls)}")
# Ids are compared as strings because they come from file names.
labels['article_id'] = labels['article_id'].astype(str)
# .copy() makes the filtered frame independent of the original so that the
# column assignments that follow do not raise SettingWithCopyWarning.
labels = labels[labels['article_id'].isin(available_xmls)].copy()
# The table has one row per label, and an article can have several, so the
# row count and the article count are reported separately.
print(
    f"Found {len(labels)} label rows across "
    f"{labels['article_id'].nunique()} articles with matching XMLs."
)

# === Extract Text from XMLs ===
# Several label rows can point at the same article, so each article is parsed
# once and the result is mapped back onto the rows instead of re-reading the
# same XML file for every row.
article_texts = {
    article_id: extract_text({'article_id': article_id}, TRAIN_DIR)
    for article_id in labels['article_id'].unique()
}
# assign() builds a new frame, avoiding in-place mutation of a filtered slice.
labels = labels.assign(text=labels['article_id'].map(article_texts))
# extract_text returns "" for missing/unparseable files; drop those rows.
labels = labels[labels['text'].astype(str).str.strip().astype(bool)].copy()
print(f"Remaining after text extraction: {len(labels)}")

# === Label Encoding ===
# Map the string classes in `type` to integer ids; `le` is kept so that
# predictions can be converted back to class names later.
le = LabelEncoder()
labels = labels.assign(label=le.fit_transform(labels['type']))

# === Train-Val Split ===
# Split on article ids rather than on rows: all rows of one article share the
# same text, so a row-level split would put identical documents in both
# training and validation and inflate the validation scores.
train_ids, val_ids = train_test_split(
    labels['article_id'].unique(), test_size=0.2, random_state=42
)
in_val = labels['article_id'].isin(val_ids)
X_train, y_train = labels.loc[~in_val, 'text'], labels.loc[~in_val, 'label']
X_val, y_val = labels.loc[in_val, 'text'], labels.loc[in_val, 'label']

# === TF-IDF Vectorization ===
# The vectorizer is fitted on the training texts only; the validation texts
# are projected onto that same fitted vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_val_tfidf = vectorizer.transform(raw_documents=X_val)

# === Model ===
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks
# and batch shuffling are repeatable across Restart & Run All.
set_random_seed(42)

# Feed-forward classifier over the TF-IDF features. The input width is
# declared with an explicit Input layer; passing `input_shape=` to the first
# Dense layer of a Sequential model makes Keras emit a UserWarning.
model = Sequential([
    Input(shape=(X_train_tfidf.shape[1],)),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    # One softmax unit per encoded class.
    Dense(len(le.classes_), activation='softmax'),
])

# Sparse loss because the targets are integer class ids, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# === Training ===
# Densify each sparse TF-IDF matrix once and reuse it; the validation matrix
# is needed both for monitoring during fit and for the final predictions.
X_train_dense = X_train_tfidf.toarray()
X_val_dense = X_val_tfidf.toarray()
model.fit(
    X_train_dense,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_val_dense, y_val),
)

# === Evaluation ===
y_pred = model.predict(X_val_dense)
y_pred_labels = np.argmax(y_pred, axis=1)
# `labels=` pins the report to every encoded class so `target_names` always
# lines up, even when a class is absent from the validation split (without it
# classification_report raises on the length mismatch). zero_division=0
# reports 0 instead of warning for classes that were never predicted.
print(classification_report(
    y_val,
    y_pred_labels,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
))

Available XML files: 400
Found 902 articles with matching XMLs.
Remaining after text extraction: 902


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 54ms/step - accuracy: 0.5262 - loss: 0.9959 - val_accuracy: 0.8398 - val_loss: 0.5840
Epoch 2/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.9011 - loss: 0.4090 - val_accuracy: 0.8840 - val_loss: 0.2968
Epoch 3/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.9816 - loss: 0.1167 - val_accuracy: 0.9282 - val_loss: 0.2467
Epoch 4/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.9762 - loss: 0.0755 - val_accuracy: 0.9171 - val_loss: 0.2692
Epoch 5/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.9894 - loss: 0.0550 - val_accuracy: 0.9227 - val_loss: 0.2488
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
              precision    recall  f1-score   support

     Missing       0.92      0.92      0.92        48
     Primary       

In [None]:
# === Prepare Test Data ===
# Walk TEST_DIR once and remember where each article's XML lives, instead of
# collecting ids and then re-walking the whole tree for every id.
# setdefault keeps the first file found for an id, so an article that appears
# under several sub-directories yields a single submission row.
test_xml_paths = {}
for root, _, files in os.walk(TEST_DIR):
    for file in files:
        if file.endswith(".xml"):
            test_xml_paths.setdefault(os.path.splitext(file)[0], os.path.join(root, file))

test_df = pd.DataFrame({'article_id': list(test_xml_paths)})
test_df['text'] = [parse_xml(path) for path in test_xml_paths.values()]
# Drop articles whose XML produced no text; .copy() so the columns added
# below are written to an independent frame rather than a filtered slice.
test_df = test_df[test_df['text'].astype(str).str.strip().astype(bool)].copy()

# === TF-IDF for Test ===
if len(test_df):
    test_tfidf = vectorizer.transform(test_df['text'])
    test_preds = model.predict(test_tfidf.toarray())
    test_labels = le.inverse_transform(np.argmax(test_preds, axis=1))
else:
    # model.predict raises on an empty batch; with nothing to score, fall
    # through and write a header-only submission file.
    test_labels = []

# === Submission ===
test_df['type'] = test_labels
# Placeholder: this pipeline classifies whole articles and does not extract
# dataset identifiers.
test_df['dataset_id'] = 'unknown'
test_df['row_id'] = range(len(test_df))

submission = test_df[['row_id', 'article_id', 'dataset_id', 'type']]
submission.to_csv("submission.csv", index=False)