Importing the required Python libraries. Note in particular [TensorFlow](https://www.tensorflow.org/), the open-source machine learning library we've used to apply machine learning to parliamentary question (PQ) headings.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


Loading our training corpus: a CSV dataset of PQ headings, Departments and PQ text strings (the `heading`, `department` and `question` columns used below).

In [None]:
# Read the training corpus into a DataFrame. The cells below use its
# 'department', 'heading' and 'question' columns.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/machine_learning_tests/model_data/corpusForJS.csv',
)

Encoding labels for Departments and headings.

In [None]:
# One encoder per label column. Each is fitted on the corpus so that the
# integer codes it produces can later be mapped back to the original strings
# with `inverse_transform`.
department_encoder = LabelEncoder().fit(data['department'])
heading_encoder = LabelEncoder().fit(data['heading'])

# Store the integer codes alongside the original text columns.
data['department_encoded'] = department_encoder.transform(data['department'])
data['heading_encoded'] = heading_encoder.transform(data['heading'])


Tokenizing our question text strings.

In [None]:
# Build the vocabulary from the corpus questions. `num_words` caps how many
# of the most frequent words are used when encoding; any other word is
# mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=20000)
tokenizer.fit_on_texts(data['question'])

# Turn each question into a list of word indices, then pad or cut every list
# at the end ('post') so all rows have length 300.
sequences = tokenizer.texts_to_sequences(data['question'])
padded_sequences = pad_sequences(
    sequences,
    maxlen=300,
    padding='post',
    truncating='post',
)


Loading our pre-trained model, which was trained using TensorFlow/[Keras](https://keras.io/).

In [None]:
import tensorflow as tf

# Location of the saved model file (.keras format).
model_path = '/content/drive/MyDrive/machine_learning_tests/heading_prediction_model_2025-01-01_120K_success.keras'

# Restore the trained model so it can be used for prediction in later cells.
model = tf.keras.models.load_model(filepath=model_path)


  saveable.load_own_variables(weights_store.get(inner_path))


Checking our departmental index. Each Department in the training corpus is assigned a specific number by the label encoder, and this number is passed to the model alongside the question text when predicting headings.

In [None]:
# List every department next to the integer code the encoder assigned to it
# (its position in `classes_`).
for code, name in enumerate(department_encoder.classes_):
    print(f"{code}: {name}")


0: Agriculture
1: Business
2: Children
3: Climate
4: Communications
5: Community
6: Culture
7: Defence
8: Education
9: Employment
10: Enterprise
11: Environment
12: Finance
13: Foreign
14: Further and Higher Education
15: Health
16: Housing
17: Justice
18: Media
19: Public
20: Public Expenditure
21: Rural
22: Social
23: Taoiseach
24: Tourism
25: Transport


**OPTIONAL:** Outputting our tokenizer and label encoders in reusable ["pickle"](https://docs.python.org/3/library/pickle.html) format.

In [None]:
import pickle

# Objects to persist, keyed by the file each one is written to: the
# tokenizer first, then the two label encoders.
artefacts = {
    'tokenizer.pkl': tokenizer,
    'department_encoder.pkl': department_encoder,
    'heading_encoder.pkl': heading_encoder,
}

# Write each object to its own pickle file. The file names are relative, so
# the files land in the current working directory.
for filename, obj in artefacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)


JSON is not the most popular way of dealing with data in Python, but I've been using JavaScript, and specifically D3, to wrangle the PQ data. So, to apply the Department categories at scale, I've been working in JSON. The file loaded here holds the Department index number for each new question.

In [None]:
import json

# Load the JSON file holding the department codes for the new questions.
file_path = '/content/drive/MyDrive/machine_learning_tests/ML_inputs/2025-03-06_inputs/departments_categorised.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's default locale encoding, so the file is read the same way
# on every machine.
with open(file_path, 'r', encoding='utf-8') as file:
    department_data = json.load(file)

# Print the loaded JSON content
print(department_data)


[24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 7, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 20, 20, 20, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 24, 24, 24, 24, 24, 24, 24, 24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Same with the question text string data.

In [None]:
# Load the JSON file holding the text of the new questions.
file_path = '/content/drive/MyDrive/machine_learning_tests/ML_inputs/2025-03-06_inputs/questions_text.json'

# The question text contains non-ASCII characters (e.g. Irish-language
# names), so decode explicitly as UTF-8 rather than relying on the platform's
# default locale encoding, which would garble them on a non-UTF-8 system.
with open(file_path, 'r', encoding='utf-8') as file:
    questions_data = json.load(file)

# Print the loaded JSON content
print(questions_data)


[' 1. Deputy Joanna Byrne asked the Minister for Tourism, Culture, Arts, Gaeltacht, Sport and Media the timeline for the replacement of the TV licence fee by direct funding from the Exchequer, managed by Coimisiún na Meán; and if he will make a statement on the matter. [10148/25] ', ' 2. Deputy Aidan Farrelly asked the Minister for Tourism, Culture, Arts, Gaeltacht, Sport and Media if he will provide further details in respect of his plans to review spending by bodies under his remit and-or aegis. [9994/25] ', ' 3. Deputy Joanna Byrne asked the Minister for Tourism, Culture, Arts, Gaeltacht, Sport and Media to give an update on and a timeline for the creation of football academies with the FAI and the League of Ireland, as contained in the programme for Government; and if he will make a statement on the matter. [10149/25] ', " 4. Deputy Roderic O'Gorman asked the Minister for Tourism, Culture, Arts, Gaeltacht, Sport and Media the number of artists who have benefited from the basic inco

So now we take the new or unseen Department codes and question text data and predict question headings with our pre-trained model. To make the workflow easier to handle, the JSON file is written to a directory labelled with today's date.

In [None]:
import os
from datetime import datetime

# Inputs for this run: the question texts and their department codes, as
# loaded from the JSON files.
new_questions = questions_data
new_departments = department_data

# Encode the new questions with the fitted tokenizer, padding/cutting at the
# end so every row has length 300.
new_sequences = tokenizer.texts_to_sequences(new_questions)
new_padded = pad_sequences(
    new_sequences,
    maxlen=300,
    padding='post',
    truncating='post',
)

# Make predictions: the model receives the padded text and the department
# codes as two inputs. The highest-scoring class in each row is then mapped
# back to its heading string.
predictions = model.predict([new_padded, np.array(new_departments)])
best_class_per_row = np.argmax(predictions, axis=1)
predicted_headings = heading_encoder.inverse_transform(best_class_per_row)

# Prepare results as a list of dictionaries, one per question.
results = [
    dict(question=text, predicted_heading=label)
    for text, label in zip(new_questions, predicted_headings)
]

# Output goes to <base_dir>/<YYYY-MM-DD>/<YYYY-MM-DD>_predicted_headings.json;
# the dated folder is created if it does not exist yet.
base_dir = '/content/drive/MyDrive/machine_learning_tests/predicted_headings'
today_date = datetime.now().strftime('%Y-%m-%d')
new_folder_path = os.path.join(base_dir, today_date)
os.makedirs(new_folder_path, exist_ok=True)
output_file = os.path.join(new_folder_path, f"{today_date}_predicted_headings.json")

with open(output_file, 'w') as f:
    json.dump(results, f, indent=4)

print(f"Predictions saved to {output_file}")


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Predictions saved to /content/drive/MyDrive/machine_learning_tests/predicted_headings/2025-03-12/2025-03-12_predicted_headings.json


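Alternatively, outputting the predictions as a CSV. This cell repeats the preprocessing and prediction steps, so it can be run instead of the JSON cell above.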

In [None]:
import os
from datetime import datetime

# Inputs for this run: the question texts and their department codes, as
# loaded from the JSON files.
new_questions = questions_data
new_departments = department_data

# Encode the new questions with the fitted tokenizer, padding/cutting at the
# end so every row has length 300.
new_sequences = tokenizer.texts_to_sequences(new_questions)
new_padded = pad_sequences(
    new_sequences,
    maxlen=300,
    padding='post',
    truncating='post',
)

# Make predictions: the model receives the padded text and the department
# codes as two inputs. The highest-scoring class in each row is then mapped
# back to its heading string.
predictions = model.predict([new_padded, np.array(new_departments)])
best_class_per_row = np.argmax(predictions, axis=1)
predicted_headings = heading_encoder.inverse_transform(best_class_per_row)

# Prepare results as a DataFrame with one row per question.
results_df = pd.DataFrame(
    {
        "question": new_questions,
        "predicted_heading": predicted_headings,
    }
)

# Output goes to <base_dir>/<YYYY-MM-DD>/<YYYY-MM-DD>_predicted_headings.csv;
# the dated folder is created if it does not exist yet.
base_dir = '/content/drive/MyDrive/machine_learning_tests/predicted_headings'
today_date = datetime.now().strftime('%Y-%m-%d')
new_folder_path = os.path.join(base_dir, today_date)
os.makedirs(new_folder_path, exist_ok=True)
output_file = os.path.join(new_folder_path, f"{today_date}_predicted_headings.csv")

# Save results as a CSV file, without the DataFrame's row index.
results_df.to_csv(path_or_buf=output_file, index=False)

print(f"Predictions saved to {output_file}")


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
Predictions saved to /content/drive/MyDrive/machine_learning_tests/predicted_headings/2025-03-12/2025-03-12_predicted_headings.csv
