In [23]:
import pandas as pd
import numpy as np
import random
import json

In [None]:
# Per-topic word table; the path suggests it was exported by the R STM fit
# (assumption based on the directory name -- confirm against that step).
intrusion_words_csv = "../../../3_stm_fit_R/intrusion_words.csv"
topics = pd.read_csv(intrusion_words_csv)

In [25]:
# Preview the first two rows of the loaded word table.
topics.head(n=2)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7
0,headline,dan,edit,column,read,sunday,david
1,original,june,date,space,new,,


In [26]:
# Groups of topic numbers keyed by category name. process_topics uses these so
# that a topic never receives an intruder word taken from another topic listed
# in the same category.
topic_exclusions = {
    "gender": [3, 35, 43, 51],
    "race": [24, 31],
    "debt_housing": [14, 41, 61],
    "public_employment": [9, 30],
    "health_insurance_&disparities": [8, 16, 49],
    "militar_conflicts": [15, 44, 53, 59],
}

In [27]:
def process_topics(topics_df, exclude_topics=None, exclusion_groups=None,
                   seed=None, n_words=5, verbose=True):
    """Build a word-intrusion item for every retained topic.

    Each row of ``topics_df`` is treated as one topic and is numbered from 1
    in row order. For every topic that is kept, the words in its first
    ``n_words`` columns are combined with one "intruder" word drawn from the
    other kept topics, and the combined list is shuffled.

    Parameters
    ----------
    topics_df : pandas.DataFrame
        One row per topic; the first ``n_words`` columns hold the topic words.
        The input frame is not modified.
    exclude_topics : collection of int, optional
        1-based topic numbers to drop before anything else is computed. Their
        words are also absent from the intruder pool.
    exclusion_groups : mapping of str -> collection of int, optional
        Category name -> topic numbers. A topic that belongs to a category
        never receives an intruder taken from any kept topic of that category.
        When omitted, the notebook-level ``topic_exclusions`` mapping is used.
    seed : int, optional
        Seed for a private random generator. When omitted, the global
        ``random`` module state is used (seed it with ``random.seed``).
    n_words : int, default 5
        Number of leading columns that hold the topic words.
    verbose : bool, default True
        Print the matched category / excluded words and the number of
        candidate intruders for every topic.

    Returns
    -------
    pandas.DataFrame
        Copy of the retained rows with the extra columns ``topic_n``,
        ``topic_wlist``, ``intruder_word``, ``intrusion_task`` and
        ``intruder_pos`` (1-based position of the intruder in the task list).

    Raises
    ------
    IndexError
        If a topic has no candidate intruder left after the exclusions.
    """
    if exclusion_groups is None:
        exclusion_groups = topic_exclusions
    rng = random.Random(seed) if seed is not None else random

    # Fix the word-column count before adding helper columns so that a narrow
    # input frame can never pull ``topic_n`` into the word list.
    n_word_cols = min(n_words, topics_df.shape[1])

    # Add topic numbers starting from 1
    topics_df = topics_df.copy()
    topics_df["topic_n"] = range(1, len(topics_df) + 1)

    # Exclude specified topics; copy so the column assignments below write to
    # an independent frame rather than to a slice of the numbered one.
    if exclude_topics:
        topics_df = topics_df[~topics_df["topic_n"].isin(exclude_topics)].copy()

    # One list of words per topic, taken from the leading word columns
    word_lists = topics_df.iloc[:, :n_word_cols].to_numpy().tolist()
    topics_df["topic_wlist"] = pd.Series(word_lists, index=topics_df.index, dtype=object)

    # Pool of candidate intruders: every non-missing word of the kept topics
    all_words = {word for words in word_lists for word in words if pd.notna(word)}

    # Words of each category, computed once instead of once per topic
    words_by_topic = dict(zip(topics_df["topic_n"], topics_df["topic_wlist"]))
    group_words = {}
    for category, members in exclusion_groups.items():
        collected = set()
        for member in members:
            collected.update(words_by_topic.get(member, []))
        group_words[category] = collected

    def add_intruder(topic_num, topic_words):
        """Return (intruder_word, shuffled_task, 1-based intruder position)."""
        # Words that must not be used because they belong to the same category
        exclude_words = set()
        for category, members in exclusion_groups.items():
            if topic_num in members:
                exclude_words.update(group_words[category])
                if verbose:
                    print(category, exclude_words)

        # Sort the candidates: set iteration order for strings differs between
        # interpreter sessions, which would defeat any random seed.
        possible_intruders = sorted(all_words - set(topic_words) - exclude_words, key=str)
        if verbose:
            print("possible_intruders: ", len(possible_intruders))
        if not possible_intruders:
            raise IndexError(f"no candidate intruder words left for topic {topic_num}")

        intruder_word = rng.choice(possible_intruders)

        # Add the intruder and shuffle
        intrusion_task = list(topic_words) + [intruder_word]
        rng.shuffle(intrusion_task)

        # The intruder is never one of the topic's own words, so index() is unambiguous
        intruder_pos = intrusion_task.index(intruder_word) + 1
        return intruder_word, intrusion_task, intruder_pos

    results = [
        add_intruder(topic_num, topic_words)
        for topic_num, topic_words in zip(topics_df["topic_n"], topics_df["topic_wlist"])
    ]
    topics_df["intruder_word"] = [result[0] for result in results]
    topics_df["intrusion_task"] = pd.Series(
        [result[1] for result in results], index=topics_df.index, dtype=object
    )
    topics_df["intruder_pos"] = [result[2] for result in results]

    return topics_df


In [28]:
# Topic numbers dropped before building the task (named "boilerplate" --
# presumably non-substantive topics; confirm with whoever curated the list).
boilerplate = [1, 2, 5, 11, 21, 23, 32, 33, 36, 39, 40, 46, 48, 50, 55, 57, 58, 64, 66, 67, 69]

# Seed Python's global RNG, which process_topics draws from when sampling and
# shuffling, so that re-running this cell starts from the same random sequence.
random.seed(42)
stm_intrusion_task = process_topics(topics, exclude_topics=boilerplate)

gender {'rape', 'equal', 'ledbetter', 'occupational', 'amendment', 'breadwinner', 'ratify', 'occupation', 'pregnant', 'mom', 'workplace', 'constitutional', 'comparable', 'constitution', 'breast', 'profession', 'differential', 'equality', 'pregnancy', 'discrimination'}
possible_intruders:  217
possible_intruders:  232
possible_intruders:  232
possible_intruders:  232
health_insurance_&disparities {'affordable', 'physician', 'medicare', 'patient', 'medication', 'deductible', 'care', 'uninsured', 'hospital', 'drug', 'health', 'obamacare', 'reimbursement', 'prescription', 'provider'}
possible_intruders:  222
public_employment {'compensation', 'readiness', 'morale', 'employee', 'services', 'civilian', 'pentagon', 'personnel', 'employees', 'overtime'}
possible_intruders:  227
possible_intruders:  232
possible_intruders:  232
possible_intruders:  232
debt_housing {'lending', 'locality', 'tenant', 'loan', 'borrower', 'foreclosure', 'rental', 'funding', 'assistance', 'nonprofit', 'homeowner', '

In [29]:
# Preview the first two generated intrusion items.
stm_intrusion_task.head(n=2)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,topic_n,topic_wlist,intruder_word,intrusion_task,intruder_pos
2,ledbetter,breadwinner,discrimination,workplace,equal,gender,discriminatory,3,"[ledbetter, breadwinner, discrimination, workp...",trade,"[discrimination, breadwinner, equal, trade, wo...",4
3,communism,faction,liberty,authoritarian,revolution,welfare,centrist,4,"[communism, faction, liberty, authoritarian, r...",ward,"[communism, ward, liberty, authoritarian, revo...",2


In [30]:
# Spot-check one item: n is an index label of stm_intrusion_task, not a position.
n = 6
item = stm_intrusion_task.loc[n]
print(item["intrusion_task"])
print(item["intruder_word"])

['policing', 'peaceful', 'detain', 'greece', 'patrol', 'police']
greece


In [31]:
# Persist the processed intrusion-task DataFrame as CSV (index included).
stm_intrusion_task.to_csv("stm_intrusion_task.csv")

#### Dynamic-labels .jsonl

In [32]:
def create_json_from_df(topics_df, filename="stm_intrusion_items.jsonl"):
    """Write the intrusion items to ``filename`` in JSON Lines format.

    One JSON object is written per row, with the keys ``id`` (the topic number
    as a string), ``text`` (the task words joined by " - ") and ``labels``
    (the task words as a list).

    Parameters
    ----------
    topics_df : pandas.DataFrame
        Must contain the columns ``topic_n`` and ``intrusion_task``; each
        ``intrusion_task`` value is an iterable of words.
    filename : str
        Destination path; an existing file is overwritten.

    Raises
    ------
    ValueError
        If either required column is missing.
    """
    # Ensure the necessary columns exist
    if "topic_n" not in topics_df or "intrusion_task" not in topics_df:
        raise ValueError("DataFrame must contain 'topic_n' and 'intrusion_task' columns")

    with open(filename, "w", encoding="utf-8") as f:
        for topic_n, task in zip(topics_df["topic_n"], topics_df["intrusion_task"]):
            # Skip missing words and coerce the rest to str: a NaN would make
            # str.join raise and would be serialised as the invalid JSON token NaN.
            labels = [str(word) for word in task if pd.notna(word)]
            record = {
                "id": str(topic_n),
                "text": " - ".join(labels),
                "labels": labels,
            }
            # One JSON document per line
            f.write(json.dumps(record) + "\n")

    print(f"JSON file saved to {filename}")

In [33]:
# Export the intrusion items as JSON Lines.
create_json_from_df(stm_intrusion_task, filename="stm_intrusion_items.jsonl")

JSON file saved to stm_intrusion_items.jsonl
