In [1]:
import pandas as pd

In [2]:
# Load CSV files
df1 = pd.read_csv("annotated_instances1.csv")
df2 = pd.read_csv("annotated_instances2.csv")

# Concatenate them
annotated_instances = pd.concat([df1, df2], ignore_index=True)

In [3]:
annotated_instances.head()

Unnamed: 0,user,instance_id,displayed_text,intruder:::childhood,intruder:::tariff,intruder:::vaccine,intruder:::trade,intruder:::hatred,intruder:::revolution,intruder:::uncle,...,intruder:::entrepreneurial,intruder:::earner,intruder:::locality,intruder:::readiness,intruder:::wealth,intruder:::faction,intruder:::alliance,intruder:::comparable,intruder:::affirmative,intruder:::adapt
0,5dcf0f73af16d3067b87757f,25,summers - productivity - inflation - communism...,,,,,,,,...,,,,,,,,,,
1,5dcf0f73af16d3067b87757f,14,teacher - voucher - funding - locality - nonpr...,,,,,,,,...,,,,,,,,,,
2,5dcf0f73af16d3067b87757f,60,solar - fossil - carbon - emission - greenhous...,,,,,,,,...,,,,,,,,,,
3,5dcf0f73af16d3067b87757f,31,entrepreneurship - apartheid - africa - racist...,,,,,,,,...,,,,,,,,,,
4,5dcf0f73af16d3067b87757f,34,prescription - outreach - alliance - library -...,,,,,,,,...,,,,,,,,,,


In [4]:
# Keep only relevant columns
columns_to_keep = ["user", "instance_id", "displayed_text"]

#Just keep the additional column that has content
# Create 'answer' column with the first non-empty intruder column value
intruder_columns = [col for col in annotated_instances.columns if col.startswith("intruder:::")]
annotated_instances["answer"] = annotated_instances[intruder_columns].bfill(axis=1).iloc[:, 0]

# Keep only required columns
annotated_instances_clean = annotated_instances[columns_to_keep + ["answer"]]

In [5]:
#Erase non intruder questions
annotated_instances_clean = annotated_instances_clean[~annotated_instances_clean["instance_id"].astype(str).str.startswith("Word")]
annotated_instances_clean.head()

Unnamed: 0,user,instance_id,displayed_text,answer
0,5dcf0f73af16d3067b87757f,25,summers - productivity - inflation - communism...,1.0
1,5dcf0f73af16d3067b87757f,14,teacher - voucher - funding - locality - nonpr...,5.0
2,5dcf0f73af16d3067b87757f,60,solar - fossil - carbon - emission - greenhous...,1.0
3,5dcf0f73af16d3067b87757f,31,entrepreneurship - apartheid - africa - racist...,1.0
4,5dcf0f73af16d3067b87757f,34,prescription - outreach - alliance - library -...,1.0


Erase users with problems

In [6]:
# Load users_not.txt
users_not_df = pd.read_csv("users_not.txt")

# Remove users from the dataset
annotated_instances_clean = annotated_instances_clean[~annotated_instances_clean["user"].isin(users_not_df["user"])]

In [7]:
# Count total unique users after filtering
total_users = annotated_instances_clean["user"].nunique()
print(f"Total unique users: {total_users}")

Total unique users: 18


In [8]:
#Check that participants have full answers
len(annotated_instances_clean)/52

18.0

Attention check

In [9]:
# Define correct answers for attention check instance_ids
attention_checks = {
    "0_testing": 1,
    "1_testing": 2,
    "2_testing": 5
}

In [10]:
# Filter for only attention check rows
df_attention = annotated_instances_clean[annotated_instances_clean["instance_id"].isin(attention_checks.keys())]

df_attention["answer"] = df_attention["answer"].astype(int)
df_pivot = df_attention.pivot(index="user", columns="instance_id", values="answer")
df_pivot = df_pivot.dropna()

# Check if each user answered correctly
passed_users = df_pivot[
    (df_pivot["0_testing"] == attention_checks["0_testing"]) &
    (df_pivot["1_testing"] == attention_checks["1_testing"]) &
    (df_pivot["2_testing"] == attention_checks["2_testing"])
].index

# Count users who passed all three attention checks
passed_count = len(passed_users)
print(f"Users who passed all 3 attention checks: {passed_count}")

Users who passed all 3 attention checks: 18


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_attention["answer"] = df_attention["answer"].astype(int)


Calculate precision

In [11]:
#Import solutions
intrusion_solutions = pd.read_csv("stm_intrusion_task.csv")
intrusion_solutions["intruder_pos"] = intrusion_solutions["intruder_pos"].astype(int)
intrusion_solutions["topic_n"] = intrusion_solutions["topic_n"].astype(str)

In [12]:
# Create a dictionary mapping instance_id to correct intruder_pos
correct_answers = dict(zip(intrusion_solutions["topic_n"], intrusion_solutions["intruder_pos"]))

In [13]:
#Erase attention checks
annotated_precision = annotated_instances_clean[~annotated_instances_clean["instance_id"].isin(attention_checks.keys())]

In [14]:
annotated_precision.head()

Unnamed: 0,user,instance_id,displayed_text,answer
0,5dcf0f73af16d3067b87757f,25,summers - productivity - inflation - communism...,1.0
1,5dcf0f73af16d3067b87757f,14,teacher - voucher - funding - locality - nonpr...,5.0
2,5dcf0f73af16d3067b87757f,60,solar - fossil - carbon - emission - greenhous...,1.0
3,5dcf0f73af16d3067b87757f,31,entrepreneurship - apartheid - africa - racist...,1.0
4,5dcf0f73af16d3067b87757f,34,prescription - outreach - alliance - library -...,1.0


In [15]:
# Add a column "correct" (1 if answer matches, 0 otherwise)
annotated_precision["correct_answer"] = annotated_precision["instance_id"].map(correct_answers)
annotated_precision["check"] = annotated_precision["instance_id"].map(correct_answers) == annotated_precision["answer"]
annotated_precision["check"] = annotated_precision["check"].astype(int)

# Compute precision
precision = annotated_precision["check"].mean()

print(f"Model Precision: {precision:.2f}")  # Example output: 0.75 (75%)

Model Precision: 0.68


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotated_precision["correct_answer"] = annotated_precision["instance_id"].map(correct_answers)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotated_precision["check"] = annotated_precision["instance_id"].map(correct_answers) == annotated_precision["answer"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

In [16]:
# Compute precision per topic
topic_precision = annotated_precision.groupby("instance_id")["check"].mean().reset_index()

# Sort by precision (ascending order)
topic_precision = topic_precision.sort_values(by="check", ascending=True)

# Rename columns for clarity
topic_precision.columns = ["topic_n", "topic_precision"]

In [17]:
topic_precision

Unnamed: 0,topic_n,topic_precision
22,37,0.0
35,54,0.055556
10,22,0.055556
17,3,0.111111
12,25,0.111111
18,30,0.166667
47,8,0.222222
34,53,0.333333
21,35,0.333333
11,24,0.333333
