# In this notebook we select negative examples, i.e. abstracts that were excluded at stage 2

## We start with the LB (Loecherbach) dataset

In [1]:
# Essential data analysis imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Configure how pandas renders frames in this notebook: no column limit,
# at most 100 rows, a 1000-character wide console, and floats shown with
# three decimal places. set_option accepts several option/value pairs at once.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', 100,
    'display.width', 1000,
    'display.float_format', '{:.3f}'.format,
)


# Path to the manually coded LB dataset
DATA_PATH_1 = "../data/SSOT_manual_LB_20250808_120908.csv"


# Load the LB dataset.
# On failure we report the path that was actually tried and re-raise:
# carrying on would either hit a NameError on `df_LB` below (fresh kernel)
# or silently inspect a stale `df_LB` left over from an earlier run.
try:
    df_LB = pd.read_csv(DATA_PATH_1)
except FileNotFoundError:
    print(f"❌ Error: The file '{DATA_PATH_1}' was not found")
    raise
except Exception as e:
    print(f"❌ Error loading the first dataset: {str(e)}")
    raise
else:
    print("✓ First dataset loaded successfully")
    print(f"✓ Shape of dataset 1: {df_LB.shape}")


# Display basic information about the LB dataset
print("\nDataset 1 Info:\n")
# info() writes its report to stdout itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
df_LB.info()
print("\nFirst few rows of dataset 1:\n")
display(df_LB.head())


✓ First dataset loaded successfully
✓ Shape of dataset 1: (3944, 15)

Dataset 1 Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3944 entries, 0 to 3943
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            3944 non-null   object
 1   abstract      3931 non-null   object
 2   acmid         3944 non-null   object
 3   author        3944 non-null   object
 4   doi           3943 non-null   object
 5   outlet        3943 non-null   object
 6   title_full    3943 non-null   object
 7   url           3943 non-null   object
 8   year          3943 non-null   object
 9   qualtrics_id  3943 non-null   object
 10  wos_id        3940 non-null   object
 11  ebsco_id      226 non-null    object
 12  stage_1       3944 non-null   bool  
 13  stage_2       3944 non-null   bool  
 14  stage_3       3944 non-null   bool  
dtypes: bool(3), object(12)
memory usage: 381.4+ KB
None

First few rows of dataset 1:



Unnamed: 0,ID,abstract,acmid,author,doi,outlet,title_full,url,year,qualtrics_id,wos_id,ebsco_id,stage_1,stage_2,stage_3
0,Bindu2018503,Online social networks have become immensely p...,,"Bindu, P V and Mishra, R and Thilagam, P S",10.1007/s10844-017-0494-z,Journal of Intelligent Information Systems,{Discovering spammer communities in TWITTER},https://www.scopus.com/inward/record.uri?eid=2...,2018,12,,,True,False,False
1,Moraga2018470,This article explores the ways Latinos—as audi...,,"Moraga, J E",10.1177/0193723518797030,Journal of Sport and Social Issues,"{On ESPN Deportes: Latinos, Sport MEDIA, and t...",https://www.scopus.com/inward/record.uri?eid=2...,2018,22,,,True,False,False
2,Lanosga20181676,This study of American investigative reporting...,,"Lanosga, G and Martin, J",10.1177/1464884916683555,JOURNALISm,"{JOURNALISts, sources, and policy outcomes: In...",https://www.scopus.com/inward/record.uri?eid=2...,2018,47,,,True,False,True
3,Warner2018720,"In this study, we test the indirect and condit...",,"Warner, B R and Jennings, F J and Bramlett, J ...",10.1080/15205436.2018.1472283,Mass Communication and Society,{A MultiMEDIA Analysis of Persuasion in the 20...,https://www.scopus.com/inward/record.uri?eid=2...,2018,50,,,True,False,False
4,Burrows20181117,Professional communicators produce a diverse r...,,"Burrows, E",10.1177/0163443718764807,"MEDIA, Culture and Society",{Indigenous MEDIA producers' perspectives on o...,https://www.scopus.com/inward/record.uri?eid=2...,2018,56,,,True,False,False


In [5]:
# Get examples where stage_2 is False from LB dataset
import pandas as pd
import os

# Sampling settings. A fixed seed makes the selected examples, and therefore
# the prompt files written below, identical on every Restart & Run All.
N_EXAMPLES = 10
RANDOM_SEED = 42

# Load the dataset
df_LB = pd.read_csv('../data/SSOT_manual_LB_20250808_120908.csv')

# Records that did not pass stage 2 (`stage_2` is read as a boolean column)
stage2_false_examples = df_LB[~df_LB['stage_2']]

# Draw the random examples using the LB column names. Rows without an abstract
# are removed first; otherwise a sampled example could end up in the prompt
# files with a literal "nan" as its abstract.
examples_lb = (
    stage2_false_examples[['ID', 'title_full', 'abstract']]
    .dropna(subset=['abstract'])
    .sample(n=N_EXAMPLES, random_state=RANDOM_SEED)
)

# Save to CSV in prompts folder
prompts_dir = '../prompts'
os.makedirs(prompts_dir, exist_ok=True)
output_path = os.path.join(prompts_dir, 'example_exclude_LB.csv')
examples_lb.to_csv(output_path, index=False)

print(f"Examples saved to: {output_path}")

# Also save as formatted text file for better readability
text_path = os.path.join(prompts_dir, 'example_exclude_LB.txt')
with open(text_path, 'w', encoding='utf-8') as f:
    f.write(f"{len(examples_lb)} Example Abstracts that were excluded (Loecherbach Dataset)\n\n")

    # `idx` is the row label from df_LB, so each example can be traced back
    # to its row in the source file.
    for idx, row in examples_lb.iterrows():
        f.write(f"EXAMPLE {idx}\n")
        f.write("-" * 80 + "\n")
        f.write(f"ID: {row['ID']}\n")
        f.write(f"TITLE: {row['title_full']}\n")
        f.write("\nABSTRACT:\n")
        f.write(f"{row['abstract']}\n")
        f.write("=" * 80 + "\n\n")

print(f"Examples also saved as formatted text file: {text_path}")

# Print verification of abstract lengths
print("\nAbstract length statistics for selected examples:")
print(examples_lb['abstract'].str.len().describe())

Examples saved to: ../prompts/example_exclude_LB.csv
Examples also saved as formatted text file: ../prompts/example_exclude_LB.txt

Abstract length statistics for selected examples:
count     10.000
mean    1163.400
std      345.299
min      579.000
25%      985.000
50%     1261.500
75%     1391.000
max     1662.000
Name: abstract, dtype: float64


## Now for the BM (Birkenmaier) dataset

In [6]:
# Path to the manually coded BM dataset.
# NOTE: this reuses the name DATA_PATH_1, so after this cell runs the
# variable points at the BM file.
DATA_PATH_1 = "../data/SSOT_manual_BM_20250707_150309.csv"


# Load the BM dataset.
# On failure we report the path that was actually tried and re-raise:
# carrying on would either hit a NameError on `df_BM` below (fresh kernel)
# or silently inspect a stale `df_BM` left over from an earlier run.
try:
    df_BM = pd.read_csv(DATA_PATH_1)
except FileNotFoundError:
    print(f"❌ Error: The file '{DATA_PATH_1}' was not found")
    raise
except Exception as e:
    print(f"❌ Error loading the first dataset: {str(e)}")
    raise
else:
    print("✓ First dataset loaded successfully")
    print(f"✓ Shape of dataset 1: {df_BM.shape}")


# Display basic information about the BM dataset
print("\nDataset 1 Info:\n")
# info() writes its report to stdout itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
df_BM.info()
print("\nFirst few rows of dataset 1:\n")
display(df_BM.head())


✓ First dataset loaded successfully
✓ Shape of dataset 1: (917, 13)

Dataset 1 Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 917 entries, 0 to 916
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   (internal) id   917 non-null    int64  
 1   (source) id     917 non-null    int64  
 2   abstract        910 non-null    object 
 3   title           917 non-null    object 
 4   journal         917 non-null    object 
 5   authors         171 non-null    object 
 6   tags            0 non-null      float64
 7   consensus       917 non-null    object 
 8   labeled_at...9  0 non-null      float64
 9   code            917 non-null    int64  
 10  stage_1         917 non-null    bool   
 11  stage_2         917 non-null    bool   
 12  stage_3         917 non-null    bool   
dtypes: bool(3), float64(2), int64(3), object(5)
memory usage: 74.5+ KB
None

First few rows of dataset 1:



Unnamed: 0,(internal) id,(source) id,abstract,title,journal,authors,tags,consensus,labeled_at...9,code,stage_1,stage_2,stage_3
0,33937314,175,There is a worry that serious forms of politic...,Is Context the Key? The (Non-)Differential Eff...,Polit. Commun.,,,o,,-1,True,False,False
1,33937315,113,The electoral model of democracy holds the ide...,POLITICAL NEWS IN ONLINE AND PRINT NEWSPAPERS ...,Digit. Journal.,,,o,,-1,True,False,False
2,33937316,122,Machine learning is a field at the intersectio...,Machine Learning for Sociology,Annu. Rev. Sociol.,,,o,,-1,True,False,False
3,33937317,467,Research on digital glocalization has found th...,Improving Health in Low-Income Communities Wit...,J. Commun.,,,o,,-1,True,False,False
4,33937318,10,Political scientists often wish to classify do...,Using Word Order in Political Text Classificat...,Polit. Anal.,,,o,,-1,True,False,False


In [7]:
# Get examples where stage_2 is False from BM dataset
import pandas as pd
import os

# Sampling settings. A fixed seed makes the selected examples, and therefore
# the prompt files written below, identical on every Restart & Run All.
N_EXAMPLES = 10
RANDOM_SEED = 42

# Load the dataset
df_BM = pd.read_csv('../data/SSOT_manual_BM_20250707_150309.csv')

# Records that did not pass stage 2 (`stage_2` is read as a boolean column)
stage2_false_examples = df_BM[~df_BM['stage_2']]

# Draw the random examples using the BM column names. Rows without an abstract
# are removed first; otherwise a sampled example could end up in the prompt
# files with a literal "nan" as its abstract.
examples_bm = (
    stage2_false_examples[['(internal) id', 'title', 'abstract']]
    .dropna(subset=['abstract'])
    .sample(n=N_EXAMPLES, random_state=RANDOM_SEED)
)

# Save to CSV in prompts folder
prompts_dir = '../prompts'
os.makedirs(prompts_dir, exist_ok=True)
output_path = os.path.join(prompts_dir, 'example_exclude_BM.csv')
examples_bm.to_csv(output_path, index=False)

print(f"Examples saved to: {output_path}")

# Also save as formatted text file for better readability
text_path = os.path.join(prompts_dir, 'example_exclude_BM.txt')
with open(text_path, 'w', encoding='utf-8') as f:
    f.write(f"{len(examples_bm)} Example Abstracts that were excluded (Birkenmaier Dataset)\n\n")

    # `idx` is the row label from df_BM, so each example can be traced back
    # to its row in the source file.
    for idx, row in examples_bm.iterrows():
        f.write(f"EXAMPLE {idx}\n")
        f.write("-" * 80 + "\n")
        f.write(f"ID: {row['(internal) id']}\n")
        f.write(f"TITLE: {row['title']}\n")
        f.write("\nABSTRACT:\n")
        f.write(f"{row['abstract']}\n")
        f.write("=" * 80 + "\n\n")

print(f"Examples also saved as formatted text file: {text_path}")

# Print verification of abstract lengths
print("\nAbstract length statistics for selected examples:")
print(examples_bm['abstract'].str.len().describe())

Examples saved to: ../prompts/example_exclude_BM.csv
Examples also saved as formatted text file: ../prompts/example_exclude_BM.txt

Abstract length statistics for selected examples:
count     10.000
mean    1103.200
std      231.218
min      724.000
25%      931.500
50%     1132.500
75%     1272.250
max     1424.000
Name: abstract, dtype: float64
