# This notebook is for creating the csvs for the inclusion and exclusion criteria

## We start with the criteria for stage one for the Leocherbach SLR

In [13]:
import pandas as pd
import os

# Create the data directory if it doesn't exist
criteria_dir = '../data'
os.makedirs(criteria_dir, exist_ok=True)

# Create a sample criteria DataFrame
criteria_data = {
    'criterion_id': [],  
    'type': [],         
    'description': [],  
    'rationale': [],    
    'examples': []  ,
    'notes': []     
}

criteria_df = pd.DataFrame(criteria_data)

In [14]:
# Function to add a criterion
def add_criterion(df, criterion_id, type_, description, rationale="", examples=""):
    new_row = pd.DataFrame({
        'criterion_id': [criterion_id],
        'type': [type_],
        'description': [description],
        'rationale': [rationale],
        'examples': [examples]
    })
    return pd.concat([df, new_row], ignore_index=True)

# Add all criteria with optimized formatting for LLM processing
criteria_df = add_criterion(
    criteria_df,
    'IC1',
    'inclusion',
    'Empirical or theoretical research',
    'Must be published in peer-reviewed journals or conference proceedings in English. Both empirical studies and theoretical articles are acceptable.',
    'Abstract indicates empirical research or theoretical framework development'
)

criteria_df = add_criterion(
    criteria_df,
    'IC2',
    'inclusion',
    'Media content or market analysis',
    'Studies must analyze non-fictional media content or media market structures. Focus on news media, non-fictional content, and market analysis.',
    'INCLUDE: news media, non-fictional TV shows, news articles | EXCLUDE: fictional TV shows, movies, games'
)

criteria_df = add_criterion(
    criteria_df,
    'IC3',
    'inclusion',
    'Diversity analysis in media',
    'Analysis must focus on diversity in media content (newspapers, television, radio, social media) or media market components (pluralism, news outlets, newspaper bias).',
    'KEY TERMS: filter bubbles, echo chambers, serendipity, pluralism, entropy, newsroom diversity, media market diversity'
)

criteria_df = add_criterion(
    criteria_df,
    'EC1',
    'exclusion',
    'Non-peer-reviewed content',
    'Exclude non-peer-reviewed materials, meta-analyses, literature reviews, books, dissertations, editorials, and non-proceedings conference papers.',
    'EXCLUDE: not peer-reviewed, meta-analyses, literature reviews, book, dissertation, editorials, conference paper (not in official conference proceedings)'
)

criteria_df = add_criterion(
    criteria_df,
    'EC2',
    'exclusion',
    'No media content/market focus',
    'Study has no clear connection to media content or media market analysis.',
    'EXCLUDE: "The paint is applied on different media such as canvas and wood" - wrong use of term media'
)

criteria_df = add_criterion(
    criteria_df,
    'EC3',
    'exclusion',
    'Media as context only',
    'Media (including social networks, newspapers) are mentioned but not analyzed.',
    'EXCLUDE: "The politician received media attention, but we analyze their speeches" - media not central to analysis'
)

criteria_df = add_criterion(
    criteria_df,
    'EC4',
    'exclusion',
    'Fictional media content',
    'Study focuses on fictional media content rather than news or factual content.',
    'EXCLUDE: "Diversity of actors in Oscar-premiered movies" - focuses on fictional content'
)

criteria_df = add_criterion(
    criteria_df,
    'EC5',
    'exclusion',
    'No diversity focus',
    'No explicit reference to diversity concepts (including filter bubbles, echo chambers, serendipity, entropy, pluralism).',
    'EXCLUDE: Studies using terms like "diversion" or "pluralistic ignorance" without diversity focus'
)

criteria_df = add_criterion(
    criteria_df,
    'EC6',
    'exclusion',
    'Non-media diversity focus',
    'Diversity analysis focuses on non-media aspects.',
    'EXCLUDE: "diverse sample of respondents", "diverse media consumption contexts" | INCLUDE: "diversity of sources in news articles"'
)

criteria_df = add_criterion(
    criteria_df,
    'EC7',
    'exclusion',
    'Peripheral diversity mention',
    'Diversity-related concepts are mentioned but not central to analysis.',
    'EXCLUDE: "Future research should address media diversity", "newspapers struggle in pluralized market" - not primary focus'
)

criteria_df = add_criterion(
    criteria_df,
    'EC8',
    'exclusion',
    'Superficial diversity reference',
    'Diversity-related terms used without substantive analysis or explanation.',
    'EXCLUDE: "polarized pluralist model" or "social media echo chamber" mentioned without analysis'
)

# Display all criteria
print("\nOptimized criteria for LLM processing:")
display(criteria_df)


Optimized criteria for LLM processing:


Unnamed: 0,criterion_id,type,description,rationale,examples,notes
0,IC1,inclusion,Empirical or theoretical research,Must be published in peer-reviewed journals or...,Abstract indicates empirical research or theor...,
1,IC2,inclusion,Media content or market analysis,Studies must analyze non-fictional media conte...,"INCLUDE: news media, non-fictional TV shows, n...",
2,IC3,inclusion,Diversity analysis in media,Analysis must focus on diversity in media cont...,"KEY TERMS: filter bubbles, echo chambers, sere...",
3,EC1,exclusion,Non-peer-reviewed content,"Exclude non-peer-reviewed materials, meta-anal...","EXCLUDE: not peer-reviewed, meta-analyses, lit...",
4,EC2,exclusion,No media content/market focus,Study has no clear connection to media content...,"EXCLUDE: ""The paint is applied on different me...",
5,EC3,exclusion,Media as context only,"Media (including social networks, newspapers) ...","EXCLUDE: ""The politician received media attent...",
6,EC4,exclusion,Fictional media content,Study focuses on fictional media content rathe...,"EXCLUDE: ""Diversity of actors in Oscar-premier...",
7,EC5,exclusion,No diversity focus,No explicit reference to diversity concepts (i...,"EXCLUDE: Studies using terms like ""diversion"" ...",
8,EC6,exclusion,Non-media diversity focus,Diversity analysis focuses on non-media aspects.,"EXCLUDE: ""diverse sample of respondents"", ""div...",
9,EC7,exclusion,Peripheral diversity mention,Diversity-related concepts are mentioned but n...,"EXCLUDE: ""Future research should address media...",


In [15]:
# Save the criteria to prompts folder
prompts_dir = '../prompts'
os.makedirs(prompts_dir, exist_ok=True)
prompts_path = os.path.join(prompts_dir, 'Criteria_LB_01.csv')

# Save to prompts folder
criteria_df.to_csv(prompts_path, index=False)
print(f"\nCriteria also saved to: {prompts_path}")


Criteria also saved to: ../prompts/Criteria_LB_01.csv


## Now we proceed with the other study from Schwabl et al

In [19]:
# Create a sample criteria DataFrame
criteria_data = {
    'criterion_id': [],  
    'type': [],         
    'description': [],  
    'rationale': [],    
    'examples': [],
    'notes': []     

}

criteria_df = pd.DataFrame(criteria_data)

In [21]:
criteria_df.head()

Unnamed: 0,criterion_id,type,description,rationale,examples
0,IC1,inclusion,Automated computational agents,Study must employ automated computational agen...,INCLUDE: Automated bots/scripts that browse pl...
1,IC2,inclusion,Experimental design for algorithmic curation,Study must investigate algorithmic content cur...,INCLUDE: Testing search engines with controlle...
2,EC1,exclusion,Human-dependent studies,Studies that rely on human participants or req...,"EXCLUDE: User studies, surveys, interviews, ma..."
3,EC2,exclusion,Non-experimental or theoretical only,Studies that either lack experimental design o...,"EXCLUDE: Literature reviews, methodology propo..."
4,EC3,exclusion,Static/offline analysis,Studies that analyze content or systems withou...,"EXCLUDE: Static content analysis, offline data..."


In [20]:
# Refined criteria for Agent-Based Testing (ABT) studies
criteria_df = pd.DataFrame()  # Start fresh

criteria_df = add_criterion(
    criteria_df,
    'IC1',
    'inclusion',
    'Automated computational agents',
    'Study must employ automated computational agents that interact with media environments following pre-specified instructions/recipes, without requiring human intervention during execution',
    'INCLUDE: Automated bots/scripts that browse platforms, programmatic interaction with content systems | EXCLUDE: Manual testing, human-operated tools'
)

criteria_df = add_criterion(
    criteria_df,
    'IC2',
    'inclusion',
    'Experimental design for algorithmic curation',
    'Study must investigate algorithmic content curation/personalization through controlled experimental design with systematic variable manipulation and outcome measurement',
    'INCLUDE: Testing search engines with controlled variables, investigating recommendation systems with defined manipulations | KEY ELEMENTS: Variable manipulation (device, location, queries), measurable outcomes'
)

criteria_df = add_criterion(
    criteria_df,
    'EC1',
    'exclusion',
    'Human-dependent studies',
    'Studies that rely on human participants or require human intervention during the testing process',
    'EXCLUDE: User studies, surveys, interviews, manual testing procedures, human-operated testing tools'
)

criteria_df = add_criterion(
    criteria_df,
    'EC2',
    'exclusion',
    'Non-experimental or theoretical only',
    'Studies that either lack experimental design or only discuss ABT conceptually without implementation',
    'EXCLUDE: Literature reviews, methodology proposals without implementation, purely observational studies, system descriptions without testing'
)

criteria_df = add_criterion(
    criteria_df,
    'EC3',
    'exclusion',
    'Static/offline analysis',
    'Studies that analyze content or systems without active interaction with live algorithmic curation systems',
    'EXCLUDE: Static content analysis, offline data analysis, historical data analysis without live testing'
)

criteria_df = add_criterion(
    criteria_df,
    'EC4',
    'exclusion',
    'Insufficient ABT implementation',
    'Studies that claim to use ABT but lack either proper agent automation, variable manipulation, or clear methodology description',
    'EXCLUDE: Studies without clear agent specifications, missing experimental protocols, single-condition tests without variable manipulation'
)

# Display the refined criteria
print("\nRefined Agent-Based Testing Study Criteria:")
display(criteria_df)


Refined Agent-Based Testing Study Criteria:


Unnamed: 0,criterion_id,type,description,rationale,examples
0,IC1,inclusion,Automated computational agents,Study must employ automated computational agen...,INCLUDE: Automated bots/scripts that browse pl...
1,IC2,inclusion,Experimental design for algorithmic curation,Study must investigate algorithmic content cur...,INCLUDE: Testing search engines with controlle...
2,EC1,exclusion,Human-dependent studies,Studies that rely on human participants or req...,"EXCLUDE: User studies, surveys, interviews, ma..."
3,EC2,exclusion,Non-experimental or theoretical only,Studies that either lack experimental design o...,"EXCLUDE: Literature reviews, methodology propo..."
4,EC3,exclusion,Static/offline analysis,Studies that analyze content or systems withou...,"EXCLUDE: Static content analysis, offline data..."
5,EC4,exclusion,Insufficient ABT implementation,Studies that claim to use ABT but lack either ...,EXCLUDE: Studies without clear agent specifica...


In [22]:
criteria_df['notes'] = ''

In [23]:
criteria_df.head()

Unnamed: 0,criterion_id,type,description,rationale,examples,notes
0,IC1,inclusion,Automated computational agents,Study must employ automated computational agen...,INCLUDE: Automated bots/scripts that browse pl...,
1,IC2,inclusion,Experimental design for algorithmic curation,Study must investigate algorithmic content cur...,INCLUDE: Testing search engines with controlle...,
2,EC1,exclusion,Human-dependent studies,Studies that rely on human participants or req...,"EXCLUDE: User studies, surveys, interviews, ma...",
3,EC2,exclusion,Non-experimental or theoretical only,Studies that either lack experimental design o...,"EXCLUDE: Literature reviews, methodology propo...",
4,EC3,exclusion,Static/offline analysis,Studies that analyze content or systems withou...,"EXCLUDE: Static content analysis, offline data...",


In [24]:
# Save the criteria to prompts folder
prompts_dir = '../prompts'
os.makedirs(prompts_dir, exist_ok=True)
output_path = os.path.join(prompts_dir, 'Criteria_SB_01.csv')

# Save to CSV
criteria_df.to_csv(output_path, index=False)
print(f"\nCriteria saved to: {output_path}")


Criteria saved to: ../prompts/Criteria_SB_01.csv


## We need to switch from the Schwabl et al study to the Birkenmaier study due to missing abstracts, thats why we also make an criteria csv for that one

In [25]:
# Create DataFrame structure for criteria
criteria_data = {
    'criterion_id': [],  
    'type': [],         
    'description': [],  
    'rationale': [],    
    'examples': [],
    'notes': []      
}

criteria_df = pd.DataFrame(criteria_data)

In [26]:
# Add inclusion criteria
criteria_df = add_criterion(
    criteria_df,
    'IC1',
    'inclusion',
    'Use of Computational Text Analysis Methods (CTAM)',
    'Study must use algorithms or automated software tools to analyze textual data (written or spoken language). The analysis should involve computational methods beyond simple word counting.',
    'INCLUDE: Machine learning for text classification, automated content analysis, topic modeling | EXCLUDE: Manual content analysis, purely qualitative text analysis'
)

criteria_df = add_criterion(
    criteria_df,
    'IC2',
    'inclusion',
    'Analysis of social science constructs',
    'Study must measure at least one latent social science construct through CTAM. Constructs can range from abstract to multidimensional concepts.',
    'INCLUDE: Sentiment analysis, ideology measurement, communication style analysis, populism detection | EXCLUDE: Simple frequency counts, basic descriptive statistics'
)

criteria_df = add_criterion(
    criteria_df,
    'IC3',
    'inclusion',
    'Political communication focus',
    'Study must address empirical, theory-driven research questions in political communication.',
    'INCLUDE: Analysis of political discourse, media framing of political issues, political ideology detection | EXCLUDE: Technical method papers without political communication focus'
)

# Add exclusion criteria
criteria_df = add_criterion(
    criteria_df,
    'EC1',
    'exclusion',
    'Non-computational methods',
    'Studies that do not use computational methods for text analysis or rely primarily on manual coding.',
    'EXCLUDE: Manual content analysis, purely qualitative discourse analysis, human coding without computational components'
)

criteria_df = add_criterion(
    criteria_df,
    'EC2',
    'exclusion',
    'No social science construct measurement',
    'Studies that do not measure any latent social science constructs or only perform basic text statistics.',
    'EXCLUDE: Word frequency counts only, readability metrics only, simple descriptive text statistics'
)

criteria_df = add_criterion(
    criteria_df,
    'EC3',
    'exclusion',
    'Methodological focus only',
    'Studies that focus solely on developing or comparing CTAM methods without substantive research questions.',
    'EXCLUDE: Papers only comparing different algorithms, technical validation studies without empirical application'
)

criteria_df = add_criterion(
    criteria_df,
    'EC4',
    'exclusion',
    'Non-political communication',
    'Studies that apply CTAM outside the field of political communication.',
    'EXCLUDE: Marketing text analysis, literary analysis, general linguistics studies without political focus'
)


In [27]:
# Save to CSV
prompts_dir = '../prompts'
os.makedirs(prompts_dir, exist_ok=True)
output_path = os.path.join(prompts_dir, 'Criteria_BM_01.csv')
criteria_df.to_csv(output_path, index=False)

print(f"\nCriteria saved to: {output_path}")


Criteria saved to: ../prompts/Criteria_BM_01.csv
