### Extract category, criteria, and examples from directory --> dict_ctg

In [None]:
       # Ensure your OpenAI API key is set
import pandas as pd
import json
import os

from langchain import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI


# Prefer a key that is already exported in the environment. The literal below
# is only a placeholder fallback: replace it (or export OPENAI_API_KEY) before
# making any OpenAI call. Using setdefault keeps a real key from being
# overwritten by the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "your_openai_api_key")
openai_api_key = os.getenv("OPENAI_API_KEY")

In [None]:
                   
# Chat model passed to ChatOpenAI ("gpt-3.5-turbo" is the alternative noted here).
model_name = "gpt-4o"

# Generation settings kept as notebook-level globals.
# NOTE: gpt_response passes temperature=0 explicitly, so `temperature` below is
# not the value used for annotation; `max_tokens` is used as the reply cap.
max_tokens = 100
temperature = 0.7

In [5]:
import os
import re


directory = 'pol_criteria_examples/txtFiles'  # Replace with the path to your directory

# The section layout is the same for every file, so the patterns are compiled
# once instead of being rebuilt per file. "Left", "Center" and "Right" each run
# from their header line to the first blank line; the examples section runs to
# the end of the file. DOTALL lets "." span line breaks inside a section.
patterns = {
    "Left": re.compile(r"Left:\n(.*?)\n\n", re.DOTALL),
    "Center": re.compile(r"Center:\n(.*?)\n\n", re.DOTALL),
    "Right": re.compile(r"Right:\n(.*?)\n\n", re.DOTALL),
    "Chain_of_thought_examples": re.compile(r"Chain_of_thought_examples:\n(.*)", re.DOTALL),
}

# Maps category name (first word of a file, colons removed) -> {section: text}.
dict_ctg = {}
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)

    # Only regular files hold criteria; skip sub-directories.
    if not os.path.isfile(file_path):
        continue

    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # The category name is the first whitespace-separated word of the file.
    # An empty or whitespace-only file has none, so skip it rather than
    # failing with IndexError on words[0].
    words = content.split()
    if not words:
        continue
    first_word = words[0].replace(':', '')

    # Keep only the sections that are actually present in this file.
    output_dict = {}
    for key, pattern in patterns.items():
        match = pattern.search(content)
        if match:
            output_dict[key] = match.group(1).strip()

    # category info:
    dict_ctg[first_word] = output_dict



In [6]:
dict_ctg

{'Football': {'Left': "Football news articles with a left-leaning perspective often emphasize themes of social justice, equality, and inclusivity. They may focus on stories that highlight the role of football in promoting diversity, such as initiatives to increase representation of women, ethnic minorities, and LGBTQ+ individuals in the sport. These articles may also criticize practices within football that perpetuate inequality or exploitation, such as the treatment of migrant workers in host countries for major tournaments. Additionally, left-leaning articles might advocate for players' rights, support for grassroots football, and the importance of community engagement through the sport. Economic aspects discussed might include calls for fairer distribution of revenue within leagues and support for lower-tier clubs.",
  'Center': 'Center-leaning football news articles typically strive for a balanced perspective, presenting multiple viewpoints on issues within the sport. These article

### gpt_response function

In [7]:
 def gpt_response(category, criteria_string, Examples, text):
     """Ask the chat model for the political lean of one article.

     Fills the prompt below with the category, the lean criteria, the worked
     examples and the article text, runs it through an LLMChain backed by
     ChatOpenAI (model ``model_name``, temperature 0, capped at ``max_tokens``),
     and returns the reply with surrounding whitespace removed. The prompt asks
     the model to answer with a "Lean:" line followed by a "Description:" line.
     """
     template_text ="""
    You will be provided with a news article about {category}. 
    Your role is to determine the political lean of the articles. 
    Use the following this step-by-step approach:
    
    1. Identify key points in the article.
    2. Determine political lean indicators in the article based on the criteria: {criteria_string}
    3. Conclude the overall political lean.
    
    Examples: {Examples}
    
    Article : {text}
    
    The output should first specify the political lean (Left, Center, or Right), 
    followed by a brief description of the political lean indicators, limited to 50 words.
    Please respond in the exact format provided below:
    
    Lean: 
    Description: 
    """

     # Values substituted into the template placeholders, keyed by placeholder name.
     prompt_inputs = {
         "category": category,
         "criteria_string": criteria_string,
         "Examples": Examples,
         "text": text,
     }

     # Prompt template whose input variables are exactly the placeholders above.
     filled_prompt = PromptTemplate(
         input_variables=list(prompt_inputs),
         template=template_text,
     )
     # Chat model configured from the notebook-level settings.
     chat_model = ChatOpenAI(
         model_name=model_name,
         openai_api_key=openai_api_key,
         temperature=0,
         max_tokens=max_tokens,
     )
     # Chain the prompt to the model and run it once for this article.
     annotation_chain = LLMChain(prompt=filled_prompt, llm=chat_model)
     reply = annotation_chain.run(**prompt_inputs)
     return reply.strip()


# x= gpt_annotation(category, criteria_string, Examples, text)

# x

## MIND: Perform annotation and create new df with annotation and description

In [8]:
def _parse_lean_response(response):
    """Extract the lean label and its description from a raw model reply.

    Lines are matched by their label ("Lean" / "Description", case-insensitive)
    rather than by position, so extra or wrapped lines in the reply cannot
    shift values into the wrong column or run past the list of expected keys.
    The first non-empty value for each label wins. A label that is absent (or
    has no value) yields None, so the caller can spot it with a NaN check.
    """
    label_to_column = {"lean": "Politics", "description": "Description"}
    parsed = dict.fromkeys(label_to_column.values())
    for raw_line in response.splitlines():
        label, colon, value = raw_line.partition(":")
        column = label_to_column.get(label.strip().lower())
        value = value.strip()
        if colon and column and value and parsed[column] is None:
            parsed[column] = value
    return parsed


def annotation_MIND(df):
    """Annotate every article in ``df`` with a political lean and a description.

    For each row, the row's ``topic`` (capitalized) selects an entry of
    ``dict_ctg``; its Left/Center/Right texts become the criteria string and
    its ``Chain_of_thought_examples`` text becomes the examples passed to
    ``gpt_response`` together with the article text.

    Parameters:
        df: DataFrame with the columns ``news``, ``topic`` and ``text``.

    Returns:
        A new DataFrame with the columns ``news``, ``category``, ``text``,
        ``Politics`` and ``Description``, one row per input row. ``Politics``
        or ``Description`` is None when the reply did not contain that field.

    Raises:
        KeyError: if a row's capitalized topic is not in ``dict_ctg`` or the
            entry has no ``Chain_of_thought_examples`` section.
    """
    column_order = ["news", "category", "text", "Politics", "Description"]

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # concatenating a one-row frame per article copies the whole frame each time.
    annotated_rows = []
    for _, row in df.iterrows():
        category = row['topic']
        text = row['text']

        # Extract criteria: every section except the examples, one per line.
        small_dict = dict_ctg[category.capitalize()]
        criteria_string = "".join(
            f"{lean}: {criteria}\n"
            for lean, criteria in small_dict.items()
            if lean != "Chain_of_thought_examples"
        )

        # Extract examples.
        Examples = small_dict["Chain_of_thought_examples"]

        # Call the annotation function and parse its two-field reply.
        response = gpt_response(category, criteria_string, Examples, text)
        parsed = _parse_lean_response(response)

        annotated_rows.append({
            "news": row['news'],
            "category": category,
            "text": text,
            "Politics": parsed["Politics"],
            "Description": parsed["Description"],
        })

    return pd.DataFrame(annotated_rows, columns=column_order)

    
    # Extracting criteria and examples
    

In [None]:
# Location of the MIND news CSV.
csv_upd = "data/MIND_news.csv"

# Load the CSV into a pandas DataFrame and show it as the cell output.
df = pd.read_csv(filepath_or_buffer=csv_upd)
df

In [53]:
import numpy as np

# Number of roughly equal chunks to annotate separately.
n_splits = 10

# Split the row positions rather than the DataFrame itself: handing a DataFrame
# to np.array_split goes through DataFrame.swapaxes, which recent pandas
# versions deprecate. Chunk sizes are unchanged (the first len(df) % n_splits
# chunks get one extra row) and each chunk keeps its original index labels.
dfs = [
    df.iloc[positions]
    for positions in np.array_split(np.arange(len(df)), n_splits)
]

# Summarise the chunks; labels are 1-based (df_1 ... df_<n_splits>).
split_dfs_info = pd.DataFrame(
    [
        {"DataFrame": f"df_{number}", "Rows": len(chunk)}
        for number, chunk in enumerate(dfs, start=1)
    ]
)

# Print the information about the split DataFrames
print(split_dfs_info)


  DataFrame  Rows
0      df_1  5127
1      df_2  5127
2      df_3  5126
3      df_4  5126
4      df_5  5126
5      df_6  5126
6      df_7  5126
7      df_8  5126
8      df_9  5126
9     df_10  5126


In [115]:
df_new= annotation_MIND(dfs[9])

In [116]:
df_n= df_new.copy()

In [117]:
df_n

Unnamed: 0,news,category,text,Politics,Description
0,N27940,politics,Democrats discussing multiple articles of impe...,Left,The article focuses on House Democrats discuss...
1,N57753,football,Joey Yellen suffers early shoulder injury and ...,Center,The article focuses on the factual reporting o...
2,N46251,baseball,UNI basketball: What we learned from the Panth...,Center,"The article focuses on game analysis, team per..."
3,N59000,basketball,"Cubs' biggest division rivals, the Brewers and...",Center,The article focuses on the competitive dynamic...
4,N59529,moresports,"Suns Solar Panel, ep. 167: Stay calm and Monty...",Center,The article focuses on the performance and ach...
...,...,...,...,...,...
5121,N16909,weather,"Adapting, Learning And Soul Searching: Reflect...",Center,The article focuses on the impact of the Wools...
5122,N47585,lifestyle,Family says 13-year-old Broadway star died fro...,Center,The article focuses on a human interest story ...
5123,N7482,moresports,St. Dominic soccer player tries to kick cancer...,Center,The article focuses on a human interest story ...
5124,N34418,moresports,"How the Sounders won MLS Cup. Mark, Jeremiah a...",Center,The article focuses on the performance and ach...


In [118]:
# Flag every row that holds at least one missing value ...
rows_with_nan = df_n.isna().any(axis="columns")

# ... then count the flagged rows and show the count as the cell output.
num_rows_with_nan = rows_with_nan.sum()
num_rows_with_nan

0

In [119]:
# Output folder for the annotated chunks. Create it if needed so to_csv does
# not fail when the folder is missing.
path = "politics/"
os.makedirs(path, exist_ok=True)

# Write the annotated frame to politics/df_10.csv without the row index.
file = os.path.join(path, "df_10.csv")
df_n.to_csv(file, index=False)

In [120]:
df_n = pd.read_csv(file)
df_n

Unnamed: 0,news,category,text,Politics,Description
0,N27940,politics,Democrats discussing multiple articles of impe...,Left,The article focuses on House Democrats discuss...
1,N57753,football,Joey Yellen suffers early shoulder injury and ...,Center,The article focuses on the factual reporting o...
2,N46251,baseball,UNI basketball: What we learned from the Panth...,Center,"The article focuses on game analysis, team per..."
3,N59000,basketball,"Cubs' biggest division rivals, the Brewers and...",Center,The article focuses on the competitive dynamic...
4,N59529,moresports,"Suns Solar Panel, ep. 167: Stay calm and Monty...",Center,The article focuses on the performance and ach...
...,...,...,...,...,...
5121,N16909,weather,"Adapting, Learning And Soul Searching: Reflect...",Center,The article focuses on the impact of the Wools...
5122,N47585,lifestyle,Family says 13-year-old Broadway star died fro...,Center,The article focuses on a human interest story ...
5123,N7482,moresports,St. Dominic soccer player tries to kick cancer...,Center,The article focuses on a human interest story ...
5124,N34418,moresports,"How the Sounders won MLS Cup. Mark, Jeremiah a...",Center,The article focuses on the performance and ach...


In [81]:
df_n["Politics"].value_counts()

Politics
Center    4502
Left       433
Right      191
Name: count, dtype: int64