In [1]:
# Load the `dotenv` IPython extension and run its `%dotenv` magic.
# NOTE(review): presumably this reads a local .env file into the environment
# (e.g. the OpenAI API key used further down) - confirm against the project setup.
%load_ext dotenv
%dotenv
# Reload imported modules before each cell executes, so edits to local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import ast
import os
from typing import List, Dict

import pandas as pd
from datasets import load_dataset
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain.prompts import ChatPromptTemplate
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain_community.callbacks import get_openai_callback
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from loguru import logger
from pydantic import BaseModel, Field
from tqdm import tqdm

from df_llm_analyzer import DataFrameLLMAnalyzer



# Load Articles 

In [3]:
# Read the raw articles and normalise the frame in one chain:
#   * the positional index becomes an explicit `id` column,
#   * `article_content` is renamed to `text`, the field name the prompt expects.
df_articles = (
    pd.read_csv("../files/articles_to_process.csv")
    .reset_index(drop=False)
    .rename(columns={'article_content': 'text', 'index': 'id'})
)
df_articles

Unnamed: 0,id,topic,text,publication_date,headline
0,0,Ukraine,"NATO Wants to Show Support for Ukraine, but On...","April 4, 2024","NATO Wants to Show Support for Ukraine, but On..."
1,1,Ukraine,\nA Ukrainian Army soldier in a forest near Ru...,"April 2, 2024","Ukraine’s Arms Industry Is Growing, but Is It ..."
2,2,Ukraine,Dwindling Ammunition Stocks Pose Grave Threat ...,"April 5, 2024",Dwindling Ammunition Stocks Pose Grave Threat ...
3,3,Ukraine,"""A Russian plane shot down with a Patriot miss...","Feb. 8, 2024",Ukraine’s Creative Use of Weapons Carries Prom...
4,4,Ukraine,Invoking World War II on the 80th anniversary ...,"Feb. 2, 2023","As Russia Strikes Ukrainian Civilians, Putin T..."


# Source Identifier

In [4]:
# Prompt for the source-identification step.
# `{text}` is the template's only placeholder and receives the article body.
# The prompt limits the country of each source to: US, Ukraine, Russia, Others.
# This string is sent to the model as-is, so any edit to it changes model behaviour.
source_identifier_template = """
**Instructions for Research Assistant**:

Your mission is to meticulously identify and extract all sources cited in news articles, focusing on both direct and indirect mentions. Capture every mentioned source, paying close attention to the geographical context of each source when available. It is crucial to include only mentions with identifiable sources. Whenever a quote is given, ensure to cite it every time it occurs, even if a source name is mentioned more than one time. If a mention does not clearly reference a source, please do not include it.

**Key Points to Follow**:

1. **Extract and Organize Multiple Sources**: Your goal is to diligently read through the article to identify all instances of direct quotes and indirect references. For each source, include:
   - The source or document name.
   - Their title or the nature of the document.
   - The country of origin if mentioned. Valid options for countries are: US, Ukraine, Russia, and Others.
   - **A comprehensive excerpt that includes the actual statement or information attributed to the source.**

2. **Response Format**: Provide structured responses for each identified source to ensure clarity and completeness. This includes:
   - **Direct Quotes**: Include the full sentence or paragraph where the source directly provides information or opinion.
   - **Indirect References**: Specify the section of text that discusses the findings, opinions, or information attributed to the source, including the context of the mention.
   - **Country Extraction**: Clearly note the country of origin for each source, if mentioned, choosing from the provided valid options: US, Ukraine, Russia, and Others.

3. **Enhanced Examples**:
   - Direct: Dr. Wei, Energy Minister of Russia, stated, "Renewable energy is our future."
     - Source: Dr. Wei (Energy Minister, Russia)
     - Country: Russia
     - Excerpt: "Renewable energy is our future."
   - Indirect: According to the 2020 Global Health Report produced in Switzerland, there has been a significant increase in global health awareness.
     - Source: 2020 Global Health Report (Document, Switzerland)
     - Country: Others
     - Excerpt: "The report highlights a significant increase in global health awareness."

Ensure that the excerpts you provide reflect meaningful content that directly relates to each source’s contribution to the article. If a mention in the article does not include direct information or quotes from the source, it should not be documented.

**Article Content**:
>>>
{text}
<<<
"""


In [5]:
from pydantic import BaseModel, Field
from typing import List

# Structured-output schema for the source-identification step. Elsewhere in this notebook it is
# converted with `convert_pydantic_to_openai_function` and handed to `DataFrameLLMAnalyzer`.
# NOTE(review): the class docstring and the Field descriptions presumably end up in the function
# schema sent to the model, so treat them as prompt text - confirm before rewording them.
# NOTE(review): the three lists are meant to be parallel (entry i of each describes the same
# source), but nothing here enforces equal lengths. The saved outputs in this notebook show one
# article with 24 source names and only 21 countries, so positional alignment cannot be assumed.
class SourceIdentifierFunction(BaseModel):
    """
    Detailed information about all the sources found, including the source name, the associated excerpts from the news article, 
    and the countries of origin for these sources.
    """
    # One entry per identified source (person, organisation, document, unnamed source, ...).
    source_names: List[str] = Field(
        ..., 
        description="List of source names, which can be a person, country, document, organization, anonymous, or unnamed source, etc."
    )
    # The passage of the article attributed to each source (direct quote or indirect reference).
    excerpts: List[str] = Field(
        ..., 
        description="List of excerpts from the article where the sources are mentioned. These can be direct quotes or indirect references, ideally with the geographical context of the sources when available."
    )
    # Country label per source; the description asks for exactly one of
    # 'US', 'Ukraine', 'Russia', 'Others', aligned one-to-one with `source_names`.
    countries: List[str] = Field(
        ..., 
        description="List of countries associated with each source, reflecting the geographical context and origin of the sources when available. This list should correspond one-to-one with the source names list. Valid countries are: 'US', 'Ukraine', 'Russia', and 'Others'."
    )


In [6]:
# Stand-alone chain for the source-identification step: prompt -> model -> parsed arguments.
model_name = "gpt-4-0125-preview"
temperature = 0
prompt_template = source_identifier_template
# OpenAI function schema derived from the pydantic model.
analysis_function = convert_pydantic_to_openai_function(SourceIdentifierFunction)
df_fields = ['text']
prompt = ChatPromptTemplate.from_template(prompt_template)
model = ChatOpenAI(model_name=model_name, temperature=temperature)
# Force the model to answer by calling `analysis_function` rather than with free text.
model = model.bind(functions=[analysis_function], function_call={"name": analysis_function["name"]})
# With a forced function call the arguments arrive in the message's `function_call` payload and
# the text content is empty, so a plain JsonOutputParser (which parses the text content) has
# nothing to parse. JsonOutputFunctionsParser reads the function-call arguments instead.
analysis_chain = prompt | model | JsonOutputFunctionsParser()

  warn_deprecated(


In [25]:
# Directory holding the analyzer's results file. It defaults to the original location so
# existing runs behave the same; set SOURCE_ANALYSIS_RESULTS_DIR (e.g. in the environment or a
# .env file) to run the notebook on another machine without editing this cell.
results_dir = os.environ.get(
    "SOURCE_ANALYSIS_RESULTS_DIR",
    "/Users/carlosmorales/Desktop/Carlos/personal_projects/afganistan_news/results",
)

# NOTE(review): `results_path` is presumably where DataFrameLLMAnalyzer saves and reloads
# already-processed rows - confirm in df_llm_analyzer.
df_analyzer = DataFrameLLMAnalyzer(
    model_name=model_name,
    temperature=temperature,
    prompt_template=prompt_template,
    analysis_function=SourceIdentifierFunction,
    df_fields=['text'],
    results_path=os.path.join(results_dir, "source_analyzer_results.csv"),
)

[32m2024-04-07 15:07:02.850[0m | [34m[1mDEBUG   [0m | [36mdf_llm_analyzer[0m:[36m__init__[0m:[36m59[0m - [34m[1mInitialized LanguageModelDataFrameAnalyzer with model: gpt-4-0125-preview, temperature: 0[0m


In [26]:
# Run the source-identification analysis over the loaded articles and show the result.
# NOTE(review): the log saved under this cell says "No new rows to analyze", so this run
# presumably returned previously saved results instead of calling the model - confirm in
# df_llm_analyzer. The `Unnamed: 0*` columns in the saved output look like a DataFrame index
# written to CSV and read back more than once - verify how the analyzer saves its results.
df_results = df_analyzer.analyze(df=df_articles)
df_results

[32m2024-04-07 15:07:03.527[0m | [1mINFO    [0m | [36mdf_llm_analyzer[0m:[36manalyze[0m:[36m71[0m - [1mNumber of rows to process: 5[0m
[32m2024-04-07 15:07:03.531[0m | [1mINFO    [0m | [36mdf_llm_analyzer[0m:[36m_analyze_in_batches[0m:[36m118[0m - [1mNo new rows to analyze.[0m


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,topic,text,publication_date,headline,source_names,excerpts,countries
0,0,0.0,0,Ukraine,"NATO Wants to Show Support for Ukraine, but On...","April 4, 2024","NATO Wants to Show Support for Ukraine, but On...","['Jens Stoltenberg', 'senior Western diplomats...",['NATO has no appetite for taking on a new mem...,"['Others', 'Others', 'US', 'Ukraine', 'US', 'U..."
1,1,1.0,1,Ukraine,\nA Ukrainian Army soldier in a forest near Ru...,"April 2, 2024","Ukraine’s Arms Industry Is Growing, but Is It ...","['Tyler Hicks', 'United States', 'Ukrainian Ar...",['A Ukrainian Army soldier in a forest near Ru...,"['Others', 'US', 'Ukraine', 'US', 'Ukraine', '..."
2,2,2.0,2,Ukraine,Dwindling Ammunition Stocks Pose Grave Threat ...,"April 5, 2024",Dwindling Ammunition Stocks Pose Grave Threat ...,"['Capt. Vladyslav Slominsky', 'Johan Norberg',...","['“Artillery decides battles,” said Capt. Vlad...","['Ukraine', 'Others', 'US', 'US', 'Others', 'R..."
3,3,,3,Ukraine,"""A Russian plane shot down with a Patriot miss...","Feb. 8, 2024",Ukraine’s Creative Use of Weapons Carries Prom...,"['U.S. officials', 'European partner', 'Russia...",['A Russian plane shot down with a Patriot mis...,"['US', 'Others', 'Russia', 'US', 'Others', 'US..."
4,4,,4,Ukraine,Invoking World War II on the 80th anniversary ...,"Feb. 2, 2023","As Russia Strikes Ukrainian Civilians, Putin T...","['Mr. Putin', 'U.S. and other Western official...",['Invoking World War II on the 80th anniversar...,"['Russia', 'US', 'Others', 'Russia', 'Others',..."


In [27]:
# Full, untruncated source names for every article (each value is the string form of a list).
df_results['source_names'].to_list()

["['Jens Stoltenberg', 'senior Western diplomats', 'Ivo H. Daalder', 'Dmytro Kuleba', 'Antony J. Blinken', 'Christopher G. Cavoli', 'Donald J. Trump']",
 "['Tyler Hicks', 'United States', 'Ukrainian Army soldier', 'C.I.A.', 'Gen. Serhii Dvoretskiy', 'Ivan Bakanov', 'William J. Burns', 'President Vladimir V. Putin', 'Valeriy Kondratiuk', 'John O. Brennan', 'Valentyn Nalyvaichenko', 'Malaysia Airlines Flight 17', 'Kyrylo Budanov', 'Petro Poroshenko', 'General Kyrylo Budanov', 'Joseph R. Biden Jr.', 'Arsen Pavlov', 'Mikhail Tolstykh', 'Maksim Shapoval', 'Marie Yovanovitch', 'Donald J. Trump', 'Mike Pompeo', 'John Bolton', 'Volodymyr Zelensky']",
 "['Capt. Vladyslav Slominsky', 'Johan Norberg', 'Mike Johnson', 'Mike Turner', 'Nicole Tung', 'President Vladimir V. Putin', 'Institute for the Study of War', 'Sgt. Oleksandr Andriyenko', 'President Volodymyr Zelensky', 'Petr Pavel']",
 "['U.S. officials', 'European partner', 'Russian officials', 'American officials', 'Western officials', 'Congre

In [28]:
# Full, untruncated excerpts for every article (each value is the string form of a list).
df_results['excerpts'].to_list()

["['NATO has no appetite for taking on a new member that, because of the alliance’s covenant of collective security, would draw it into the biggest land war in Europe since 1945.', 'What that will be has so far proven elusive, according to senior Western diplomats involved in the discussions.', '“The situation on the ground may look a lot worse than it is today, and then the real question becomes, ‘How do we make sure that Russia doesn’t win?’” said Ivo H. Daalder, a former American ambassador to NATO.', '“I didn’t want to spoil the birthday party for NATO, but I felt compelled to deliver a very sobering message on behalf of Ukrainians about the state of Russian air attacks on my country, destroying our energy system, our economy, killing civilians,” Mr. Kuleba said Thursday at NATO headquarters in Brussels.', 'On Thursday, Secretary of State Antony J. Blinken addressed the issue only by praising the current, American-led process for its “extraordinary results.”', 'Mr. Stoltenberg adde

In [29]:
# Full, untruncated country labels for every article (each value is the string form of a list).
df_results['countries'].to_list()

["['Others', 'Others', 'US', 'Ukraine', 'US', 'US', 'US']",
 "['Others', 'US', 'Ukraine', 'US', 'Ukraine', 'US', 'Russia', 'Ukraine', 'Ukraine', 'Others', 'Ukraine', 'Ukraine', 'Ukraine', 'US', 'Russia', 'Ukraine', 'Ukraine', 'US', 'US', 'US', 'Ukraine']",
 "['Ukraine', 'Others', 'US', 'US', 'Others', 'Russia', 'US', 'Ukraine', 'Ukraine', 'Others']",
 "['US', 'Others', 'Russia', 'US', 'Others', 'US', 'Ukraine', 'Ukraine', 'Ukraine', 'Ukraine', 'Ukraine']",
 "['Russia', 'US', 'Others', 'Russia', 'Others', 'Ukraine', 'Others', 'Others', 'Others', 'Others', 'Others', 'Others', 'Others']"]

In [30]:
# Closed set of labels the prompt asks the model to use for a source's country.
COUNTRY_LABELS = ("US", "Ukraine", "Russia", "Others")


def _parse_country_list(raw):
    """Return one article's per-source country labels as a real list.

    `raw` may be an actual list or the string form of one (the shape the `countries` column
    has in the saved results). Missing values (None / NaN / blank string) give an empty list.
    `ast.literal_eval` is used rather than `eval` so that text read from a file is parsed as a
    literal and never executed.
    """
    if raw is None or (isinstance(raw, float) and raw != raw):  # NaN is the only float != itself
        return []
    if isinstance(raw, str):
        if not raw.strip():
            return []
        raw = ast.literal_eval(raw)
    return list(raw)


def _country_counts(raw):
    """Count exact occurrences of each label in COUNTRY_LABELS for one article."""
    labels = _parse_country_list(raw)
    # Count list elements, not substrings of the stringified list, so a label can never be
    # matched inside a longer one.
    return {country: labels.count(country) for country in COUNTRY_LABELS}


def _country_percentages(raw):
    """Share (0-100) of each label among one article's sources; all zeros if it has none."""
    labels = _parse_country_list(raw)
    total = len(labels)
    # Guard on the parsed list length: the string "[]" is non-empty but holds zero sources.
    return {
        country: labels.count(country) / total * 100 if total else 0
        for country in COUNTRY_LABELS
    }


df_results['country_counts'] = df_results['countries'].apply(_country_counts)
df_results['country_percentages'] = df_results['countries'].apply(_country_percentages)

In [31]:
# Expand the per-article percentage dicts into one numeric column per country label.
country_pct = pd.DataFrame(df_results['country_percentages'].tolist(), index=df_results.index)
# Drop columns left behind by an earlier run of this cell first: joining onto a frame that
# already has them raises "columns overlap", which made the cell fail when re-run.
df_results = df_results.drop(columns=country_pct.columns, errors='ignore').join(country_pct)


In [32]:
# Display the results with the count / percentage columns and the per-country columns added.
df_results

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,topic,text,publication_date,headline,source_names,excerpts,countries,country_counts,country_percentages,US,Ukraine,Russia,Others
0,0,0.0,0,Ukraine,"NATO Wants to Show Support for Ukraine, but On...","April 4, 2024","NATO Wants to Show Support for Ukraine, but On...","['Jens Stoltenberg', 'senior Western diplomats...",['NATO has no appetite for taking on a new mem...,"['Others', 'Others', 'US', 'Ukraine', 'US', 'U...","{'US': 4, 'Ukraine': 1, 'Russia': 0, 'Others': 2}","{'US': 57.14285714285714, 'Ukraine': 14.285714...",57.142857,14.285714,0.0,28.571429
1,1,1.0,1,Ukraine,\nA Ukrainian Army soldier in a forest near Ru...,"April 2, 2024","Ukraine’s Arms Industry Is Growing, but Is It ...","['Tyler Hicks', 'United States', 'Ukrainian Ar...",['A Ukrainian Army soldier in a forest near Ru...,"['Others', 'US', 'Ukraine', 'US', 'Ukraine', '...","{'US': 7, 'Ukraine': 10, 'Russia': 2, 'Others'...","{'US': 33.33333333333333, 'Ukraine': 47.619047...",33.333333,47.619048,9.52381,9.52381
2,2,2.0,2,Ukraine,Dwindling Ammunition Stocks Pose Grave Threat ...,"April 5, 2024",Dwindling Ammunition Stocks Pose Grave Threat ...,"['Capt. Vladyslav Slominsky', 'Johan Norberg',...","['“Artillery decides battles,” said Capt. Vlad...","['Ukraine', 'Others', 'US', 'US', 'Others', 'R...","{'US': 3, 'Ukraine': 3, 'Russia': 1, 'Others': 3}","{'US': 30.0, 'Ukraine': 30.0, 'Russia': 10.0, ...",30.0,30.0,10.0,30.0
3,3,,3,Ukraine,"""A Russian plane shot down with a Patriot miss...","Feb. 8, 2024",Ukraine’s Creative Use of Weapons Carries Prom...,"['U.S. officials', 'European partner', 'Russia...",['A Russian plane shot down with a Patriot mis...,"['US', 'Others', 'Russia', 'US', 'Others', 'US...","{'US': 3, 'Ukraine': 5, 'Russia': 1, 'Others': 2}","{'US': 27.27272727272727, 'Ukraine': 45.454545...",27.272727,45.454545,9.090909,18.181818
4,4,,4,Ukraine,Invoking World War II on the 80th anniversary ...,"Feb. 2, 2023","As Russia Strikes Ukrainian Civilians, Putin T...","['Mr. Putin', 'U.S. and other Western official...",['Invoking World War II on the 80th anniversar...,"['Russia', 'US', 'Others', 'Russia', 'Others',...","{'US': 1, 'Ukraine': 1, 'Russia': 2, 'Others': 9}","{'US': 7.6923076923076925, 'Ukraine': 7.692307...",7.692308,7.692308,15.384615,69.230769


In [33]:
# Columns kept for the final export: article metadata, the extracted sources and excerpts,
# and the per-country percentage columns. Everything else is dropped from this point on.
final_columns = [
    'topic', 'text', 'publication_date', 'headline',
    'source_names', 'excerpts',
    'US', 'Ukraine', 'Russia', 'Others',
]
df_results = df_results[final_columns]

In [34]:
# Display the trimmed frame that is about to be exported.
df_results

Unnamed: 0,topic,text,publication_date,headline,source_names,excerpts,US,Ukraine,Russia,Others
0,Ukraine,"NATO Wants to Show Support for Ukraine, but On...","April 4, 2024","NATO Wants to Show Support for Ukraine, but On...","['Jens Stoltenberg', 'senior Western diplomats...",['NATO has no appetite for taking on a new mem...,57.142857,14.285714,0.0,28.571429
1,Ukraine,\nA Ukrainian Army soldier in a forest near Ru...,"April 2, 2024","Ukraine’s Arms Industry Is Growing, but Is It ...","['Tyler Hicks', 'United States', 'Ukrainian Ar...",['A Ukrainian Army soldier in a forest near Ru...,33.333333,47.619048,9.52381,9.52381
2,Ukraine,Dwindling Ammunition Stocks Pose Grave Threat ...,"April 5, 2024",Dwindling Ammunition Stocks Pose Grave Threat ...,"['Capt. Vladyslav Slominsky', 'Johan Norberg',...","['“Artillery decides battles,” said Capt. Vlad...",30.0,30.0,10.0,30.0
3,Ukraine,"""A Russian plane shot down with a Patriot miss...","Feb. 8, 2024",Ukraine’s Creative Use of Weapons Carries Prom...,"['U.S. officials', 'European partner', 'Russia...",['A Russian plane shot down with a Patriot mis...,27.272727,45.454545,9.090909,18.181818
4,Ukraine,Invoking World War II on the 80th anniversary ...,"Feb. 2, 2023","As Russia Strikes Ukrainian Civilians, Putin T...","['Mr. Putin', 'U.S. and other Western official...",['Invoking World War II on the 80th anniversar...,7.692308,7.692308,15.384615,69.230769


In [35]:
# Output directory for the final CSV. It defaults to the original location so existing runs
# behave the same; set SOURCE_ANALYSIS_RESULTS_DIR (e.g. in the environment or a .env file) to
# export somewhere else without editing this cell.
results_dir = os.environ.get(
    "SOURCE_ANALYSIS_RESULTS_DIR",
    "/Users/carlosmorales/Desktop/Carlos/personal_projects/afganistan_news/results",
)
# index=False keeps the positional index out of the file, so reading it back does not add an
# `Unnamed: 0` column.
df_results.to_csv(os.path.join(results_dir, "final_results.csv"), index=False)

In [40]:
# Excerpts recorded for the article whose index label is 0, fetched with a single
# label-based lookup.
df_results.loc[0, 'excerpts']

"['NATO has no appetite for taking on a new member that, because of the alliance’s covenant of collective security, would draw it into the biggest land war in Europe since 1945.', 'What that will be has so far proven elusive, according to senior Western diplomats involved in the discussions.', '“The situation on the ground may look a lot worse than it is today, and then the real question becomes, ‘How do we make sure that Russia doesn’t win?’” said Ivo H. Daalder, a former American ambassador to NATO.', '“I didn’t want to spoil the birthday party for NATO, but I felt compelled to deliver a very sobering message on behalf of Ukrainians about the state of Russian air attacks on my country, destroying our energy system, our economy, killing civilians,” Mr. Kuleba said Thursday at NATO headquarters in Brussels.', 'On Thursday, Secretary of State Antony J. Blinken addressed the issue only by praising the current, American-led process for its “extraordinary results.”', 'Mr. Stoltenberg added