In [14]:
# Testing LLM classifyer with ROAR Section B 


from langchain import PromptTemplate
from langchain.chat_models import PromptLayerChatOpenAI
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain, StuffDocumentsChain
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.docstore.document import Document
import os
import promptlayer
import pandas as pd


gpt_model = "gpt-4"


openai_api_key = "Key"
promptlayer_api_key= "Key"


# Set the PromptLayer API key
promptlayer.api_key = promptlayer_api_key

# Assuming 'promptlayer.openai' is the correct way to access the OpenAI configuration within promptlayer,
# set the OpenAI API key
promptlayer.openai.api_key = openai_api_key





In [15]:
#Parameters
#Select the csv filename for modelling
filename = "test.csv"
topic_filename = "Topic.csv"

In [16]:
# Read data 
df = pd.read_csv(filename)
captions = df['Caption'].iloc[1:]
doc_captions = [Document(page_content=t) for t in captions]

In [17]:
# Read Existing Topics from CSV
df = pd.read_csv(topic_filename)
existing_topics = df["topics"]

if len(existing_topics)!=1:
    topics_combined = '\n\n'.join(existing_topics.astype(str))
else:
    topics_combined = ''
    
print(f"Topics List: {topics_combined}")

Topics List: Geopolitical tensions

Covid

diseases

Low economic diversification

environmental deterioration

disaster

poverty

political instability

deterioration of the security situation

pandemic

Lack of funding

Inflation

volatile economic growth.

Gender Based Violene

Low human capacity


In [20]:
# Map Prompt

llm = PromptLayerChatOpenAI(model=gpt_model, pl_tags=["Classifier"], openai_api_key="Key")
map_template = """ Prompt
"""

map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm,prompt=map_prompt)


# Function to extract challenges
def extract_challenges(output):
    if 'challenges' in output:
        return output['challenges']
    elif 'text' in output:
        challenges_text = output['text']
        return challenges_text.split('\n')
    return []

# Placeholder for DataFrame rows
rows = []

# Extracting text content from each Document
captions = [doc.page_content for doc in doc_captions]

# Process each caption individually
for caption in captions:
    # Adjust this line to correctly invoke your model for a single caption and capture the output
    output = map_chain.invoke({"captions": caption})
    
    challenges = extract_challenges(output)
    
    # Add caption and its challenges to the rows list
    row = [caption] + challenges
    rows.append(row)

# Determine the maximum number of columns needed
max_cols = max(len(row) for row in rows)

# Create column headers
columns = ['Caption'] + [f'Challenge {i+1}' for i in range(1, max_cols)]

# Create DataFrame
df = pd.DataFrame(rows, columns=columns)

# Fill any missing values with empty strings
df.fillna('', inplace=True)

# Save the DataFrame to CSV
csv_filename = 'captions_and_individual_challenges.csv'
df.to_csv(csv_filename, index=False)

In [22]:
#Reduce Prompt and Chain
#The following prompt is for the "reduce" step of the algorithm. 
# It operates against the entire set of output that is produced by the "map" chain.
#In our example, the map chain yields a set of topics that are defined on the caption it was run on. These are then grouped together and passed as a result to this reduce chain. This reduce chain is designed to take this global output of the map step and reduce it down to a final set of unique fertility topics that minimized contextual overlap.                                                                                                                                
reduce_template = """Prompt
"""

output_parser = CommaSeparatedListOutputParser()
format_instructions = output_parser.get_format_instructions()


reduce_prompt = PromptTemplate(template=reduce_template,input_variables=["topics"],partial_variables={"format_instructions":format_instructions})
#reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain=LLMChain(llm=llm,prompt=reduce_prompt)

combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="topics"
)

reduce_documents_chain = ReduceDocumentsChain(
    combine_documents_chain=combine_documents_chain,
    collapse_documents_chain=combine_documents_chain,
    token_max=4000)


map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=map_chain,
    reduce_documents_chain=reduce_documents_chain,
    document_variable_name="captions",
    return_intermediate_steps=False)

In [23]:
#Run Map-Reduce Chain
#in this step we run the actual MapReduceDocumentsChain, 
# this will then begin to process all of the captions that were read in from the CSV file and perform the
# "map" step on each of them. Once the "map" step is completed opn all of them it will then run the final
#"reduce" step and output a final list of topics.

output = map_reduce_chain.invoke(doc_captions)


In [28]:
# Printing out the New Topics and Inspect total number of topics
text_to_parse = output['output_text']

# Use the parsing method to split the string into a list of topics
new_topics = text_to_parse.split(", ")

# Display the results
print(f"Generated {len(new_topics)} new topics")
for topic in new_topics:
    print(topic)

Generated 31 new topics
Political Instability
Socio-economic Challenges
Impacts of External Shocks
Macroeconomic Degradation
Limited Economic Growth
Effects of Climate Change
Difficulties in Resource Mobilization
Security Concerns
Governance Challenges
Low Human Development Index
Urban and Monetary Poverty
Budget Deficit and Social Spending
High Public Debt
Inflation Control
Access to International Financing and Global Demand Uncertainty
GDP per Capita
Human Capacity Development
Environmental Challenges
Internal Displacement and Humanitarian Needs
Terrorism
Democratic Process Absence
Constitutional Revisions
Financial Constraints
Economic Diversification
Market Downturns
Private Sector Development
Trade and Investment Opportunities
Policy and Regulatory Framework Implementation
Data and Evidence Availability
Impact Assessments
Application of Foresight.


In [39]:
#In the following step, we use a new separate chain to eliminate any new topics which are already covered or similar to topics that were read in at the start of processing.

eliminate_duplicates_template = """Prompt
""" 


prompt_template = PromptTemplate(template=eliminate_duplicates_template, input_variables=["new_topics","existing_topics"],partial_variables={"format_instructions":format_instructions})
llm_chain = LLMChain(llm=llm,prompt=prompt_template)

In [40]:
# Running the model to merge duplicated topics and for summary

prompt_template = PromptTemplate(template=eliminate_duplicates_template, input_variables=["new_topics","existing_topics"],partial_variables={"format_instructions":format_instructions})
llm_chain = LLMChain(llm=llm,prompt=prompt_template)
output2 = llm_chain.predict(new_topics=output,existing_topics=topics_combined)


In [41]:
# check output 
print(output2)

`Socio-economic Challenges, Impacts of External Shocks, Macroeconomic Degradation, Limited Economic Growth, Difficulties in Resource Mobilization, Governance Challenges, Urban and Monetary Poverty, Budget Deficit and Social Spending, High Public Debt, Inflation Control, Access to International Financing and Global Demand Uncertainty, GDP per Capita, Human Capacity Development, Internal Displacement and Humanitarian Needs, Terrorism, Democratic Process Absence, Constitutional Revisions, Financial Constraints, Market Downturns, Private Sector Development, Trade and Investment Opportunities, Policy and Regulatory Framework Implementation, Data and Evidence Availability, Impact Assessments, Application of Foresight.`


In [43]:
# Print New Topics
new_topics = output_parser.parse(output2)
print(f"Generated final set of {len(new_topics)} topics.")
print(new_topics)


Generated final set of 25 topics.
['`Socio-economic Challenges', 'Impacts of External Shocks', 'Macroeconomic Degradation', 'Limited Economic Growth', 'Difficulties in Resource Mobilization', 'Governance Challenges', 'Urban and Monetary Poverty', 'Budget Deficit and Social Spending', 'High Public Debt', 'Inflation Control', 'Access to International Financing and Global Demand Uncertainty', 'GDP per Capita', 'Human Capacity Development', 'Internal Displacement and Humanitarian Needs', 'Terrorism', 'Democratic Process Absence', 'Constitutional Revisions', 'Financial Constraints', 'Market Downturns', 'Private Sector Development', 'Trade and Investment Opportunities', 'Policy and Regulatory Framework Implementation', 'Data and Evidence Availability', 'Impact Assessments', 'Application of Foresight.`']


In [44]:
#Merge the existing topics list with the new set of topics
existing_topics_lower = [item.lower() for item in existing_topics]
new_topics_lower = [item.lower() for item in new_topics]

merged_topics_list_lower = existing_topics_lower + new_topics_lower
unique_topics_list_lower = list(set(merged_topics_list_lower))

final_topics_list = []
for item in unique_topics_list_lower:
    if item in existing_topics_lower:
        final_topics_list.append(existing_topics[existing_topics_lower.index(item)])
    elif item in new_topics_lower:
        final_topics_list.append(new_topics[new_topics_lower.index(item)])

print(final_topics_list)

df = pd.DataFrame({'topics':final_topics_list})
df.to_csv(topic_filename,index=False)

['diseases', 'environmental deterioration', 'Difficulties in Resource Mobilization', 'deterioration of the security situation', 'Application of Foresight.`', 'Policy and Regulatory Framework Implementation', 'Market Downturns', 'Democratic Process Absence', 'volatile economic growth.', 'Low human capacity', 'Constitutional Revisions', 'Covid', 'Inflation', 'Financial Constraints', 'Budget Deficit and Social Spending', 'Gender Based Violene', 'poverty', 'political instability', 'Inflation Control', 'Low economic diversification', 'Private Sector Development', 'Human Capacity Development', 'Internal Displacement and Humanitarian Needs', 'Lack of funding', 'Trade and Investment Opportunities', 'Urban and Monetary Poverty', 'Terrorism', 'Macroeconomic Degradation', 'disaster', 'GDP per Capita', 'Governance Challenges', '`Socio-economic Challenges', 'Access to International Financing and Global Demand Uncertainty', 'High Public Debt', 'pandemic', 'Limited Economic Growth', 'Impact Assessmen