# Imports

In [None]:
import os
import openai
import pandas as pd
import requests
import random
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from dotenv import load_dotenv
import os
from datetime import datetime

import json
import time
import re

# Set environment variables

In [None]:
# Load key/value pairs from a local .env file into the process environment.
load_dotenv()

# Configure the module-level OpenAI client from the loaded settings.
openai.api_key = os.getenv("api_key")
openai.api_base = os.getenv("api_base")
openai.api_type = os.getenv("api_type")
openai.api_version = os.getenv("api_version")

# The same GPT-4 deployment id is exposed under three names.
deployment_id = gpt_model = model_engine = os.getenv("deployment_id_gpt_4")
embd_model = 'text-embedding-ada-002'

# Mirror the settings under upper-case environment variable names.
# os.environ only accepts strings, so settings that are not defined (None)
# are skipped instead of aborting the whole setup with a TypeError.
os.environ.update({
    target: value
    for target, value in {
        "OPENAI_API_TYPE": os.getenv("api_type"),
        "OPENAI_API_VERSION": os.getenv("api_version"),
        "OPENAI_API_BASE": os.getenv("api_base"),
        "OPENAI_API_KEY": os.getenv("api_key"),
        "BING_API_KEY": os.getenv("BING_SEARCH_V7_SUBSCRIPTION_KEY"),
    }.items()
    if value is not None
})

# None when BING_SEARCH_V7_SUBSCRIPTION_KEY was not provided.
bing_api_key = os.getenv("BING_API_KEY")

# Placeholders for the Bing settings; set_global_variable() overwrites them.
G_BING_SEARCH_V7_SUBSCRIPTION_KEY = "not_set"
G_BING_SEARCH_V7_ENDPOINT = "not_set"


def set_global_variable(subscription_key, search_url):
    """Store the Bing subscription key and endpoint in the module-level settings.

    Rebinds ``G_BING_SEARCH_V7_SUBSCRIPTION_KEY`` to *subscription_key* and
    ``G_BING_SEARCH_V7_ENDPOINT`` to *search_url*.
    """
    global G_BING_SEARCH_V7_SUBSCRIPTION_KEY, G_BING_SEARCH_V7_ENDPOINT
    G_BING_SEARCH_V7_SUBSCRIPTION_KEY, G_BING_SEARCH_V7_ENDPOINT = (
        subscription_key,
        search_url,
    )

# Search topics

In [None]:
# Search topics; the main loop processes them one at a time, in this order.
news_categories = [
    "Energy Transition",
    "Nuclear Energy",
    "Oil & Gas",
    "Renewable Energy",
    "Solar Energy",
]

In [None]:
def extract_h1_tag_text(url):
    """Return the text of the first ``<h1>`` tag on the page at *url*.

    Args:
        url: Address of the page to download.

    Returns:
        The text content of the first ``<h1>`` tag, or ``None`` when the page
        has no ``<h1>`` tag or the page could not be downloaded.
    """
    try:
        # The timeout keeps one unresponsive site from stalling the caller forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Could not fetch {url} for headline extraction: {exc}")
        return None

    # The HTTP status is deliberately not checked: whatever HTML came back is parsed.
    first_h1 = Soup(response.text, 'html.parser').find('h1')
    return first_h1.text if first_h1 is not None else None

def html_to_plain_text(html_string):
    """Parse *html_string* and return only its text.

    Each text fragment is stripped and the fragments are joined with a
    single space.
    """
    return Soup(html_string, 'html.parser').get_text(separator=' ', strip=True)

In [None]:
def get_similar_url(search_term):
    """Search the configured Bing endpoint and return the hits as Markdown links.

    The subscription key and endpoint are read from the module-level
    ``G_BING_SEARCH_V7_SUBSCRIPTION_KEY`` and ``G_BING_SEARCH_V7_ENDPOINT``.

    Args:
        search_term: Query string sent to the endpoint (up to 20 results are
            requested).

    Returns:
        A list of ``"[title](url)"`` strings, one per result, with HTML markup
        removed from the title. Empty when the response carries no ``value``
        list.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            endpoint answers with an HTTP error status.
    """
    headers = {"Ocp-Apim-Subscription-Key": G_BING_SEARCH_V7_SUBSCRIPTION_KEY}
    params = {"q": search_term, "textDecorations": True, "textFormat": "HTML", "count": 20}

    # The timeout prevents a stalled connection from blocking the pipeline.
    response = requests.get(G_BING_SEARCH_V7_ENDPOINT, headers=headers, params=params, timeout=30)
    response.raise_for_status()

    results = response.json().get('value', [])

    # Titles are requested as HTML (textFormat), so they are reduced to plain text.
    return [
        "[" + html_to_plain_text(result['name']) + "](" + result['url'] + ")"
        for result in results
    ]

In [None]:
def gpt_func_for_news_summarization(web_content):
    """Ask the chat model to summarise and classify one scraped news article.

    Args:
        web_content: Plain text of the article; sent as the user message.

    Returns:
        The model's reply as a stripped string, which the prompt requests to be
        a JSON object with the keys ``Source``, ``summary_para``,
        ``summary_bullet``, ``category``, ``sentiment_score``,
        ``sentiment_justification``, ``gen_headline`` and ``specific_region``.
        If the request or the reply handling fails, a JSON string with all of
        those keys set to ``"not available"`` is returned instead.
    """
    # NOTE(review): the prompt is kept verbatim. It contains inconsistencies
    # ("all four information" although eight items are listed; category is
    # described both as a comma separated string and as a list) that may be
    # worth cleaning up after checking the effect on model output.
    system_content = """analyze the given text string passed as user input, you have to find the below information.
        information 1. What is the website name from where the input text is read, give the precise answer only.
        information 2. summary_para : summary in paragraph format within 100 words in a professional way. NOTE: if the input text talks about 'Page Not Found', return summary as "not available".
        information 3. summary_bullet : summary in bullet 7 tp 10 bullet points, this should be in str format not in list format in a professional way. NOTE: if the input text talks about 'Page Not Found', return summary as "not available"..
        information 4. category : find out that the input text belongs to which all of the following Five categories, it can be more than one categories as well, if the text does not belongs to any of the five catagories, catagorize it as others. Append the categories as comma seprated string, the final output should be a string not a list. 
            1. Regulatory and Compliance 
                definition of this category: news about government policies, regulations, and legal developments affecting the energy sector.
            2. Sustainability 
                definition of this category: News related to the environmental impact of energy production and efforts toward sustainability 
            3. Corporate News
                definition of this category: Updates and developments from energy companies, mergers and acquisitions 
            4. Financial
                definition of this category: Financial report of organizations. 
            5. Geopolitics
                definition of this category: International energy-related news, collaborations, and geopolitical factors influencing the energy sector.
         information 5. sentiment score: Analyze the input text and find the sentiment score range within -1 to +1 where -1.0 is negative sentiment and +1.0 positive sentiment, make sure the score is generated precisely within -1.0 to +1.0, nothing else.
         information 6. Justify the sentiment score calculated in 50 words, the output should talk about article instead of text and should be in string format.
         information 7. Create a headline within 10 words based on the text.
         information 8. Analyse the input article and determine that this article belongs to which country, make sure to return one country only. If you are not sure about the answer return "Others".
            
        Instructions:
        - Find all four information as per the instruction.
        - Output the results in JSON format as shown in the example.
        - If any of the information is not found return it as 'not available'.
        - The input text can belong to multiple categories, so it should be in list of categories mentioned above.
        - Present the summaries as strings, not in a list.
        - Make sure that the final output is in JSON FORMAT ONLY.
        - If the input text is about page error 404 or page error 403, return summary_para and summary_bullet "not available"
        - sentiment_score should be a number between -1.0 and +1.0 only, if you are not able to determine the score, return it as 0.0
        - NO EXTRA DATA other than JSON in the output.
        - for specific region If the country name is provided in an abbreviated or alternative form, please convert it to the standard format. For example, replace 'UK' with 'United Kingdom' and 'EU' or 'Europe' with 'European Union' and 'US' or 'USA' with 'United States
        - Take your time to generate the outputs.

        Example JSON format:
        {
            "Source" : <website name>,
            "summary_para" : "",
            "summary_bullet" : "",
            "category" : [],
            "sentiment_score" : <sentiment_score>,
            "sentiment_justification" : "",
            "gen_headline" : <headline>,
            "specific_region" : ""
        }
        """
    conversation = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": web_content},
    ]

    # Every key of the example JSON above, so the placeholder has the same
    # shape as a real answer (including "specific_region").
    placeholder_keys = (
        "Source",
        "summary_para",
        "summary_bullet",
        "category",
        "sentiment_score",
        "sentiment_justification",
        "gen_headline",
        "specific_region",
    )

    try:
        response = openai.ChatCompletion.create(engine=model_engine, messages=conversation, temperature=0.3)
        return response['choices'][0]['message']['content'].strip()
    except Exception as exc:
        # Best effort: one failed request must not abort the whole run, but
        # KeyboardInterrupt/SystemExit are no longer swallowed and the cause
        # is reported instead of being hidden.
        print(f"News summarization failed, returning placeholder values: {exc}")
        return json.dumps({key: "not available" for key in placeholder_keys})

In [None]:
def bing_news_articles(search_term):
    """Query the configured Bing endpoint for news about a particular topic.

    The subscription key and endpoint are read from the module-level
    ``G_BING_SEARCH_V7_SUBSCRIPTION_KEY`` and ``G_BING_SEARCH_V7_ENDPOINT``.
    The request asks for up to 20 results with ``freshness`` set to ``"Week"``.

    Args:
        search_term: Topic to search for.

    Returns:
        The decoded JSON response of the endpoint.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            endpoint answers with an HTTP error status.
    """
    headers = {"Ocp-Apim-Subscription-Key": G_BING_SEARCH_V7_SUBSCRIPTION_KEY}
    params = {
        "q": search_term,
        "textDecorations": True,
        "textFormat": "HTML",
        "freshness": "Week",
        "originalImg": True,
        "count": 20,
    }

    # The timeout prevents a stalled connection from blocking the pipeline.
    response = requests.get(G_BING_SEARCH_V7_ENDPOINT, headers=headers, params=params, timeout=30)
    response.raise_for_status()
    return response.json()

In [None]:
def create_folder(folder_name):
    """Create ``<cwd>/current_bing_websearch/<folder_name>``.

    The parent folder has a fixed name (not a date stamp) and is created when
    missing.

    Args:
        folder_name: Name of the folder to create inside the parent folder.

    Returns:
        ``"already_exists"`` when the nested folder was already present,
        otherwise the parent folder name ``"current_bing_websearch"``.
    """
    parent_folder_name = "current" + "_bing_websearch"

    parent_folder_path = os.path.join(os.getcwd(), parent_folder_name)
    os.makedirs(parent_folder_path, exist_ok=True)

    # Try to create and react to the failure, rather than checking first:
    # this leaves no gap between the existence check and the creation.
    try:
        os.makedirs(os.path.join(parent_folder_path, folder_name))
    except FileExistsError:
        return "already_exists"

    return parent_folder_name

In [None]:
def perform_web_scrapping(url):
    """Load the page at *url* and return its text content.

    The page is loaded through ``RecursiveUrlLoader`` with a 10 second timeout
    and its HTML is reduced to text with BeautifulSoup.

    Args:
        url: Address of the page to scrape.

    Returns:
        The text of the first document the loader returns, or the sentinel
        string ``"Timed out !!!!"`` when loading fails for any reason (not
        only on a timeout) or yields no document.
    """
    try:
        # NOTE(review): max_depth=2 is passed although only the first document
        # is used; confirm whether a smaller depth would give the same text.
        loader = RecursiveUrlLoader(
            url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text, timeout=10
        )
        return loader.load()[0].page_content
    except Exception as exc:
        # Best effort: a page that cannot be scraped is reported and skipped by
        # the caller, but KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"Could not scrape {url}: {exc}")
        return "Timed out !!!!"

In [None]:
def categorize_news(sentiments):
    """Attach a positive/negative/neutral label to each sentiment score.

    A score of 0.3 or more is labelled positive, -0.3 or less negative, and
    everything in between neutral. Each score is shown exactly as it was
    passed in, followed by the label in parentheses.

    Args:
        sentiments: Iterable of scores that ``float()`` can convert.

    Returns:
        A list with one labelled string per score, in input order.
    """
    def _with_label(score):
        value = float(score)
        if value >= 0.3:
            verdict = "Positive news"
        elif value <= -0.3:
            verdict = "Negative news"
        else:
            verdict = "Neutral news"
        return f"{score} ({verdict})"

    return [_with_label(score) for score in sentiments]

In [None]:
def merge_csv_files(input_folder, output_file):
    """Merge the ``news_df.csv`` of every subfolder into one CSV file.

    Args:
        input_folder: Folder whose direct subfolders are searched for a
            ``news_df.csv`` file.
        output_file: Path of the merged CSV file to write (without the index).

    Returns:
        None. When no ``news_df.csv`` is found, a message is printed and no
        file is written.
    """
    # Sorted so the row order of the merged file is the same on every run.
    frames = []
    for subdir in sorted(os.listdir(input_folder)):
        csv_file_path = os.path.join(input_folder, subdir, 'news_df.csv')
        if os.path.isfile(csv_file_path):
            frames.append(pd.read_csv(csv_file_path))

    if not frames:
        print(f'No news_df.csv files found in {input_folder}, nothing to merge')
        return

    # Concatenate once instead of growing a DataFrame inside the loop.
    merged_data = pd.concat(frames, ignore_index=True)

    if 'summary_bullet' in merged_data.columns:
        # Drop the bullet glyphs; missing values (NaN) are left untouched.
        merged_data['summary_bullet'] = merged_data['summary_bullet'].map(
            lambda text: text.replace('•', '') if isinstance(text, str) else text
        )

    merged_data.to_csv(output_file, index=False)
    print(f'Merged data saved to {output_file}')

In [None]:
# Special characters were observed to give bad Bing search results, so search
# terms are reduced to ASCII letters, digits, ',', '.' and whitespace.
# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
_SPECIAL_CHARACTERS_PATTERN = re.compile(r'[^a-zA-Z0-9,.\s]')


def remove_special_characters(input_string):
    """Return *input_string* without special characters.

    Every character that is not an ASCII letter, a digit, a comma, a period
    or whitespace is removed.
    """
    return _SPECIAL_CHARACTERS_PATTERN.sub('', input_string)

In [None]:
def get_specific_category_news(current_category):
    """Collect, summarise and store up to three news articles for one category.

    For each search result the article page is scraped, summarised by the chat
    model, given a headline and enriched with links to similar articles. The
    rows are written to ``<folder>/<current_category>/news_df.csv``.

    Args:
        current_category: Search topic; also used as the folder name.

    Returns:
        The name of the parent output folder, or ``None`` when the folder for
        this category already existed and nothing was extracted.
    """
    columns = ["Category", "News Article", "News channel Source", "region", "Web link", "tag",
               "formatted_timestamp", "headline_text", "generated_headline", "summary_para",
               "summary_bullet", "image_url", "sentiment_score", "sentiment_justification",
               "similar_url"]

    # Used when a search result carries no image of its own.
    fall_back_images = ["https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSxSRYFpmfDLL131rXdBuNFwuZYXqXh7m7pf-PGDDAlHVCK-MtupIHcv9StNPnG5ukIaqE&usqp=CAU",
                        "https://www.financialexpress.com/wp-content/uploads/2023/12/solar2-1.jpg?w=1024",
                        "https://cepa.org/wp-content/uploads/2022/07/2022-03-24T000000Z_753859509_MT1NURPHO000K7054A_RTRMADP_3_GERMANY-ENERGY.jpg",
                        "https://electricalreview.co.uk/wp-content/uploads/2023/12/energy-transition.jpg"]

    max_articles = 3
    min_article_length = 350

    folder_name = create_folder(current_category)

    if folder_name == "already_exists":
        print("Today's News is already extracted")
        return

    search_results = bing_news_articles(current_category)

    # One dict per fully processed article. A row is added only after every
    # step succeeded, so a skipped article can never leave partial data behind
    # that would be paired with the values of a later article.
    rows = []

    for article in search_results["value"]:
        news_url = article["url"]
        print("\n\n------------------------------------ >>>>>>")
        print(f"{current_category}: {news_url}")

        news_txt = perform_web_scrapping(news_url)

        news_txt_len = len(news_txt)
        print(f"-------- news txt len = {news_txt_len}--------------")
        if news_txt_len < min_article_length:
            print("Since not enough text available, skipping this news article.")
            continue

        print("%%%%%%%%%%%%%")
        print(article)
        print("%%%%%%%%%%%%%")

        # Summarise first: articles the model cannot handle are skipped entirely.
        raw_summary = gpt_func_for_news_summarization(news_txt)
        print(raw_summary)
        try:
            news_summary = json.loads(raw_summary)
        except (TypeError, ValueError) as exc:
            # json.JSONDecodeError is a ValueError; the model may answer with non-JSON text.
            print(f"Skipping this news article, the summary is not valid JSON: {exc}")
            continue

        if not isinstance(news_summary, dict) or news_summary.get("summary_para", "not available") == "not available":
            print(f"<<<<<< ------------------------------------{len(rows)} ")
            continue

        publish_date = pd.to_datetime(article["datePublished"])
        formatted_time_val = publish_date.strftime('%d %B %Y')

        image_link = (article.get('image') or {}).get("contentUrl") or random.choice(fall_back_images)

        # Prefer the page's own <h1>; fall back to the generated headline when
        # the page has none, cannot be fetched, or shows an error page.
        try:
            headline = extract_h1_tag_text(news_url)
        except requests.RequestException as exc:
            print(f"Could not read the headline of {news_url}: {exc}")
            headline = None
        headline = (headline or "").strip()
        if not headline or "error" in headline.lower():
            headline = news_summary.get("gen_headline", "not available")

        # Similar links are an optional extra: a failed lookup must not lose the article.
        similar_query = remove_special_characters(headline + ', ' + formatted_time_val)
        try:
            sim_urls = get_similar_url(similar_query)
        except (requests.RequestException, KeyError) as exc:
            print(f"Could not look up similar articles: {exc}")
            sim_urls = []
        # The first hit is skipped - presumably it is the article itself (TODO confirm).
        similar_urls = "\n".join(sim_urls[1:4])

        # The prompt asks for 0.0 when no score can be determined; apply the
        # same default when the model returns something that is not a number.
        try:
            sentiment_label = categorize_news([news_summary.get("sentiment_score", 0.0)])[0]
        except (TypeError, ValueError):
            sentiment_label = categorize_news([0.0])[0]

        rows.append({
            "Category": current_category,
            "News Article": article["description"],
            "News channel Source": article["provider"][0]["name"],
            "region": news_summary.get("specific_region", "not available"),
            "Web link": news_url,
            "tag": news_summary.get("category", "not available"),
            "formatted_timestamp": formatted_time_val,
            "headline_text": headline,
            "generated_headline": news_summary.get("gen_headline", "not available"),
            "summary_para": news_summary["summary_para"],
            "summary_bullet": news_summary.get("summary_bullet", "not available"),
            "image_url": image_link,
            "sentiment_score": sentiment_label,
            "sentiment_justification": news_summary.get("sentiment_justification", "not available"),
            "similar_url": similar_urls,
        })

        print(f"------------------------------------{len(rows)} >>>>>>>>")
        if len(rows) >= max_articles:
            break

    news_df = pd.DataFrame(rows, columns=columns)

    print(news_df)
    # NOTE(review): the row index is written as an extra unnamed column, as
    # before; confirm whether consumers of news_df.csv rely on it.
    news_df.to_csv(os.path.join(folder_name, current_category, "news_df.csv"))

    return folder_name

# Main function

In [None]:
if __name__ == "__main__":
    # Read the Bing settings, run the pipeline for every category and merge
    # the per-category CSV files into one file.
    load_dotenv()
    subscription_key = os.getenv("BING_SEARCH_V7_SUBSCRIPTION_KEY")
    search_url = os.getenv("BING_SEARCH_V7_ENDPOINT_NEWS")

    # Stop before any folder is created: a run without credentials would
    # otherwise leave an empty category folder behind.
    if not subscription_key or not search_url:
        raise SystemExit(
            "BING_SEARCH_V7_SUBSCRIPTION_KEY and BING_SEARCH_V7_ENDPOINT_NEWS must be set"
        )

    # Make the settings available to the search helpers.
    set_global_variable(subscription_key, search_url)

    start_time = time.time()

    for cat in news_categories:
        print(">>> searching for :", cat)
        folder_name = get_specific_category_news(cat)

        # None means this category was already extracted; there is no folder
        # name to build paths from and nothing new to merge.
        if folder_name is None:
            continue

        merge_csv_files(folder_name, os.path.join(folder_name, 'merged_news.csv'))

    # Measured once after the loop so it is defined even for an empty category list.
    elapsed_time = time.time() - start_time

    print(f"Execution time: {elapsed_time} seconds")