### Install necessary libraries

In [None]:
# !pip install -q pandas sumy openai

In [None]:
import ast
import itertools
import random

import nltk
import openai
import pandas as pd
import spacy
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer

nltk.download('punkt')
openai.api_key = 'API_KEY'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the input news articles; later cells read the article body from the 'text' column.
df = pd.read_csv('/content/drive/MyDrive/sample_input_news.csv')

### Summarization with Sumy

In [None]:
def summarize(text, language="english", sentences_count=5):
    """Produce an extractive LSA summary of ``text`` using Sumy.

    Args:
        text: Plain-text document to summarize.
        language: Language name handed to Sumy's ``Tokenizer``.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences converted to strings and joined with single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer(language)).document
    lsa = LsaSummarizer()
    picked_sentences = lsa(document, sentences_count)
    return ' '.join(map(str, picked_sentences))

# Example: summarize the first article.
# .iloc[0] selects the first row by position, so this still works when the
# DataFrame index does not contain the label 0 (e.g. after filtering or re-indexing).
first_article_summary = summarize(df['text'].iloc[0], sentences_count=10)

In [None]:
# Display the example summary produced above for a quick sanity check.
first_article_summary

"Companies like Nvidia, AMD and Microsoft (MSFT) tap the emerging market for generative AI. Generative AI can create content, including written articles, from simple phrases by analyzing vast amounts of data. Investors generally should focus on stocks with Comp Ratings of 90 or even 95 and above. Nvidia stock often earns a spot on the IBD 50, Big Cap 20 and Sector Leaders lists. On Nov. 21, Nvidia disclosed earnings rocketed 593% in the third quarter and revenue soared 206%, an overall beat. For the full year, analysts now expect Nvidia earnings to rebound 264% as sales jump 118%. The fabless chipmaker pioneered graphics processing units, or GPUs, to make video games more realistic. Nvidia's GPUs act as accelerators for central processing units, or CPUs, made by other companies. Earnings should more than triple this fiscal year, driven by booming chip sales for data centers and artificial intelligence. Bottom line: Nvidia stock is not a buy right now, but it could be soon."

#### Apply summarization to data

In [None]:
# Summarize every article down to six sentences and keep the result next to the source text.
df['summary'] = [summarize(article_text, sentences_count=6) for article_text in df['text']]

### Generate article combinations

In [None]:
# Pair every article id (its DataFrame index label) with its Sumy summary.
# The 'summary' column is used rather than the full 'text' so the prompt really
# contains summaries, matching its "Article i Summary" labels.
summaries = list(zip(df.index, df['summary']))

# Parameters set by the user
n = 2  # Number of articles to combine for each QA
k = 10  # Number of QA pairs to generate

# Generate random combinations of articles.
# Sampling at most as many combinations as exist avoids the ValueError that
# random.sample raises when k is larger than the population.
article_combinations = list(itertools.combinations(summaries, n))
random_combinations = random.sample(article_combinations, min(k, len(article_combinations)))

### Generate prompt

In [None]:
def generate_prompt_for_gpt(combination):
    """Build the prompt asking the model for one financial Q/A pair.

    Args:
        combination: Iterable of ``(article_id, text)`` pairs. Only the text is
            placed in the prompt; the ids are ignored here.

    Returns:
        The instruction text followed by the articles, each labelled
        ``Article <i> Summary:`` (1-based) and separated by blank lines, with a
        trailing blank line.
    """
    articles_text = "\n\n".join(
        f"Article {position} Summary: {text}"
        for position, (_, text) in enumerate(combination, start=1)
    )
    # Adjacent string literals are concatenated verbatim, so every sentence must
    # end with its own separator (newline or space); otherwise sentences run
    # together, e.g. "asks.Each question".
    instructions = (
        "Based on the combined information from the following summaries, generate a financial reasoning question. "
        "The question should be financial and related to the companies mentioned, and formatted to be concise, short "
        "and shouldn't mention that its based on something or an article.\n"
        "Do not pick a question that cannot be answered from the given summaries. It should be simple enough and "
        "should mimic the kind of financial reasoning questions that an analyst asks.\n"
        "Each question generated must also have an associated answer, and provide a detailed answer that requires "
        "inference from multiple articles.\n"
        "The generated Q/A pair should be generated as a tuple containing the question and the answer. "
        "Your reply should ONLY be the Q/A pair as a tuple, nothing else.\n"
    )
    return f"{instructions}{articles_text}\n\n"

### Generate QA Pairs and Document References

In [None]:
def generate_qa_pair(prompt):
    """Send ``prompt`` to the chat completions API and return the reply text.

    Args:
        prompt: Full prompt text, sent as a single user message.

    Returns:
        The content of the first returned choice with surrounding whitespace
        stripped.
    """
    user_message = {'role': 'user', 'content': prompt}
    request_options = {
        'model': 'gpt-4',
        'messages': [user_message],
        'temperature': 0.5,
        'top_p': 1.0,
        'frequency_penalty': 0.0,
        'presence_penalty': 0.0,
        # Generation is cut off at the first blank line in the reply.
        'stop': ["\n\n"],
    }
    completion = openai.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

### Generate Q/A Dataset

In [None]:
# Empty result table: one row per generated Q/A pair plus the ids of the articles it was built from.
qa_df = pd.DataFrame(columns = ['question','answer', 'references'])

In [None]:
# Generate one Q/A pair per sampled article combination and collect the results.
qa_records = []
for combination in random_combinations:
    prompt = generate_prompt_for_gpt(combination)
    raw_reply = generate_qa_pair(prompt)
    try:
        # literal_eval only accepts Python literals, so text returned by the
        # model can never execute code (unlike eval).
        qa_pair = ast.literal_eval(raw_reply)
    except (ValueError, TypeError, SyntaxError):
        print(f'Skipping reply that is not a valid Python literal: {raw_reply!r}')
        continue
    if not isinstance(qa_pair, (tuple, list)) or len(qa_pair) != 2:
        print(f'Skipping reply that is not a (question, answer) pair: {raw_reply!r}')
        continue

    question, answer = qa_pair
    source_articles = [article_id for article_id, _ in combination]
    print(f'Question: {question}\n\n')
    print(f'Answer: {answer}\n\n')
    print(f'References: {source_articles}')
    qa_records.append({
        'question': question,
        'answer': answer,
        'references': source_articles,
    })

# Build the table once from the collected records; re-running this cell
# replaces qa_df instead of appending duplicate rows.
qa_df = pd.DataFrame(qa_records, columns=['question', 'answer', 'references'])

In [None]:
# Persist the generated Q/A dataset as CSV; index=False leaves the row index out of the file.
qa_df.to_csv('/content/drive/MyDrive/qaDataset.csv', index = False)

In [None]:
# Preview the first generated answer; .iloc[0] selects by position rather than by index label.
qa_df['answer'].iloc[0]

"AMD's growth expectations for its AI products are considered to be 'meaningfully better' according to analysts, despite a projected revenue shortfall for the current quarter. It is believed that the success of these AI processors is more significant than the performance of AMD's embedded business which focuses on industrial, automotive and networking sectors. Furthermore, AMD is investing heavily in AI to compete with its rival Nvidia. However, while sales of AMD's AI-powered GPUs are expected to reach $2 billion next year, this is considered small compared to Nvidia. Nvidia's market cap exceeds $1 trillion and its stocks have almost tripled year-to-date, making it the S&P 500's best-performing stock this year."