In [14]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Create a financial transactions dataset with additional columns
df = pd.DataFrame({
    'Customer Name': ['John', 'Jane', 'Bob', 'Alice', 'Tom', 'Amy'],
    'Transaction Amount': [2000, 5000, 3000, 4000, 1500, 1000],
    'Loan Amount': [1000, 2000, 1500, 2500, 1000, 500],
    'Loan Return Time': [30, 45, 60, 90, 120, 180],
    'Reason for Delay': [
        'I lost my job and couldn\'t afford to pay back the loan on time.',
        'I had to pay for my child\'s medical expenses, which delayed my loan repayment.',
        'I invested the loan amount in a business that failed, which affected my cash flow.',
        'I was out of the country and couldn\'t make the payment on time.',
        'I had to pay for unexpected car repairs, which affected my finances.',
        'I had a family emergency that required me to use the loan amount for something else.'
    ]
})

# Define a function to generate insights and suggestions using GPT-2
def generate_insights(df):
    # Define the input text for the model
    input_text = f"The financial transactions dataset contains {len(df)} rows. "

    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate text using the model
    output = model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True)

    # Decode the output text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the insights and suggestions from the generated text
    text_sections = generated_text.split('\n\n')
    insights = text_sections[0]
    suggestions = text_sections[1] if len(text_sections) > 1 else ""

    # Return the insights and suggestions
    return insights, suggestions

# Generate insights and suggestions from the dataset
insights, suggestions = generate_insights(df)

# Print the results
print("Insights:")
print(insights)
print("Suggestions:")
print(suggestions)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Insights:
The financial transactions dataset contains 6 rows.  The first row is the amount of money in the account.  The second row is the amount of money in the account.  The third row is the amount of money in the account.  The fourth row is the amount of money in the account.  The fifth row is the amount of money in the account.  The sixth row is the amount of money in the account.  The seventh row is the amount
Suggestions:



In [26]:
import random
import csv
from datetime import datetime, timedelta

# Define the list of possible transaction types
transaction_types = ["Loan", "Credit Card Payment", "Investment", "Withdrawal", "Deposit"]

# Define the list of possible reasons for delay
reasons_for_delay = ["Family emergency", "Vacation", "Health issues", "Overspending", "Job loss", "Education expenses", "Home renovation", "Divorce", "Unexpected car repairs", "Medical bills"]

# Define the start and end dates for the dataset
start_date = datetime(2022, 1, 1)
end_date = datetime(2022, 12, 31)

# Define the number of rows to generate
num_rows = 1000

# Generate the dataset
with open("financial_transactions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "customer_name", "transaction_date", "transaction_type", "transaction_amount", "days_to_return", "reason_for_delay"])
    for i in range(num_rows):
        # Generate random values for each row
        id = i+1
        customer_name = "Customer " + str(random.randint(1, 100))
        transaction_date = start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
        transaction_type = random.choice(transaction_types)
        transaction_amount = random.randint(100, 100000)
        days_to_return = random.randint(0, 90)
        reason_for_delay = random.choice(reasons_for_delay)
        # Write the row to the file
        writer.writerow([id, customer_name, transaction_date.strftime("%Y-%m-%d"), transaction_type, transaction_amount, days_to_return, reason_for_delay])


In [27]:


import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer

# Load the dataset into a Pandas dataframe
df = pd.read_csv("financial_transactions.csv")

# Define the GPT-2 model and tokenizer
model_name = "gpt2-medium"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Define the maximum length of the generated text
max_length = 512

# Define the prompt for the GPT-2 model
prompt = f"The financial transactions dataset contains {len(df)} records. The total amount of money involved in the transactions is {df['transaction_amount'].sum():,.2f} dollars. The most common reason for delayed payments is {df['reason_for_delay'].value_counts().index[0]}. The average number of days to return a payment is {df['days_to_return'].mean():.2f} days. What other insights can we gather from this dataset?"

# Encode the prompt and generate the output
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids=input_ids, max_length=max_length, do_sample=True, top_k=50, top_p=0.95)

# Decode the output and print it
output_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The financial transactions dataset contains 1000 records. The total amount of money involved in the transactions is 50,137,920.00 dollars. The most common reason for delayed payments is Medical bills. The average number of days to return a payment is 46.20 days. What other insights can we gather from this dataset?

What is a Medical bill?

According to the FDA, an average bill can cost between $50-50,000. According to the FDA, in an average year, between $100,000 and $250,000 a year is spent on medical bills. An average person has two bills. One is for surgery and the other is for hospitalization.

An average patient has several medical bills. How many people are doing this?

Here is a list of each patient's average medical bill from different years and regions.

The total amount spent in the year 2012 on medical bills in US is $4,977,300.

Total amount spent in US was $4,977,300.

Median number of days to return a payment for medical bills is 11.

Median number of days to return a pay