# Dataset Analysis: Finance-Instruct-500k

This notebook loads and explores the [Josephgflowers/Finance-Instruct-500k](https://huggingface.co/datasets/Josephgflowers/Finance-Instruct-500k) dataset to prepare training/validation subsets for fine-tuning.


In [None]:
import re

import pandas as pd
from datasets import load_dataset


## Load Dataset


In [None]:
# Load the dataset by its Hugging Face Hub identifier; the result is indexed
# by split name further down (dataset["train"]).
dataset_id = "Josephgflowers/Finance-Instruct-500k"
dataset = load_dataset(dataset_id)

# Bare expression: display the loaded object as the cell output.
dataset


## Basic Exploration


In [None]:
# Materialise the "train" split as a pandas DataFrame for easier exploration
train_split = dataset["train"]
df = train_split.to_pandas()

# Report (rows, columns), then show the first rows as the cell output
print(f"Shape: {df.shape}")
df.head()


In [None]:
# Column info: print pandas' per-column summary of `df` (DataFrame.info())
df.info()


In [None]:
# Print a few random rows in full, one field per paragraph, to get a feel for the data.
# The sample is seeded so a fresh "Restart & Run All" shows the same rows.
preview_chars = 500  # long text fields are cut to this many characters for display
n_preview = min(3, len(df))  # never ask for more rows than the frame holds
for _, row in df.sample(n_preview, random_state=42).iterrows():
    print("=" * 80)
    for col in df.columns:
        val = row[col]
        # Only strings are truncated; other values are printed as-is.
        if isinstance(val, str) and len(val) > preview_chars:
            val = val[:preview_chars] + "..."
        print(f"\n{col.upper()}:\n{val}")


## Identify Use Cases

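The dataset supports 8 use cases (QA, Reasoning, Conversational AI, NER, Sentiment, Topic Classification, LLM Training, RAG). We'll use keyword heuristics on the system and user prompts to assign each entry a task type. The heuristics cover six of these (topic classification, sentiment analysis, NER, QA, reasoning, RAG); entries that match none of them are labelled `other`.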


In [None]:
# Short NER keywords are matched as whole words: as plain substrings, 'ner' also
# hits "partner"/"owner" and 'tag' also hits "percentage"/"advantage".
_NER_WORD_RE = re.compile(r'\b(?:ner|tag(?:s|ged|ging)?)\b')


def _prompt_text(value):
    """Return ``value`` lower-cased, treating missing values (None/NaN) as empty text."""
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value).lower()


def classify_task(row):
    """Heuristically classify one dataset entry by keywords in its prompts.

    Only the 'system' and 'user' fields are inspected. Rules are checked in a
    fixed priority order and the first match wins.

    Args:
        row: Mapping-like object with a ``.get`` method (a dict or a pandas
            Series) that may hold 'system' and 'user' entries.

    Returns:
        One of 'topic_classification', 'sentiment_analysis', 'ner', 'qa',
        'reasoning', 'rag' or 'other'.
    """
    system = _prompt_text(row.get('system', ''))
    user = _prompt_text(row.get('user', ''))
    combined = system + ' ' + user

    def mentions(*keywords):
        # Plain substring test against the combined, lower-cased prompt text.
        return any(keyword in combined for keyword in keywords)

    # Topic Classification patterns: a topic/category mention on its own, or a
    # classify/categorize instruction together with "categories".
    if mentions('topic', 'category') or (
        mentions('classify', 'categorize') and 'categories' in combined
    ):
        return 'topic_classification'

    # Sentiment Analysis patterns
    if mentions('sentiment', 'bullish', 'bearish', 'positive', 'negative'):
        return 'sentiment_analysis'

    # NER patterns: long keywords as substrings, short ones as whole words
    if mentions('entity', 'entities', 'extract', 'xbrl') or _NER_WORD_RE.search(combined):
        return 'ner'

    # QA patterns
    if mentions('question', 'answer', 'what is', 'explain', 'define'):
        return 'qa'

    # Reasoning patterns
    if mentions('calculate', 'compute', 'analyze', 'reasoning', 'portfolio'):
        return 'reasoning'

    # RAG patterns (external context prepended). Checked last, so a long
    # context-bearing prompt that also matches an earlier rule never gets here.
    if len(user) > 1000 and 'context' in combined:
        return 'rag'

    return 'other'

# Label every row with the heuristic classifier (row-wise apply) and store the
# result as a new column on `df`.
task_labels = df.apply(classify_task, axis="columns")
df["task_type"] = task_labels

# Cell output: number of rows per assigned task type
df["task_type"].value_counts()


## Sentiment Analysis Subset


In [None]:
# Keep only the rows the heuristic labelled as sentiment analysis
is_sentiment = df["task_type"].eq("sentiment_analysis")
sentiment_df = df[is_sentiment]

print(f"Sentiment analysis samples: {len(sentiment_df)}")
sentiment_df.head(10)


In [None]:
# Examine sentiment analysis examples. The sample is seeded so re-runs show the
# same rows, and capped at the subset size so a small subset does not raise.
n_examples = min(5, len(sentiment_df))
for _, row in sentiment_df.sample(n_examples, random_state=42).iterrows():
    print("=" * 80)
    # Every field gets the same empty/None guard before slicing.
    print(f"SYSTEM:\n{row['system'][:300] if row['system'] else 'N/A'}")
    print(f"\nUSER:\n{row['user'][:300] if row['user'] else 'N/A'}")
    print(f"\nASSISTANT:\n{row['assistant'][:200] if row['assistant'] else 'N/A'}")


In [None]:
# Group by distinct assistant answers: count how often each exact answer string
# occurs in the sentiment subset.
sentiment_df['assistant'].value_counts()


In [None]:
# Keep only rows whose assistant answer is exactly one of the accepted label
# strings (exact match via isin), working on a copy so columns can be added later.
valid_labels = ['neutral', 'positive', 'negative', 'bullish']
has_valid_label = sentiment_df['assistant'].isin(valid_labels)
sentiment_clean_df = sentiment_df.loc[has_valid_label].copy()

# Before/after row counts, then the per-label counts as the cell output
print(f"Original: {len(sentiment_df)} rows")
print(f"Cleansed: {len(sentiment_clean_df)} rows")
print(f"\nLabel distribution:")
sentiment_clean_df['assistant'].value_counts()


## Create Balanced Training Dataset (6000 samples)


In [None]:
# Word counts (whitespace-separated tokens) for the user text, the assistant
# answer, and their sum; the system column is not counted.
user_word_counts = sentiment_clean_df['user'].str.split().str.len()
assistant_word_counts = sentiment_clean_df['assistant'].str.split().str.len()
sentiment_clean_df['user_words'] = user_word_counts
sentiment_clean_df['assistant_words'] = assistant_word_counts
sentiment_clean_df['total_words'] = user_word_counts + assistant_word_counts

# Keep rows within the length budget (~400 words ≈ 512 tokens)
max_words = 400
within_budget = sentiment_clean_df['total_words'] <= max_words
filtered_df = sentiment_clean_df[within_budget].copy()

print(f"After length filter: {len(filtered_df)} rows")
print(f"Per label:\n{filtered_df['assistant'].value_counts()}")


In [None]:
# Collapse rows that share both the first 100 characters of the user text and
# the label. Rows with the same prefix but a different label are kept.
prefix_length = 100
filtered_df['user_key'] = filtered_df['user'].str.slice(0, prefix_length)
dedupe_columns = ['user_key', 'assistant']
deduped_df = filtered_df.drop_duplicates(subset=dedupe_columns)

print(f"After deduplication: {len(deduped_df)} rows")
print(f"Per label:\n{deduped_df['assistant'].value_counts()}")


In [None]:
# Balanced sample: up to 1500 rows per label (6000 total when every label has
# enough rows). Iterating over `valid_labels` keeps the label set defined in one
# place instead of repeating the literal list here.
samples_per_label = 1500
final_samples = []
for label in valid_labels:
    label_df = deduped_df[deduped_df['assistant'] == label]
    n_sample = min(samples_per_label, len(label_df))
    if n_sample < samples_per_label:
        # Best-effort: take what exists, but make the resulting imbalance visible.
        print(f"WARNING: only {len(label_df)} rows available for '{label}' "
              f"(target {samples_per_label}); the dataset will be imbalanced")
    final_samples.append(label_df.sample(n=n_sample, random_state=42))
    print(f"{label}: sampled {n_sample}")

# NOTE(review): after concat the rows stay grouped by label in `valid_labels`
# order; shuffle downstream if the consumer does not shuffle on its own.
final_df = pd.concat(final_samples, ignore_index=True)

# Drop temp columns
final_df = final_df.drop(columns=['user_key', 'task_type', 'user_words', 'assistant_words', 'total_words'])
print(f"\nFinal dataset: {len(final_df)} rows")


In [None]:
# Recompute word counts on the final sample and attach them to `final_df`
user_word_counts = final_df['user'].str.split().str.len()
assistant_word_counts = final_df['assistant'].str.split().str.len()
final_df['user_words'] = user_word_counts
final_df['total_words'] = user_word_counts + assistant_word_counts

# Summary: size, rows per label, and describe() of both word-count columns
label_counts = final_df['assistant'].value_counts()
print("=== Final Dataset Stats ===")
print(f"Total rows: {len(final_df)}")
print(f"\nLabel distribution:\n{label_counts}")
print(f"\nWord count stats:")
print(final_df['total_words'].describe())
print(f"\nUser text word count stats:")
print(final_df['user_words'].describe())


In [None]:
# Preview one example from each label. The pick is seeded so re-runs show the
# same rows, and a label with no rows is reported instead of raising in sample().
for label in valid_labels:
    print(f"\n{'='*60}\n{label.upper()} EXAMPLE:\n{'='*60}")
    label_rows = final_df[final_df['assistant'] == label]
    if label_rows.empty:
        print("(no rows for this label)")
        continue
    row = label_rows.sample(1, random_state=42).iloc[0]
    print(f"USER ({len(row['user'].split())} words):\n{row['user'][:400]}")
    print(f"\nASSISTANT: {row['assistant']}")


## Export Training Dataset


In [None]:
# Write only the three conversation columns to CSV, without the index column
output_path = "sentiment_training_data.csv"
core_columns = ['system', 'user', 'assistant']
export_df = final_df[core_columns].copy()
export_df.to_csv(output_path, index=False)

print(f"Exported {len(export_df)} rows to {output_path}")
