In [1]:
import json
import pandas as pd
import os
from datetime import datetime
from pathlib import Path

# Set paths for processed data
processed_data_path = "/root/nfs/AJ FinRag/Company Processed Data/all_companies_processed.json"
company_data_dir = "company_data"
llm_data_dir = "llm_data"

def load_processed_data(file_path):
    """Load processed data from JSON file and convert timestamps"""
    print(f"Loading processed data from: {file_path}")
    data = []

    with open(file_path, 'r') as f:
        for line in f:
            record = json.loads(line)
            # Convert Unix timestamp (milliseconds) to datetime
            record['date'] = pd.to_datetime(record['date'], unit='ms')
            data.append(record)

    df = pd.DataFrame(data)

    # Sort by ticker and date to ensure proper order
    df = df.sort_values(['ticker', 'date']).reset_index(drop=True)

    print(f"Loaded {len(df)} records for {df['ticker'].nunique()} companies")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")

    return df

def label_movement(df):
    """Label stock movements as rise, fall, or freeze based on returns"""
    df = df.copy()

    # Calculate daily returns if not present or use existing Returns column
    if 'Returns' in df.columns and df['Returns'].notna().any():
        df['return'] = df['Returns'] * 100  # Convert to percentage
    else:
        df['return'] = df['adj_close'].pct_change() * 100

    def classify(r):
        if pd.isna(r):  # Handle NaN values
            return 'freeze'
        if r > 0.55:
            return 'rise'
        elif r < -0.5:
            return 'fall'
        return 'freeze'

    df['movement'] = df['return'].apply(classify)
    return df

def generate_queries(df, stock_name, start_date, end_date):
    """Generate query sequences for the LLM"""
    df = df.copy()

    # Filter by stock and date range
    df = df[(df['ticker'] == stock_name) &
            (df['date'] >= start_date) &
            (df['date'] <= end_date)].reset_index(drop=True)

    if len(df) <= 5:
        print(f"Warning: {stock_name} has insufficient data ({len(df)} records)")
        return []

    queries = []
    for i in range(10, len(df)):
        query_date = df.loc[i, "date"]
        prev_window = df.loc[i-10:i-1]

        queries.append({
            "data_index": i,
            "query_stock": stock_name,
            "query_date": query_date.strftime('%Y-%m-%d'),
            "recent_date_list": [d.strftime('%Y-%m-%d') for d in prev_window["date"]],
            "adjusted_close_list": [round(float(v), 4) if pd.notna(v) else 0.0 for v in prev_window["adj_close"]],
            "movement": df.loc[i, "movement"]
        })

    return queries

def generate_candidates(df, stock_name, indicators):
    """Generate candidate sequences for the LLM"""
    # Filter by stock
    df = df[df['ticker'] == stock_name].copy().reset_index(drop=True)

    if len(df) <= 6:  # Need at least 6 records for candidates
        print(f"Warning: {stock_name} has insufficient data for candidates ({len(df)} records)")
        return []

    candidates = []

    for i in range(10, len(df) - 1):
        candidate_date = df.loc[i, "date"]
        movement = df.loc[i + 1, "movement"]
        recent_dates = df.loc[i-10:i-1, "date"].tolist()

        for ind in indicators:
            if ind in df.columns and pd.notna(df.loc[i-5:i-1, ind]).any():  # Check if indicator exists and has data
                values = df.loc[i-10:i-1, ind].tolist()
                # Handle NaN values in the list
                clean_values = [round(float(v), 4) if pd.notna(v) else 0.0 for v in values]

                candidates.append({
                    "data_index": len(candidates),
                    "candidate_stock": stock_name,
                    "candidate_date": candidate_date.strftime('%Y-%m-%d'),
                    "candidate_movement": movement,
                    "recent_date_list": [d.strftime('%Y-%m-%d') for d in recent_dates],
                    f"{ind}_list": clean_values,
                    "indicator_name": ind
                })

    return candidates

def save_jsonl(path, data):
    """Save data as JSON lines format"""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        for item in data:
            json.dump(item, f)
            f.write("\n")

def generate_individual_company_files(df, start_date, end_date, output_dir="company_data"):
    """Generate separate query and candidate files for each company"""
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Add movement labels to the entire dataset
    print("Adding movement labels...")
    df_with_movement = df.groupby('ticker').apply(label_movement).reset_index(drop=True)

    # Available indicators from your data structure
    indicators = [
        "adj_close", "close", "high", "low", "open", "volume",
        "MACD_Histogram", "VWAP", "alpha_smr", "alpha_mom", "Returns"
    ]

    # Filter indicators that actually exist in the dataframe and have non-null values
    available_indicators = []
    for ind in indicators:
        if ind in df_with_movement.columns:
            non_null_count = df_with_movement[ind].notna().sum()
            if non_null_count > 0:
                available_indicators.append(ind)
                print(f"  {ind}: {non_null_count} non-null values")

    print(f"Available indicators: {available_indicators}")

    tickers = df_with_movement['ticker'].unique()
    print(f"Processing {len(tickers)} companies individually...")

    company_stats = []

    for ticker in tickers:
        print(f"Processing {ticker}...")

        # Generate queries for this ticker
        ticker_queries = generate_queries(df_with_movement, ticker, start_date, end_date)

        # Generate candidates for this ticker
        ticker_candidates = generate_candidates(df_with_movement, ticker, available_indicators)

        if len(ticker_queries) == 0 or len(ticker_candidates) == 0:
            print(f"  Warning: {ticker} has no queries or candidates, skipping...")
            continue

        # Save individual company files
        query_file = os.path.join(output_dir, f"{ticker}_train_qlist.json")
        candidate_file = os.path.join(output_dir, f"{ticker}_train_clist.json")

        save_jsonl(query_file, ticker_queries)
        save_jsonl(candidate_file, ticker_candidates)

        company_stats.append({
            'ticker': ticker,
            'queries': len(ticker_queries),
            'candidates': len(ticker_candidates)
        })

        print(f"  {ticker}: {len(ticker_queries)} queries, {len(ticker_candidates)} candidates")

    # Save summary statistics
    if company_stats:
        stats_df = pd.DataFrame(company_stats)
        stats_df.to_csv(os.path.join(output_dir, "company_data_summary.csv"), index=False)

    print(f"\n✅ Individual company files saved to: {output_dir}")
    print(f"Summary statistics saved to: {os.path.join(output_dir, 'company_data_summary.csv')}")

    return company_stats

def generate_combined_company_data(df, start_date, end_date, output_dir="llm_data"):
    """Generate combined query and candidate data for all companies"""
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Add movement labels to the entire dataset
    print("Adding movement labels...")
    df_with_movement = df.groupby('ticker').apply(label_movement).reset_index(drop=True)

    # Available indicators from your data structure
    indicators = [
        "adj_close", "close", "high", "low", "open", "volume",
        "MACD_Histogram", "VWAP", "alpha_smr", "alpha_mom", "Returns"
    ]

    # Filter indicators that actually exist in the dataframe and have non-null values
    available_indicators = []
    for ind in indicators:
        if ind in df_with_movement.columns:
            non_null_count = df_with_movement[ind].notna().sum()
            if non_null_count > 0:
                available_indicators.append(ind)

    print(f"Available indicators: {available_indicators}")

    all_queries = []
    all_candidates = []

    tickers = df_with_movement['ticker'].unique()
    print(f"Processing {len(tickers)} companies for combined dataset...")

    for ticker in tickers:
        print(f"Processing {ticker}...")

        # Generate queries for this ticker
        ticker_queries = generate_queries(df_with_movement, ticker, start_date, end_date)
        all_queries.extend(ticker_queries)

        # Generate candidates for this ticker
        ticker_candidates = generate_candidates(df_with_movement, ticker, available_indicators)
        all_candidates.extend(ticker_candidates)

        print(f"  {ticker}: {len(ticker_queries)} queries, {len(ticker_candidates)} candidates")

    print(f"\nTotal: {len(all_queries)} queries, {len(all_candidates)} candidates")

    # Save combined data
    queries_path = os.path.join(output_dir, "all_companies_train_qlist.json")
    candidates_path = os.path.join(output_dir, "all_companies_train_clist.json")

    save_jsonl(queries_path, all_queries)
    save_jsonl(candidates_path, all_candidates)

    print(f"✅ Combined data saved:")
    print(f"  Queries: {queries_path}")
    print(f"  Candidates: {candidates_path}")

    return all_queries, all_candidates

# Main execution
if __name__ == "__main__":
    print("=== Generating LLM Training Data for All Companies ===")

    # Load processed data (timestamps will be converted automatically)
    processed_df = load_processed_data(processed_data_path)

    # Check your data
    print(f"\nDataset shape: {processed_df.shape}")
    print(f"Date range: {processed_df['date'].min()} to {processed_df['date'].max()}")
    print(f"Companies: {processed_df['ticker'].nunique()}")
    print(f"Sample companies: {sorted(processed_df['ticker'].unique())[:10]}")  # Show first 10

    # Show column info
    print(f"\nAvailable columns:")
    for col in processed_df.columns:
        non_null_count = processed_df[col].notna().sum()
        print(f"  {col}: {non_null_count}/{len(processed_df)} non-null values")

    # Generate individual company files
    print("\n=== Generating Individual Company Files ===")
    company_stats = generate_individual_company_files(
        processed_df,
        pd.to_datetime("2022-01-03"),
        pd.to_datetime("2024-12-30"),
        company_data_dir
    )

    # Generate combined dataset
    print("\n=== Generating Combined Dataset ===")
    all_queries, all_candidates = generate_combined_company_data(
        processed_df,
        pd.to_datetime("2022-01-03"),
        pd.to_datetime("2024-12-30"),
        llm_data_dir
    )

    # Display summary statistics
    if company_stats:
        print("\n=== Summary Statistics ===")
        stats_df = pd.DataFrame(company_stats)
        print(f"Total companies processed: {len(stats_df)}")
        print(f"Total queries across all companies: {stats_df['queries'].sum()}")
        print(f"Total candidates across all companies: {stats_df['candidates'].sum()}")
        print(f"Average queries per company: {stats_df['queries'].mean():.1f}")
        print(f"Average candidates per company: {stats_df['candidates'].mean():.1f}")

        # Show top 10 companies by data volume
        print("\nTop 10 companies by query count:")
        top_companies = stats_df.nlargest(10, 'queries')
        for _, row in top_companies.iterrows():
            print(f"  {row['ticker']}: {row['queries']} queries, {row['candidates']} candidates")

    # Show sample data
    if all_queries:
        print("\nSample query:")
        print(json.dumps(all_queries[0], indent=2))

    if all_candidates:
        print("\nSample candidate:")
        print(json.dumps(all_candidates[0], indent=2))

    print("\n✅ All data preparation complete!")
    print("Files generated:")
    print(f"  - Individual company files in '{company_data_dir}/' folder")
    print(f"  - Combined dataset in '{llm_data_dir}/' folder")
    if company_stats:
        print(f"  - Summary statistics in '{company_data_dir}/company_data_summary.csv'")

=== Generating LLM Training Data for All Companies ===
Loading processed data from: /root/nfs/AJ FinRag/Company Processed Data/all_companies_processed.json
Loaded 18800 records for 25 companies
Date range: 2022-01-03 00:00:00 to 2024-12-30 00:00:00

Dataset shape: (18800, 19)
Date range: 2022-01-03 00:00:00 to 2024-12-30 00:00:00
Companies: 25
Sample companies: ['AAPL', 'ADBE', 'AMD', 'AMZN', 'BAC', 'CRM', 'CVX', 'GOOGL', 'HD', 'INTC']

Available columns:
  date: 18800/18800 non-null values
  ticker: 18800/18800 non-null values
  open: 18800/18800 non-null values
  high: 18800/18800 non-null values
  low: 18800/18800 non-null values
  close: 18800/18800 non-null values
  volume: 18800/18800 non-null values
  adj_close: 18800/18800 non-null values
  MACD_Histogram: 18800/18800 non-null values
  macd_crossover: 18800/18800 non-null values
  bollinger_bands: 1938/18800 non-null values
  exceeding_upper: 1036/18800 non-null values
  exceeding_lower: 902/18800 non-null values
  overbought_a

  df_with_movement = df.groupby('ticker').apply(label_movement).reset_index(drop=True)


  AAPL: 742 queries, 8131 candidates
Processing ADBE...
  ADBE: 742 queries, 8131 candidates
Processing AMD...
  AMD: 742 queries, 8131 candidates
Processing AMZN...
  AMZN: 742 queries, 8131 candidates
Processing BAC...
  BAC: 742 queries, 8131 candidates
Processing CRM...
  CRM: 742 queries, 8131 candidates
Processing CVX...
  CVX: 742 queries, 8131 candidates
Processing GOOGL...
  GOOGL: 742 queries, 8131 candidates
Processing HD...
  HD: 742 queries, 8131 candidates
Processing INTC...
  INTC: 742 queries, 8131 candidates
Processing JNJ...
  JNJ: 742 queries, 8131 candidates
Processing JPM...
  JPM: 742 queries, 8131 candidates
Processing MA...
  MA: 742 queries, 8131 candidates
Processing MCD...
  MCD: 742 queries, 8131 candidates
Processing META...
  META: 742 queries, 8131 candidates
Processing MSFT...
  MSFT: 742 queries, 8131 candidates
Processing NFLX...
  NFLX: 742 queries, 8131 candidates
Processing NKE...
  NKE: 742 queries, 8131 candidates
Processing NVDA...
  NVDA: 742 qu

  df_with_movement = df.groupby('ticker').apply(label_movement).reset_index(drop=True)


  AAPL: 742 queries, 8131 candidates
Processing ADBE...
  ADBE: 742 queries, 8131 candidates
Processing AMD...
  AMD: 742 queries, 8131 candidates
Processing AMZN...
  AMZN: 742 queries, 8131 candidates
Processing BAC...
  BAC: 742 queries, 8131 candidates
Processing CRM...
  CRM: 742 queries, 8131 candidates
Processing CVX...
  CVX: 742 queries, 8131 candidates
Processing GOOGL...
  GOOGL: 742 queries, 8131 candidates
Processing HD...
  HD: 742 queries, 8131 candidates
Processing INTC...
  INTC: 742 queries, 8131 candidates
Processing JNJ...
  JNJ: 742 queries, 8131 candidates
Processing JPM...
  JPM: 742 queries, 8131 candidates
Processing MA...
  MA: 742 queries, 8131 candidates
Processing MCD...
  MCD: 742 queries, 8131 candidates
Processing META...
  META: 742 queries, 8131 candidates
Processing MSFT...
  MSFT: 742 queries, 8131 candidates
Processing NFLX...
  NFLX: 742 queries, 8131 candidates
Processing NKE...
  NKE: 742 queries, 8131 candidates
Processing NVDA...
  NVDA: 742 qu