In [1]:
import pandas as pd
import os
from pathlib import Path
import numpy as np

def get_proportional_samples(data_path='./Data/', total_samples=200):
    # Get all CSV files in the data directory
    csv_files = list(Path(data_path).glob('*.csv'))
    
    # Dictionary to store DataFrames
    dfs = {}
    total_records = 0
    
    # Read all CSV files and count total records
    for file in csv_files:
        df = pd.read_csv(file)
        dfs[file.name] = df
        total_records += len(df)
    
    # Calculate proportional samples for each file
    final_samples = pd.DataFrame()
    
    for file_name, df in dfs.items():
        # Calculate proportion of samples for this file
        file_proportion = len(df) / total_records
        file_samples = int(total_samples * file_proportion)
        
        # Group by date and get proportional samples
        if 'Date' in df.columns:
            date_groups = df.groupby('Date')
            date_proportions = date_groups.size() / len(df)
            
            # Calculate samples per date
            date_samples = (date_proportions * file_samples).round().astype(int)
            
            # Adjust for rounding errors
            while date_samples.sum() > file_samples:
                date_samples[date_samples.argmax()] -= 1
            while date_samples.sum() < file_samples:
                date_samples[date_samples.argmin()] += 1
            
            # Sample from each date group
            temp_samples = pd.DataFrame()
            for date, n_samples in date_samples.items():
                if n_samples > 0:
                    group_sample = date_groups.get_group(date).sample(
                        n=min(n_samples, len(date_groups.get_group(date))),
                        random_state=42
                    )
                    temp_samples = pd.concat([temp_samples, group_sample])
        else:
            # If no Date column, sample proportionally from the whole file
            temp_samples = df.sample(n=file_samples, random_state=42)
        
        final_samples = pd.concat([final_samples, temp_samples])
    
    return final_samples.reset_index(drop=True)

# Usage example
sampled_data = get_proportional_samples(data_path='./Data/', total_samples=200)
print(f"Total samples: {len(sampled_data)}")
print("\nSamples per file:")
print(sampled_data.groupby('source_file').size() if 'source_file' in sampled_data.columns else "No source file column")


Total samples: 196

Samples per file:
No source file column


  date_samples[date_samples.argmin()] += 1
  date_samples[date_samples.argmin()] += 1
  date_samples[date_samples.argmin()] += 1
  date_samples[date_samples.argmin()] += 1
  date_samples[date_samples.argmin()] += 1
  date_samples[date_samples.argmin()] += 1
  date_samples[date_samples.argmin()] += 1
  date_samples[date_samples.argmin()] += 1
  date_samples[date_samples.argmin()] += 1
  date_samples[date_samples.argmin()] += 1


In [None]:
def write_samples_to_file(df, filename='200_full_text_samples.txt'):
    with open(filename, 'w', encoding='utf-8') as f:
        for idx, text in enumerate(df['Full_text'], 1):
            f.write(f"Sample {idx}:\n")
            f.write("="*50 + "\n\n")
            f.write(str(text))
            f.write("\n\n")
            f.write("="*50 + "\n\n\n")

# Write the samples
write_samples_to_file(sampled_data)

: 

In [2]:
sampled_data.to_csv('200_full_text_samples.csv', index=False)
