In [1]:
# Import packages
import pandas as pd
import numpy as np
import os
import glob
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Base directory where all data is stored (a relative path, so it is resolved
# against the kernel's working directory). The loaders below glob beneath it:
#   <base_dir>/<date>/player_results.csv
#   <base_dir>/<date>/<contest>/contest-standings.csv
#   <base_dir>/<date>/<contest>/players.csv
# where <date> is a folder named YYYY-MM-DD and <contest> is the contest name.
base_dir = 'data/data_warehouse'

# Data Cleaning and Processing

In [3]:
# Function to parse dates from folder names
def parse_date(folder_name):
    """Parse a ``YYYY-MM-DD`` folder name into a ``datetime``.

    Args:
        folder_name: Folder name expected to look like ``2021-01-28``.

    Returns:
        The parsed ``datetime``, or ``None`` when ``folder_name`` is not a
        string in ``%Y-%m-%d`` format.
    """
    try:
        return datetime.strptime(folder_name, '%Y-%m-%d')
    except (ValueError, TypeError):
        # ValueError: text that is not a valid %Y-%m-%d date.
        # TypeError: non-string input such as None.
        # Everything else (KeyboardInterrupt, SystemExit, genuine bugs) is
        # deliberately allowed to propagate instead of being swallowed.
        return None

##### Process player results (daily stats)

In [4]:
def load_player_results(data_dir=None):
    """Load every daily ``player_results.csv`` into one DataFrame.

    Files are discovered at ``<data_dir>/<folder>/player_results.csv``. The
    folder name is parsed with ``parse_date``; files whose folder is not a
    ``YYYY-MM-DD`` date are reported and skipped. Files that fail to load are
    also reported and skipped, so one bad file does not abort the whole run.

    Args:
        data_dir: Root directory to search. Defaults to the notebook-level
            ``base_dir`` when omitted.

    Returns:
        A DataFrame holding the rows of every loaded file plus a ``Date``
        column taken from the folder name. The known stat columns are coerced
        to numeric (unparseable values become NaN). An empty DataFrame is
        returned when no file could be loaded.
    """
    if data_dir is None:
        data_dir = base_dir

    # glob returns matches in arbitrary order; sort so the combined frame
    # (and the CSV written from it) is reproducible between runs.
    results_files = sorted(glob.glob(os.path.join(data_dir, '*', 'player_results.csv')))

    print(f"Found {len(results_files)} player_results.csv files")

    # Columns to coerce to numeric when present in a file.
    numeric_cols = ['Salary', 'Floor', 'Ceiling', 'FPG', 'FPPM', 'USG', 'FGA', 'MPG',
                    'Proj Mins', 'FC Proj', 'My Proj', 'Mins', 'Score', 'Val']

    all_results = []

    for file in results_files:
        # The date is the name of the folder that directly contains the file.
        date_str = os.path.basename(os.path.dirname(file))
        date = parse_date(date_str)

        if date is None:
            print(f"Skipping {file}: folder name {date_str!r} is not a YYYY-MM-DD date")
            continue

        try:
            df = pd.read_csv(file)
            df['Date'] = date

            for col in numeric_cols:
                if col in df.columns:
                    df[col] = pd.to_numeric(df[col], errors='coerce')

            all_results.append(df)
        except Exception as e:
            # Best effort: report the bad file and keep going.
            print(f"Error processing {file}: {e}")

    # pd.concat raises ValueError on an empty list, so handle "nothing
    # loaded" explicitly.
    if not all_results:
        return pd.DataFrame()

    return pd.concat(all_results, ignore_index=True)

##### Process contest standings and player selection stats

In [5]:
def load_contest_data(data_dir=None):
    """Load every ``contest-standings.csv`` and split it into its two tables.

    Files are discovered at
    ``<data_dir>/<YYYY-MM-DD>/<contest name>/contest-standings.csv``. Each
    file holds two side-by-side tables separated by a single column: contest
    entries on the left and player records on the right. Files that cannot be
    processed (bad date folder, missing ``EntryId``/``Player`` column,
    unreadable CSV, ...) are reported and skipped.

    Args:
        data_dir: Root directory to search. Defaults to the notebook-level
            ``base_dir`` when omitted.

    Returns:
        A tuple ``(combined_entries, combined_player_records)``:

        * ``combined_entries`` - rows with a non-null ``EntryId`` plus
          ``Date``, ``Contest``, ``ContestType`` and ``EntryFee`` columns.
          ``EntryFee`` is a float, NaN when it cannot be read from the
          contest name.
        * ``combined_player_records`` - rows with a non-null ``Player`` plus
          ``Date``, ``Contest`` and ``ContestType`` columns.

        Both are empty DataFrames when no file could be loaded.
    """
    if data_dir is None:
        data_dir = base_dir

    # glob returns matches in arbitrary order; sort for reproducible output.
    standings_files = sorted(
        glob.glob(os.path.join(data_dir, '*', '*', 'contest-standings.csv'))
    )

    print(f"Found {len(standings_files)} contest-standings.csv files")

    # Position assumed for the separator column when no column is named 'X'.
    # NOTE(review): 6 presumably matches the export's column layout - confirm.
    fallback_separator_idx = 6

    all_entries = []
    all_player_records = []

    for file in standings_files:
        try:
            # Path layout: <data_dir>/<date>/<contest name>/contest-standings.csv
            contest_dir = os.path.dirname(file)
            contest_name = os.path.basename(contest_dir)
            date_str = os.path.basename(os.path.dirname(contest_dir))
            date = datetime.strptime(date_str, '%Y-%m-%d')

            df = pd.read_csv(file, low_memory=False)

            # The 'X' column separates the two tables; fall back to a fixed
            # position when the file has no column with that name.
            if 'X' in df.columns:
                x_idx = df.columns.get_loc('X')
            else:
                x_idx = fallback_separator_idx

            # Left table: entries. Keep only rows that really are entries.
            entries_df = df.iloc[:, :x_idx].dropna(subset=['EntryId']).copy()
            # Right table: player records. Keep only rows that name a player.
            players_df = df.iloc[:, x_idx + 1:].dropna(subset=['Player']).copy()

            contest_type = _contest_type_from_name(contest_name)

            entries_df['Date'] = date
            entries_df['Contest'] = contest_name
            entries_df['ContestType'] = contest_type
            entries_df['EntryFee'] = _entry_fee_from_name(contest_name)

            players_df['Date'] = date
            players_df['Contest'] = contest_name
            players_df['ContestType'] = contest_type

            # Append both tables together so they never get out of step.
            all_entries.append(entries_df)
            all_player_records.append(players_df)

        except Exception as e:
            # Best effort: report the bad file and keep going.
            print(f"Error processing {file}: {e}")

    # pd.concat raises ValueError on an empty list, so handle "nothing
    # loaded" explicitly.
    if not all_entries:
        return pd.DataFrame(), pd.DataFrame()

    combined_entries = pd.concat(all_entries, ignore_index=True)
    combined_player_records = pd.concat(all_player_records, ignore_index=True)

    return combined_entries, combined_player_records


def _contest_type_from_name(contest_name):
    """Classify a contest by the first keyword group found in its name."""
    keyword_groups = (
        ('Head-to-Head', ('H2H', 'Head-to-Head')),
        ('Double Up', ('Double Up', '50/50')),
        ('Tournament', ('GPP', 'Tournament')),
    )
    for label, keywords in keyword_groups:
        if any(keyword in contest_name for keyword in keywords):
            return label
    return 'Other'


def _entry_fee_from_name(contest_name):
    """Return the number between the first '$' and the next 'entry', else NaN."""
    try:
        fee_text = contest_name.split('$')[1].split('entry')[0]
        # Drop thousands separators so "$1,000 entry" parses as 1000.0.
        return float(fee_text.replace(',', ''))
    except (IndexError, ValueError):
        # IndexError: no '$' in the name; ValueError: text is not a number.
        # NaN (rather than None) keeps the combined column numeric.
        return float('nan')

##### Process player projections

In [6]:
def load_player_projections(data_dir=None):
    """Load every per-contest ``players.csv`` into one DataFrame.

    Files are discovered at
    ``<data_dir>/<YYYY-MM-DD>/<contest name>/players.csv``. Files that cannot
    be processed (bad date folder, unreadable CSV, ...) are reported and
    skipped.

    Args:
        data_dir: Root directory to search. Defaults to the notebook-level
            ``base_dir`` when omitted.

    Returns:
        A DataFrame holding the rows of every loaded file plus ``Date`` and
        ``Contest`` columns. When a file has a ``GameInfo`` column,
        ``HomeTeam`` and ``AwayTeam`` are derived from it, and ``IsHome`` is
        added when a ``Team`` column is also present. Known numeric columns
        are coerced to numeric (unparseable values become NaN). An empty
        DataFrame is returned when no file could be loaded.
    """
    if data_dir is None:
        data_dir = base_dir

    # glob returns matches in arbitrary order; sort for reproducible output.
    player_files = sorted(glob.glob(os.path.join(data_dir, '*', '*', 'players.csv')))

    print(f"Found {len(player_files)} players.csv files")

    # Columns to coerce to numeric when present in a file.
    numeric_cols = ['Salary', 'AvgPointsPerGame', 'Projection', 'Projection_dfn', 'Actual_fpts']

    all_players = []

    for file in player_files:
        try:
            # Path layout: <data_dir>/<date>/<contest name>/players.csv
            contest_dir = os.path.dirname(file)
            contest_name = os.path.basename(contest_dir)
            date_str = os.path.basename(os.path.dirname(contest_dir))
            date = datetime.strptime(date_str, '%Y-%m-%d')

            df = pd.read_csv(file)

            # Parse game info into useful components
            if 'GameInfo' in df.columns:
                # Example format: "MIL@CHI 01/28/2021" -> away "MIL", home "CHI"
                matchup = df['GameInfo'].str.split('@')
                df['HomeTeam'] = matchup.str[1].str.split(' ').str[0]
                df['AwayTeam'] = matchup.str[0]
                # Only derive IsHome when there is a Team column to compare;
                # a missing column should not discard the whole file.
                if 'Team' in df.columns:
                    df['IsHome'] = df['Team'] == df['HomeTeam']

            for col in numeric_cols:
                if col in df.columns:
                    df[col] = pd.to_numeric(df[col], errors='coerce')

            # Add metadata
            df['Date'] = date
            df['Contest'] = contest_name

            all_players.append(df)

        except Exception as e:
            # Best effort: report the bad file and keep going.
            print(f"Error processing {file}: {e}")

    # pd.concat raises ValueError on an empty list, so handle "nothing
    # loaded" explicitly.
    if not all_players:
        return pd.DataFrame()

    return pd.concat(all_players, ignore_index=True)

##### Process all data into CSV files

In [7]:
# Main processing script
def process_all_data():
    """Run all three loaders, write each dataset to CSV, and return them.

    The four CSV files are written to the current working directory without
    the index. Progress and row counts are printed along the way.

    Returns:
        A dict with the keys ``'player_results'``, ``'contest_entries'``,
        ``'player_selections'`` and ``'player_projections'``, each mapping to
        the corresponding DataFrame.
    """
    print("Starting data processing...")

    # Load everything first, in a fixed order, so nothing is written to disk
    # unless every loader has returned.
    datasets = {'player_results': load_player_results()}
    datasets['contest_entries'], datasets['player_selections'] = load_contest_data()
    datasets['player_projections'] = load_player_projections()

    # Dataset key -> output file name.
    csv_targets = {
        'player_results': 'processed_player_results.csv',
        'contest_entries': 'processed_contest_entries.csv',
        'player_selections': 'processed_player_selections.csv',
        'player_projections': 'processed_player_projections.csv',
    }
    for key, csv_name in csv_targets.items():
        datasets[key].to_csv(csv_name, index=False)

    print("Data processing complete!")
    print(f"Saved {len(datasets['player_results'])} player results")
    print(f"Saved {len(datasets['contest_entries'])} contest entries")
    print(f"Saved {len(datasets['player_selections'])} player selections")
    print(f"Saved {len(datasets['player_projections'])} player projections")

    return datasets

# Run the processing. Side effects: prints progress and writes the four
# processed_*.csv files into the current working directory. `data` is the
# returned dict of DataFrames, keyed by 'player_results', 'contest_entries',
# 'player_selections' and 'player_projections'.
data = process_all_data()

Starting data processing...
Found 6 player_results.csv files
Found 86 contest-standings.csv files
Found 86 players.csv files
Data processing complete!
Saved 931 player results
Saved 1394152 contest entries
Saved 10981 player selections
Saved 16307 player projections
