# MSCI Indices - Download and Fill Missing Days

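This notebook downloads daily MSCI index data from the `amedeos/Stock-Indexes-Historical-Data` GitHub repository, fills missing calendar days using backward fill (bfill), and saves each filled series as a CSV together with a plot in `data/msci`.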


In [None]:
import pandas as pd
import os
import requests
from io import StringIO
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [None]:
# Change to project root directory.
# The project root is the nearest ancestor of the current working directory
# (the working directory itself included) that contains a 'data' directory.
current_dir = os.getcwd()
# isdir rather than exists: a regular file that merely happens to be named
# 'data' must not be mistaken for the data folder.
while not os.path.isdir(os.path.join(current_dir, 'data')):
    parent_dir = os.path.dirname(current_dir)
    if parent_dir == current_dir:
        # dirname() returned its own input: filesystem root reached without a match
        raise FileNotFoundError("Could not find project root directory (looking for 'data' folder)")
    current_dir = parent_dir

os.chdir(current_dir)
print(f"Working directory: {os.getcwd()}")

In [None]:
# Base URL under which every repository-relative CSV path is resolved
github_base_url = "https://raw.githubusercontent.com/amedeos/Stock-Indexes-Historical-Data/main"

# One entry per index, in processing order:
# (index name, market path segment, 'Region' or 'Country' path segment).
# Only these three parts differ between the repository paths.
_index_specs = [
    ('ACWI', 'ALL-COUNTRY-DM-EM', 'Region'),
    ('WORLD', 'DEVELOPED-MARKETS-DM', 'Region'),
    ('ITALY', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('EUROPE', 'DEVELOPED-MARKETS-DM', 'Region'),
    ('EMU', 'DEVELOPED-MARKETS-DM', 'Region'),
    ('GERMANY', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('FRANCE', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('SPAIN', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('UK', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('NETHERLANDS', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('SWEDEN', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('DENMARK', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('NORWAY', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('FINLAND', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('SWITZERLAND', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('AUSTRIA', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('BELGIUM', 'DEVELOPED-MARKETS-DM', 'Country'),
    ('PORTUGAL', 'DEVELOPED-MARKETS-DM', 'Country'),
]

# Maps index name -> CSV path relative to github_base_url
files_to_download = {
    spec_name: f"DAILY/NET/EUR/{market}/{scope}/NONE/NONE/STANDARD-LARGE-MID-CAP/{spec_name}.csv"
    for spec_name, market, scope in _index_specs
}


In [None]:
# Folder (relative to the working directory) that receives the processed
# CSV files and plots; created up front, no error if it already exists.
output_dir = 'data/msci'
os.makedirs(output_dir, exist_ok=True)

# Echo the configuration: the destination, then one line per index to fetch
print(f"Output directory: {output_dir}\n\nFiles to download:")
for name, path in files_to_download.items():
    print("  " + f"{name}: {path}")

In [None]:
# Download and process each file.
#
# For every index: download its CSV, parse it, expand it to one row per
# calendar day between the first and last observed date, backward-fill the
# added rows, save the result as CSV, then plot the first numeric column.
# A failure on one index is reported and skipped so the others still run.

# Lower-cased column names recognised as the date column
date_column_names = {'date', 'time_period', 'time', 'timestamp'}

# Seconds to wait on the server; requests waits indefinitely when no timeout is given
request_timeout = 30

for index_name, file_path in files_to_download.items():
    print(f"\n{'='*60}")
    print(f"Processing {index_name}...")
    print(f"{'='*60}")

    # Construct the full URL
    file_url = f"{github_base_url}/{file_path}"
    print(f"Downloading from: {file_url}")

    # Download the file (timeouts are RequestException subclasses, so they
    # are reported and skipped like any other download error)
    try:
        response = requests.get(file_url, timeout=request_timeout)
        response.raise_for_status()
        print(f"Download successful (size: {len(response.content)} bytes)")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        continue

    # Read CSV into pandas; an empty or malformed body skips this index
    # instead of aborting the whole run
    try:
        df = pd.read_csv(StringIO(response.text))
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Error parsing CSV: {e}")
        continue
    print(f"\nOriginal data shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"\nFirst few rows:")
    print(df.head())

    # Identify the date column (common names: Date, DATE, date, TIME_PERIOD, etc.)
    date_col = next(
        (col for col in df.columns if col.lower() in date_column_names),
        None,
    )
    if date_col is None:
        print(f"Warning: Could not identify date column. Assuming first column is date.")
        date_col = df.columns[0]

    print(f"\nUsing '{date_col}' as date column")

    # Convert date column to datetime; unparseable values become NaT
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    # Rows without a usable date cannot be placed on the calendar
    unparsed_rows = int(df[date_col].isna().sum())
    if unparsed_rows:
        print(f"Warning: dropping {unparsed_rows} row(s) with an unparseable date")
        df = df.dropna(subset=[date_col])

    # reindex() raises on duplicate labels, so keep one row per date
    duplicate_rows = int(df[date_col].duplicated(keep='last').sum())
    if duplicate_rows:
        print(f"Warning: dropping {duplicate_rows} duplicate date row(s), keeping the last occurrence")
        df = df.drop_duplicates(subset=[date_col], keep='last')

    if df.empty:
        print(f"Warning: no rows with a valid date for {index_name}; skipping.")
        continue

    # Check for missing dates
    print(f"\nDate range: {df[date_col].min()} to {df[date_col].max()}")
    print(f"Total rows: {len(df)}")

    # Create a complete date range (daily)
    date_range = pd.date_range(start=df[date_col].min(), end=df[date_col].max(), freq='D')
    print(f"Expected daily rows: {len(date_range)}")
    print(f"Missing days: {len(date_range) - len(df)}")

    # Set date as index
    df_indexed = df.set_index(date_col)

    # Reindex to include all days; days absent from the source become all-NaN rows
    df_complete = df_indexed.reindex(date_range)

    # Apply backward fill (bfill): each missing value takes the next available value
    df_filled = df_complete.bfill()

    # Reset index to have date as a column again. Naming the index first makes
    # reset_index() emit the column under the right name directly, without
    # relying on the default 'index' label.
    df_filled = df_filled.rename_axis(date_col).reset_index()

    print(f"\nAfter filling missing days:")
    print(f"Total rows: {len(df_filled)}")
    print(f"Missing values per column:")
    print(df_filled.isnull().sum())

    # Save to output directory
    output_file = os.path.join(output_dir, f"{index_name}.csv")
    df_filled.to_csv(output_file, index=False)
    print(f"\nSaved to: {output_file}")
    print(f"Final shape: {df_filled.shape}")

    # Identify the value column (first numeric column that is not the date)
    value_col = next(
        (
            col for col in df_filled.columns
            if col != date_col and pd.api.types.is_numeric_dtype(df_filled[col])
        ),
        None,
    )
    if value_col is None:
        print(f"Warning: Could not identify value column for plotting.")
        continue

    # Create plot
    fig = plt.figure(figsize=(12, 6))
    plt.plot(df_filled[date_col], df_filled[value_col], linewidth=1.5)
    plt.title(f'MSCI {index_name} Index - Historical Performance', fontsize=14, fontweight='bold')
    plt.xlabel('Date', fontsize=12)
    plt.ylabel(f'{value_col}', fontsize=12)
    plt.grid(True, alpha=0.3)

    # Format x-axis dates: one tick per year, labelled as year-month
    axis = plt.gca()
    axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    axis.xaxis.set_major_locator(mdates.YearLocator())
    plt.xticks(rotation=45)

    # Lay out only after the tick labels have their final format and rotation
    plt.tight_layout()

    # Save plot
    plot_file = os.path.join(output_dir, f"{index_name}_plot.png")
    plt.savefig(plot_file, dpi=150, bbox_inches='tight')
    print(f"Plot saved to: {plot_file}")

    plt.show()
    # Release the figure so figures do not accumulate across iterations
    plt.close(fig)
