In [1]:
# Install required packages (run once)
# !pip install lseg-data pandas numpy scipy scikit-learn matplotlib seaborn

# Import libraries
import lseg.data as ld
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


### Connect to LSEG Data

In [2]:
# Open LSEG session
ld.open_session(app_key="YOUR API")

<lseg.data.session.Definition object at 0x16ccc4250 {name='workspace'}>

### Get S&P 500 Universe

In [4]:
# Get S&P 500 constituents
print("Fetching S&P 100 constituents...")

constituents = ld.get_data(
    universe=['0#.OEX'],
    fields=['TR.CommonName', 'TR.GICSSector', 'TR.GICSIndustry']
)

# Clean the data
constituents = constituents.dropna()
constituents.columns = ['ticker', 'company_name', 'sector', 'industry']

print(f"\nRetrieved {len(constituents)} stocks")
print("\nFirst 5 stocks:")
print(constituents.head())

# Get list of tickers
tickers = constituents['ticker'].tolist()

Fetching S&P 100 constituents...

Retrieved 101 stocks

First 5 stocks:
     ticker                          company_name                  sector  \
0   GILD.OQ                   Gilead Sciences Inc             Health Care   
1     MRK.N                        Merck & Co Inc             Health Care   
2      PM.N       Philip Morris International Inc        Consumer Staples   
3     IBM.N  International Business Machines Corp  Information Technology   
4  GOOGL.OQ                          Alphabet Inc  Communication Services   

                       industry  
0                 Biotechnology  
1               Pharmaceuticals  
2                       Tobacco  
3                   IT Services  
4  Interactive Media & Services  


### Define Date Range

In [6]:
# Define date range (1 years of data)
end_date = datetime.now()
start_date = end_date - timedelta(days=365*2)

print(f"Data range: {start_date.date()} to {end_date.date()}")

Data range: 2023-12-20 to 2025-12-19


### Get Historical Price Data

In [7]:
# Get historical prices
print("Fetching historical prices...")

prices = ld.get_history(
    universe=tickers,  # Same 100 stocks
    fields='TR.PriceClose',
    interval='daily',
    start=start_date.strftime('%Y-%m-%d'),
    end=end_date.strftime('%Y-%m-%d')
)

print(f"\nPrice data shape: {prices.shape}")
print("\nFirst few rows:")


Fetching historical prices...

Price data shape: (501, 101)

First few rows:


### Get Returns Data

In [8]:
# Calculate returns
print("Fetching price data to calculate returns...")


# Calculate returns from prices
returns = prices.pct_change()

# Remove any remaining NaN columns (stocks with no data)
returns = returns.dropna(axis=1, how='all')

print(f"\nReturns shape after cleaning: {returns.shape}")
print(f"Date range: {returns.index[0]} to {returns.index[-1]}")
print(f"Number of trading days: {len(returns)}")
# print("\nFirst few rows:")
# print(returns.head())
# print("\nReturns statistics:")
# print(returns.describe())

Fetching price data to calculate returns...

Returns shape after cleaning: (501, 101)
Date range: 2023-12-20 00:00:00 to 2025-12-18 00:00:00
Number of trading days: 501


### Helper Functions for Factor Construction

In [9]:
def winsorize(series, lower=0.01, upper=0.99):
    """Remove extreme outliers"""
    return series.clip(
        lower=series.quantile(lower),
        upper=series.quantile(upper)
    )

def standardize(series):
    """Z-score standardization (mean=0, std=1)"""
    return (series - series.mean()) / series.std()

print("Helper functions defined!")

Helper functions defined!
