# Data download and completion of the data set by interpolation

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np

# Build the stock price table: read the ticker list, download adjusted
# closing prices, drop tickers with too many gaps, and fill the remaining
# gaps by linear interpolation. The result is published as `data_stock`.

# Step 1: Load tickers from the specified CSV
tickers_df = pd.read_csv('/Users/xiannvying/Desktop/3A/FINA/Weefin_gestion_portefeuille/projet/portfolio.csv')
tickers = tickers_df['Symbol'].dropna().tolist()

# Step 2: Download price of stocks
# auto_adjust=False is passed explicitly: recent yfinance releases default to
# auto_adjust=True, in which case the 'Adj Close' column selected below is not
# returned and the lookup fails with a KeyError.
data = yf.download(
    tickers,
    start='2015-01-01',
    end='2023-01-01',
    auto_adjust=False,
)['Adj Close']

# Step 3: Time index for stocks
# Dates become 'YYYY-MM-DD' strings so they can be compared and sliced
# against the other datasets, which are formatted the same way.
data.index = data.index.strftime('%Y-%m-%d')

# Step 4: Remove stocks with more than 10% missing data
threshold = 0.1  # 10% missing data threshold
max_missing_allowed = int(threshold * len(data))
missing_per_ticker = data.isnull().sum()
data_filtered = data.loc[:, missing_per_ticker <= max_missing_allowed]
# Rows where every retained ticker has a value; not used further in this cell.
data_final = data_filtered.dropna()

# Step 5: Interpolate the remaining data
# NOTE: with pandas' default settings, linear interpolation does not fill
# values that precede a column's first valid observation, so Step 6 can still
# report missing values for tickers whose history starts after 2015-01-01.
data_interpolated = data_filtered.interpolate(method='linear', axis=0)

# Step 6: Verify if there is any remaining missing data
remaining_missing = data_interpolated.isnull().sum().sum()
if remaining_missing == 0:
    print("All data complete after filtering and interpolation!")
else:
    print(f"{remaining_missing} missing values remain after processing.")

# Final Step: Print the processed data
print(data_interpolated)
data_stock = data_interpolated

[**********************67%*******                ]  139 of 206 completed

# Bond data

In [None]:
import matplotlib.pyplot as plt
import yfinance as yf

# Fetch the US 10-Year Treasury bond yield (^TNX), report its average, plot
# it, and keep the closing yield series as `data_bond`.
#
# The window is fixed to the same dates used for the stock and S&P 500
# downloads in this notebook. A relative period such as "10y" moves with the
# day the notebook is run, so the overlap with the 2015-2023 stock data would
# shrink over time and eventually disappear.
bond = yf.Ticker("^TNX")
data_bond = bond.history(start='2015-01-01', end='2023-01-01')
# Same 'YYYY-MM-DD' string index as the other datasets.
data_bond.index = data_bond.index.strftime('%Y-%m-%d')

# Calculate average yield
average_yield = data_bond['Close'].mean()
print(f"Average Yield (10 Year): {average_yield}%")

# Plot yield trend
data_bond['Close'].plot(title="US 10-Year Treasury Bond Yield")
plt.xlabel("Date")
plt.ylabel("Yield (%)")
plt.show()

# Keep only the closing yield as a Series for the alignment steps.
data_bond = data_bond['Close']

# Benchmark SP500 data

In [None]:
# Download the S&P 500 benchmark over the same window as the stocks.
index = ['^GSPC']  # S&P 500 index
# auto_adjust=False is passed explicitly: recent yfinance releases default to
# auto_adjust=True, in which case the 'Adj Close' column selected below is not
# returned and the lookup fails with a KeyError.
data_sp500 = yf.download(
    index,
    start='2015-01-01',
    end='2023-01-01',
    auto_adjust=False,
)['Adj Close']
# Same 'YYYY-MM-DD' string index as the other datasets.
data_sp500.index = data_sp500.index.strftime('%Y-%m-%d')

## Processing

In [None]:
# Bring the stock and bond data onto one shared set of dates.

# The shared window runs from the later of the two first dates to the
# earlier of the two last dates.
first_dates = [data_stock.index.min(), data_bond.index.min()]
last_dates = [data_stock.index.max(), data_bond.index.max()]
common_start_date = max(first_dates)
common_end_date = min(last_dates)
shared_window = slice(common_start_date, common_end_date)

# Crop each dataset to the window, then fill gaps linearly.
data_stock = data_stock.loc[shared_window].interpolate(method='linear')
data_bond = data_bond.loc[shared_window].interpolate(method='linear')

# Keep only the dates present in both datasets.
data_stock, data_bond = data_stock.align(data_bond, join='inner', axis=0)

# Sanity checks: same number of rows and identical index labels.
assert len(data_stock) == len(data_bond), "Datasets are not aligned!"
assert (data_stock.index == data_bond.index).all(), "Indices do not match!"

print(f"Aligned Data Shape: {data_stock.shape}")


In [None]:
# Bring the stock, bond and S&P 500 data onto one shared set of dates.

# Align the date range for all datasets
common_start_date = max(data_stock.index.min(), data_bond.index.min(), data_sp500.index.min())
common_end_date = min(data_stock.index.max(), data_bond.index.max(), data_sp500.index.max())
data_stock = data_stock.loc[common_start_date:common_end_date]
data_bond = data_bond.loc[common_start_date:common_end_date]
data_sp500 = data_sp500.loc[common_start_date:common_end_date]

# Handle missing values
data_stock = data_stock.interpolate(method='linear')
data_bond = data_bond.interpolate(method='linear')
data_sp500 = data_sp500.interpolate(method='linear')

# Ensure alignment along the index for all three datasets
data_stock, data_bond = data_stock.align(data_bond, join='inner', axis=0)
data_stock, data_sp500 = data_stock.align(data_sp500, join='inner', axis=0)
# The join with the benchmark can drop dates from data_stock that data_bond
# still holds, so restrict the bond series to the final stock index. Every
# remaining label is present in data_bond because of the first join.
data_bond = data_bond.loc[data_stock.index]

# Verify alignment
assert data_stock.shape[0] == data_bond.shape[0] == data_sp500.shape[0], "Datasets are not aligned!"
assert all(data_stock.index == data_bond.index) and all(data_stock.index == data_sp500.index), "Indices do not match!"

print(f"Aligned Data Shape: {data_stock.shape}")
