<a href="https://colab.research.google.com/github/akinahomwabella/Akinahom-Portfolio/blob/main/Financial%20Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# Step 1: Retrieve Financial Data
def get_financial_data(tickers, start_date, end_date):
    data = {}
    for ticker in tickers:
        df = yf.download(ticker, start=start_date, end=end_date)
        df['Ticker'] = ticker
        data[ticker] = df
    return pd.concat(data.values())

# Step 2: Introduce Missing Values
def introduce_missing_values(df, column, missing_ratio):
    np.random.seed(42)
    missing_indices = np.random.choice(df.index, size=int(len(df) * missing_ratio), replace=False)
    df.loc[missing_indices, column] = np.nan
    return df

# Step 3: Interpolation Methods (Updated to Handle NaN Robustly)
def apply_interpolation(df, method, column):
    if method == 'linear':
        return df[column].interpolate(method='linear')
    elif method == 'spline':
        return df[column].interpolate(method='spline', order=3)
    elif method == 'polynomial':
        return df[column].interpolate(method='polynomial', order=3)
    elif method == 'knn':
        # Check if the column has at least one non-NaN value
        if df[column].notna().sum() > 0:
            imputer = KNNImputer(n_neighbors=5)
            imputed_values = imputer.fit_transform(df[[column]])
            return pd.Series(imputed_values.flatten(), index=df.index)
        else:
            print(f"Skipping KNN interpolation for {column} due to insufficient valid data.")
            return df[column]  # Return as-is if no valid data
    elif method == 'lagrange':
        return df[column].interpolate(method='polynomial', order=min(3, len(df) - 1))
    else:
        raise ValueError("Invalid interpolation method")


# Step 4: Evaluate Interpolation Methods
def evaluate_methods(df, column, original_values):
    metrics = {}
    methods = ['linear', 'spline', 'polynomial', 'knn', 'lagrange']
    results = {}

    for method in methods:
        # Create a copy of the DataFrame
        interpolated_values = apply_interpolation(df.copy(), method, column)
        results[method.capitalize()] = interpolated_values

        # Evaluate the imputation quality
        mae = mean_absolute_error(original_values, interpolated_values)
        rmse = np.sqrt(mean_squared_error(original_values, interpolated_values))
        metrics[method.capitalize()] = {'MAE': mae, 'RMSE': rmse}

    return metrics, results

# Step 5: Visualize Results
def plot_results(metrics, title):
    methods = list(metrics.keys())
    mae_values = [metrics[method]['MAE'] for method in methods]
    rmse_values = [metrics[method]['RMSE'] for method in methods]

    x = np.arange(len(methods))
    width = 0.35

    plt.figure(figsize=(10, 6))
    plt.bar(x - width/2, mae_values, width, label='MAE')
    plt.bar(x + width/2, rmse_values, width, label='RMSE')

    plt.xlabel('Interpolation Methods')
    plt.ylabel('Error Metrics')
    plt.title(title)
    plt.xticks(x, methods)
    plt.legend()
    plt.show()

# Step 6: Machine Learning Evaluation
def train_ml_models(df, feature_column, target_column):
    df = df.dropna()
    X = df[[feature_column]].values
    y = df[target_column].values

    rf_model = RandomForestRegressor()
    gb_model = GradientBoostingRegressor()

    rf_model.fit(X, y)
    gb_model.fit(X, y)

    rf_predictions = rf_model.predict(X)
    gb_predictions = gb_model.predict(X)

    rf_mae = mean_absolute_error(y, rf_predictions)
    rf_rmse = np.sqrt(mean_squared_error(y, rf_predictions))

    gb_mae = mean_absolute_error(y, gb_predictions)
    gb_rmse = np.sqrt(mean_squared_error(y, gb_predictions))

    return {
        'Random Forest': {'MAE': rf_mae, 'RMSE': rf_rmse},
        'Gradient Boosting': {'MAE': gb_mae, 'RMSE': gb_rmse}
    }
# Main Script
tickers = ['AAPL', 'TSLA', 'META', 'MSFT']
data = get_financial_data(tickers, '2021-01-01', '2024-01-01')

# Focus on a specific ticker (e.g., TSLA)
tsla_data = data[data['Ticker'] == 'TSLA']

# Save original values for comparison BEFORE introducing missing values
original_values = tsla_data['Close'].copy()

# Introduce missing values
tsla_data = introduce_missing_values(tsla_data, 'Close', 0.1)

# Check if there is sufficient non-NaN data for comparison
if original_values.isna().sum() == len(original_values):
    raise ValueError("Original values contain all NaNs, cannot evaluate interpolation methods.")

# Apply interpolation methods and evaluate
methods = ['linear', 'spline', 'polynomial', 'knn', 'lagrange']
metrics, interpolated_results = evaluate_methods(tsla_data.copy(), 'Close', original_values)

# Add interpolated columns back to the DataFrame
for method, values in interpolated_results.items():
    tsla_data[method] = values

# Plot the results
plot_results(metrics, "Interpolation Method Performance (TSLA)")

# Machine Learning Evaluation
ml_metrics = train_ml_models(tsla_data.dropna(), 'Linear', 'Close')
print("Machine Learning Metrics:\n", ml_metrics)



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().