<a href="https://colab.research.google.com/github/anirbanghoshsbi/.github.io/blob/master/work/temp_hosting/feature.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install vecstack==0.4.0 -q

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
#from xgboost import XGBClassifier
from vecstack import stacking
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
import numpy as np
import pandas as pd

from statsmodels.tsa.stattools import adfuller

data = pd.read_csv('https://raw.githubusercontent.com/anirbanghoshsbi/.github.io/master/work/Composite_data.csv')

In [3]:
data_ffill=data.ffill()

In [4]:
data_ffill.set_index('date',inplace=True)

In [5]:
data_ffill.tail(2)

Unnamed: 0_level_0,FIICash,DIIcash,Percentage_above_5_dma,Percentage_above_10_dma,Above_15_dma,Percentage_above_20_dma,VIX,GoldPrice,IN10YR,IN10YR_yield,open,high,low,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
29-04-2024,169.0,692.0,72,64,58,54,12.2125,61.299999,99.871,7.2,22475.551,22655.801,22441.9,22643.4
30-04-2024,1072.0,1430.0,62,62,60,60,12.87,60.880001,99.888,7.195,22679.65,22783.35,22568.4,22604.85


In [6]:
data_ffill.columns

Index(['FIICash', 'DIIcash', 'Percentage_above_5_dma',
       'Percentage_above_10_dma', 'Above_15_dma', 'Percentage_above_20_dma',
       'VIX', 'GoldPrice', 'IN10YR', 'IN10YR_yield', 'open', 'high', 'low',
       'close'],
      dtype='object')

In [7]:
def create_moving_average(df,windows,feature_name):
   for window in windows:
      column_name = f'{feature_name}MA_{window}'
      df[column_name] = df[feature_name].rolling(window=window).mean()
      column_name = f'{feature_name}ROC_{window}'
      df[column_name] = (df[feature_name] - df[feature_name].shift(1)) / df[feature_name].shift(1)

   return df


def create_lagged_features(df, lag, feature_name):
    """
    Create lagged versions of a variable in a DataFrame.

    Parameters:
        df (DataFrame): The DataFrame containing the variable.
        feature_name (str): The name of the column for which lagged versions will be created.
        lag (int): The number of lags to create.

    Returns:
        DataFrame: The DataFrame with lagged features added.
    """
    # Create a copy of the DataFrame to avoid modifying the original DataFrame
    df_copy = df.copy()

    # Create lagged versions of the variable
    for i in range(1, lag + 1):
        df_copy[f'{feature_name}_lag_{i}'] = df_copy[feature_name].shift(i)

    # Drop rows with NaN values introduced by shifting
    df_copy.dropna(inplace=True)

    return df_copy




def calculate_macd(df,feature_name,short_window=12, long_window=26, signal_window=9):
    """
    Calculate the Moving Average Convergence Divergence (MACD) indicator for a DataFrame.

    Parameters:
        df (DataFrame): The DataFrame containing the price data.
        short_window (int): The short moving average window (default: 12).
        long_window (int): The long moving average window (default: 26).
        signal_window (int): The signal line window (default: 9).

    Returns:
        DataFrame: The DataFrame with MACD values added.
    """
    # Calculate short and long Exponential Moving Averages (EMAs)
    short_ema = df[feature_name].ewm(span=short_window, min_periods=1, adjust=False).mean()
    long_ema = df[feature_name].ewm(span=long_window, min_periods=1, adjust=False).mean()

    # Calculate MACD line
    macd_line = short_ema - long_ema

    # Calculate signal line
    signal_line = macd_line.ewm(span=signal_window, min_periods=1, adjust=False).mean()

    # Calculate MACD histogram
    macd_histogram = macd_line - signal_line

    # Add MACD values to the DataFrame
    df[f'{feature_name}_MACD_Line_{short_window}'] = macd_line
    df[f'{feature_name}_Signal_Line_{short_window}'] = signal_line
    df[f'{feature_name}_MACD_Histogram_{short_window}'] = macd_histogram

    return df




In [8]:
def calculate_historical_volatility(df,feature_name, period=14):
    """
    Calculate the historical volatility for a DataFrame using only close prices.

    Parameters:
        df (DataFrame): The DataFrame containing close prices.
        period (int): The period for calculating the historical volatility (default: 14).

    Returns:
        DataFrame: The DataFrame with historical volatility values added.
    """
    # Calculate the percentage change in close prices
    df['returns'] = df[feature_name].pct_change() * 100  # Calculate returns as percentage

    # Calculate the rolling standard deviation of the returns
    df[f'{feature_name}_historical_volatility_{period}'] = df['returns'].rolling(window=period).std()

    # Drop intermediate column
    df.drop('returns', axis=1, inplace=True)

    return df



def adf_test(df, window=20):
    """
    Calculate Augmented Dickey-Fuller (ADF) test statistics for a DataFrame.

    Parameters:
        df (DataFrame): The DataFrame containing the time series data.
        window (int): The window size for calculating rolling statistics (default: 20).

    Returns:
        DataFrame: The DataFrame with ADF test statistics added.
    """
    # Calculate rolling mean and rolling standard deviation
    rolling_mean = df['close'].rolling(window=window).mean()
    rolling_std = df['close'].rolling(window=window).std()

    # Perform ADF test
    adf_results = adfuller(df['close'])
    df['adf_test_statistic'] = adf_results[0]
    df['adf_test_p_value'] = adf_results[1]

    return df


def calculate_trend_indicators(df,feature_name,window=5):
    """
    Calculate trend indicators for a DataFrame using linear regression.

    Parameters:
        df (DataFrame): The DataFrame containing the time series data.
        window (int): The window size for linear regression (default: 20).

    Returns:
        DataFrame: The DataFrame with trend indicators added.
    """
    # Calculate linear regression coefficients
    df[f'{feature_name}_{window}_trend_slope'] = np.nan
    df[f'{feature_name}_{window}_trend_intercept'] = np.nan

    for i in range(window, len(df)):
        x = np.arange(window)
        y = df[feature_name].values[i - window:i]
        slope, intercept = np.polyfit(x, y, 1)
        df.at[df.index[i], f'{feature_name}_{window}_trend_slope'] = slope
        df.at[df.index[i], f'{feature_name}_{window}_trend_intercept'] = intercept

    return df








In [9]:
feature_name=data_ffill.columns.tolist()

In [10]:
def corr_rolling_feat_gen(df,feature_name,window_size):
    # Calculate the rolling correlation with a specified window size

    df[f'{feature_name}_Rolling_Corr_close'] = df[feature_name].rolling(window=window_size).corr(df['close'])
    return df



In [11]:
df=data_ffill

In [12]:
window_size = [5,10,15,20,25]


In [13]:
df=calculate_trend_indicators(df,'FIICash',window=5)
df=calculate_trend_indicators(df,'DIIcash',window=5)
df=calculate_trend_indicators(df,'Percentage_above_5_dma',window=5)
df=calculate_trend_indicators(df,'Percentage_above_10_dma',window=5)
df=calculate_trend_indicators(df,'Percentage_above_20_dma',window=5)
df=calculate_trend_indicators(df,'Above_15_dma',window=5)
df=calculate_trend_indicators(df,'VIX',window=5)
df=calculate_trend_indicators(df,'IN10YR',window=5)
df=calculate_trend_indicators(df,'IN10YR_yield',window=5)

df=calculate_trend_indicators(df,'FIICash',window=10)
df=calculate_trend_indicators(df,'DIIcash',window=10)
df=calculate_trend_indicators(df,'Percentage_above_5_dma',window=10)
df=calculate_trend_indicators(df,'Percentage_above_10_dma',window=10)
df=calculate_trend_indicators(df,'Percentage_above_20_dma',window=10)
df=calculate_trend_indicators(df,'Above_15_dma',window=10)
df=calculate_trend_indicators(df,'VIX',window=10)
df=calculate_trend_indicators(df,'IN10YR',window=10)
df=calculate_trend_indicators(df,'IN10YR_yield',window=10)



In [14]:
df=corr_rolling_feat_gen(df,'FIICash',10)
df=corr_rolling_feat_gen(df,'DIIcash',10)
df=corr_rolling_feat_gen(df,'Percentage_above_5_dma',10)
df=corr_rolling_feat_gen(df,'Percentage_above_10_dma',10)
df=corr_rolling_feat_gen(df,'Percentage_above_20_dma',10)
df=corr_rolling_feat_gen(df,'Above_15_dma',10)
df=corr_rolling_feat_gen(df,'VIX',10)
df=corr_rolling_feat_gen(df,'IN10YR',10)
df=corr_rolling_feat_gen(df,'IN10YR_yield',10)


In [15]:
df=calculate_historical_volatility(df,'FIICash',10)
df=calculate_historical_volatility(df,'DIIcash',10)
df=calculate_historical_volatility(df,'Percentage_above_5_dma',10)
df=calculate_historical_volatility(df,'Percentage_above_10_dma',10)
df=calculate_historical_volatility(df,'Percentage_above_20_dma',10)
df=calculate_historical_volatility(df,'Above_15_dma',10)
df=calculate_historical_volatility(df,'IN10YR',10)
df=calculate_historical_volatility(df,'IN10YR_yield',10)

df=calculate_historical_volatility(df,'FIICash',15)
df=calculate_historical_volatility(df,'DIIcash',15)
df=calculate_historical_volatility(df,'Percentage_above_5_dma',15)
df=calculate_historical_volatility(df,'Percentage_above_10_dma',15)
df=calculate_historical_volatility(df,'Percentage_above_20_dma',15)
df=calculate_historical_volatility(df,'Above_15_dma',15)
df=calculate_historical_volatility(df,'IN10YR',15)
df=calculate_historical_volatility(df,'IN10YR_yield',15)

df=calculate_historical_volatility(df,'FIICash',20)
df=calculate_historical_volatility(df,'DIIcash',20)
df=calculate_historical_volatility(df,'Percentage_above_5_dma',20)
df=calculate_historical_volatility(df,'Percentage_above_10_dma',20)
df=calculate_historical_volatility(df,'Percentage_above_20_dma',20)
df=calculate_historical_volatility(df,'Above_15_dma',20)
df=calculate_historical_volatility(df,'IN10YR',20)
df=calculate_historical_volatility(df,'IN10YR_yield',20)


In [None]:
df=calculate_macd(df,'FIICash',short_window=12, long_window=26, signal_window=9)
df=calculate_macd(df,'FIICash',short_window=int(12*0.5), long_window=int(26*0.5), signal_window=int(9*0.5))
df=calculate_macd(df,'FIICash',short_window=int(12*0.75), long_window=int(26*0.75), signal_window=int(9*0.75))

df=calculate_macd(df,'FIICash',short_window=int(12*1.25), long_window=int(26*1.25), signal_window=int(9*1.25))
df=calculate_macd(df,'FIICash',short_window=int(12*1.50), long_window=int(26*1.50), signal_window=int(9*1.25))
df=calculate_macd(df,'FIICash',short_window=int(9*1.75), long_window=int(9*1.75), signal_window=int(9*1.75))
df=calculate_macd(df,'FIICash',short_window=int(12*2), long_window=int(26*2), signal_window=int(9*2))


df=calculate_macd(df,'DIIcash',short_window=12, long_window=26, signal_window=9)
df=calculate_macd(df,'DIIcash',short_window=int(12*0.5), long_window=int(26*0.5), signal_window=int(9*0.5))
df=calculate_macd(df,'DIIcash',short_window=int(12*0.75), long_window=int(26*0.75), signal_window=int(9*0.75))

df=calculate_macd(df,'DIIcash',short_window=int(12*1.25), long_window=int(26*1.25), signal_window=int(9*1.25))
df=calculate_macd(df,'DIIcash',short_window=int(12*1.50), long_window=int(26*1.50), signal_window=int(9*1.25))
df=calculate_macd(df,'DIIcash',short_window=int(9*1.75), long_window=int(9*1.75), signal_window=int(9*1.75))
df=calculate_macd(df,'DIIcash',short_window=int(12*2), long_window=int(26*2), signal_window=int(9*2))


df=calculate_macd(df,'Percentage_above_5_dma',short_window=12, long_window=26, signal_window=9)
df=calculate_macd(df,'Percentage_above_5_dma',short_window=int(12*0.5), long_window=int(26*0.5), signal_window=int(9*0.5))
df=calculate_macd(df,'Percentage_above_5_dma',short_window=int(12*0.75), long_window=int(26*0.75), signal_window=int(9*0.75))

df=calculate_macd(df,'Percentage_above_5_dma',short_window=int(12*1.25), long_window=int(26*1.25), signal_window=int(9*1.25))
df=calculate_macd(df,'Percentage_above_5_dma',short_window=int(12*1.50), long_window=int(26*1.50), signal_window=int(9*1.25))
df=calculate_macd(df,'Percentage_above_5_dma',short_window=int(9*1.75), long_window=int(9*1.75), signal_window=int(9*1.75))
df=calculate_macd(df,'Percentage_above_5_dma',short_window=int(12*2), long_window=int(26*2), signal_window=int(9*2))


df=calculate_macd(df,'Percentage_above_10_dma',short_window=12, long_window=26, signal_window=9)
df=calculate_macd(df,'Percentage_above_10_dma',short_window=int(12*0.5), long_window=int(26*0.5), signal_window=int(9*0.5))
df=calculate_macd(df,'Percentage_above_10_dma',short_window=int(12*0.75), long_window=int(26*0.75), signal_window=int(9*0.75))

df=calculate_macd(df,'Percentage_above_10_dma',short_window=int(12*1.25), long_window=int(26*1.25), signal_window=int(9*1.25))
df=calculate_macd(df,'Percentage_above_10_dma',short_window=int(12*1.50), long_window=int(26*1.50), signal_window=int(9*1.25))
df=calculate_macd(df,'Percentage_above_10_dma',short_window=int(9*1.75), long_window=int(9*1.75), signal_window=int(9*1.75))
df=calculate_macd(df,'Percentage_above_10_dma',short_window=int(12*2), long_window=int(26*2), signal_window=int(9*2))

df=calculate_macd(df,'Percentage_above_20_dma',short_window=12, long_window=26, signal_window=9)
df=calculate_macd(df,'Percentage_above_20_dma',short_window=int(12*0.5), long_window=int(26*0.5), signal_window=int(9*0.5))
df=calculate_macd(df,'Percentage_above_20_dma',short_window=int(12*0.75), long_window=int(26*0.75), signal_window=int(9*0.75))

df=calculate_macd(df,'Percentage_above_20_dma',short_window=int(12*1.25), long_window=int(26*1.25), signal_window=int(9*1.25))
df=calculate_macd(df,'Percentage_above_20_dma',short_window=int(12*1.50), long_window=int(26*1.50), signal_window=int(9*1.25))
df=calculate_macd(df,'Percentage_above_20_dma',short_window=int(9*1.75), long_window=int(9*1.75), signal_window=int(9*1.75))
df=calculate_macd(df,'Percentage_above_20_dma',short_window=int(12*2), long_window=int(26*2), signal_window=int(9*2))

df=calculate_macd(df,'Above_15_dma',short_window=12, long_window=26, signal_window=9)
df=calculate_macd(df,'Above_15_dma',short_window=int(12*0.5), long_window=int(26*0.5), signal_window=int(9*0.5))
df=calculate_macd(df,'Above_15_dma',short_window=int(12*0.75), long_window=int(26*0.75), signal_window=int(9*0.75))

df=calculate_macd(df,'Above_15_dma',short_window=int(12*1.25), long_window=int(26*1.25), signal_window=int(9*1.25))
df=calculate_macd(df,'Above_15_dma',short_window=int(12*1.50), long_window=int(26*1.50), signal_window=int(9*1.25))
df=calculate_macd(df,'Above_15_dma',short_window=int(9*1.75), long_window=int(9*1.75), signal_window=int(9*1.75))
df=calculate_macd(df,'Above_15_dma',short_window=int(12*2), long_window=int(26*2), signal_window=int(9*2))

df=calculate_macd(df,'IN10YR',short_window=12, long_window=26, signal_window=9)
df=calculate_macd(df,'IN10YR',short_window=int(12*0.5), long_window=int(26*0.5), signal_window=int(9*0.5))
df=calculate_macd(df,'IN10YR',short_window=int(12*0.75), long_window=int(26*0.75), signal_window=int(9*0.75))

df=calculate_macd(df,'IN10YR',short_window=int(12*1.25), long_window=int(26*1.25), signal_window=int(9*1.25))
df=calculate_macd(df,'IN10YR',short_window=int(12*1.50), long_window=int(26*1.50), signal_window=int(9*1.25))
df=calculate_macd(df,'IN10YR',short_window=int(9*1.75), long_window=int(9*1.75), signal_window=int(9*1.75))
df=calculate_macd(df,'IN10YR',short_window=int(12*2), long_window=int(26*2), signal_window=int(9*2))

df=calculate_macd(df,'IN10YR_yield',short_window=12, long_window=26, signal_window=9)
df=calculate_macd(df,'IN10YR_yield',short_window=int(12*0.5), long_window=int(26*0.5), signal_window=int(9*0.5))
df=calculate_macd(df,'IN10YR_yield',short_window=int(12*0.75), long_window=int(26*0.75), signal_window=int(9*0.75))

df=calculate_macd(df,'IN10YR_yield',short_window=int(12*1.25), long_window=int(26*1.25), signal_window=int(9*1.25))
df=calculate_macd(df,'IN10YR_yield',short_window=int(12*1.50), long_window=int(26*1.50), signal_window=int(9*1.25))
df=calculate_macd(df,'IN10YR_yield',short_window=int(9*1.75), long_window=int(9*1.75), signal_window=int(9*1.75))
df=calculate_macd(df,'IN10YR_yield',short_window=int(12*2), long_window=int(26*2), signal_window=int(9*2))



In [17]:
data_bfill=df.bfill()

In [18]:
features=data_bfill.columns.tolist()

In [19]:
days_out=6

In [20]:
close_groups = data_bfill['close'].transform(lambda x : np.sign(x.diff(days_out)))
data_bfill['Flag'] = close_groups

  data_bfill['Flag'] = close_groups


In [21]:
X_Cols = data_bfill[features]
Y_Cols = data_bfill['Flag'].fillna(1)
samp_train= int(len(data_bfill)*.80)

X_train = X_Cols[:samp_train]
X_test = X_Cols[samp_train:]
y_train = Y_Cols[:samp_train]
y_test = Y_Cols[samp_train:]

In [None]:
X_train[~np.isfinite(X_train)] = 0.01  # Replace infinite values with NaN
#max_values = np.nanmax(X_train, axis=0)  # Calculate maximum value for each column
#X_train[np.isinf(X_train)] = max_values[np.isinf(X_train)]  # Replace infinite values with maximum value


In [23]:
# Instantiate the RobustScaler
scaler = RobustScaler()

# Fit the scaler to the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
models_L1 = [
    svm.SVC(kernel='linear'),
    RandomForestClassifier(random_state=0, n_jobs=-1,
                           n_estimators=1000, max_depth=2,class_weight='balanced'),
    ]

In [25]:
model_L1_0 = models_L1[0]
_ = model_L1_0.fit(X_train_scaled, y_train)
# save model in file if you need

model_L1_1 = models_L1[1]
_ = model_L1_1.fit(X_train, y_train)#sample_weight=sample_w
# save model in file if you need


In [31]:
importances = model_L1_1.feature_importances_
data = {'feature_name': features, 'importance': importances*100}
feature_importance= pd.DataFrame(data)

In [34]:
feat_importance = feature_importance.sort_values(by='importance')
feat_importance.to_csv('feat_data.csv')

In [28]:
S_train, S_test = stacking(models_L1,                     # list of models
                           X_train, y_train, X_test,   # data
                           regression=False,           # classification task (if you need
                                                       #     regression - set to True)
                           mode='oof',                 # mode: oof for train set, predict test
                                                       #mode='oof_pred_bag'(for training) and oof for inference
                                                      #     set in each fold and vote
                           needs_proba=False,          # predict class labels (if you need
                                                       #     probabilities - set to True)
                           save_dir=None,              # do not save result and log (to save
                                                       #     in current dir - set to '.')
                           metric=accuracy_score,      # metric: callable
                           n_folds=2,                  # number of folds
                           stratified=True,            # stratified split for folds
                           shuffle=False,               # shuffle the data
                           random_state=None,             # ensure reproducibility
                           verbose=2)                  # print all info

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof]
n_models:     [2]

model  0:     [SVC]
    fold  0:  [0.68495575]
    fold  1:  [0.57978723]
    ----
    MEAN:     [0.63237149] + [0.05258426]
    FULL:     [0.63241807]

model  1:     [RandomForestClassifier]
    fold  0:  [0.75221239]
    fold  1:  [0.76418440]
    ----
    MEAN:     [0.75819839] + [0.00598600]
    FULL:     [0.75819309]

