In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,confusion_matrix,classification_report
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


In [119]:
trades=pd.read_csv("D:\Datascience Task\project market trading\data\historical_data.csv")
sentiment = pd.read_csv("data/fear_greed_index.csv")

In [120]:
sentiment.head()

Unnamed: 0,timestamp,value,classification,date
0,1517463000,30,Fear,2018-02-01
1,1517549400,15,Extreme Fear,2018-02-02
2,1517635800,40,Fear,2018-02-03
3,1517722200,24,Extreme Fear,2018-02-04
4,1517808600,11,Extreme Fear,2018-02-05


In [121]:
sentiment['date']=pd.to_datetime(sentiment['date'])
trades['Timestamp IST'] = pd.to_datetime(
    trades['Timestamp IST'],
    format="%d-%m-%Y %H:%M",
    errors='coerce'
)
trades['date'] = trades['Timestamp IST'].dt.floor('D')


In [122]:
# Long / Short flag
trades['is_long'] = (trades['Side'].str.upper() == 'BUY').astype(int)

# Win flag
trades['is_win'] = (trades['Closed PnL'] > 0).astype(int)

daily_trader_metrics = trades.groupby(
    ['date', 'Account']
).agg(
    daily_pnl=('Closed PnL', 'sum'),
    win_rate=('is_win', 'mean'),
    avg_trade_size_usd=('Size USD', 'mean'),
    trades_per_day=('Trade ID', 'count'),
    long_ratio=('is_long', 'mean')
).reset_index()

daily_trader_metrics.head()


Unnamed: 0,date,Account,daily_pnl,win_rate,avg_trade_size_usd,trades_per_day,long_ratio
0,2023-05-01,0x3998f134d6aaa2b6a5f723806d00fd2bbbbce891,0.0,0.0,159.0,3,1.0
1,2023-12-05,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,0.0,0.0,5556.203333,9,0.777778
2,2023-12-14,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,-205.434737,0.363636,10291.213636,11,0.454545
3,2023-12-15,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,-24.632034,0.0,5304.975,2,1.0
4,2023-12-16,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,0.0,0.0,5116.256667,3,1.0


In [123]:
daily_trader_metrics = daily_trader_metrics.merge(
    sentiment[['date', 'classification']],
    on='date',
    how='inner'
)
daily_trader_metrics

Unnamed: 0,date,Account,daily_pnl,win_rate,avg_trade_size_usd,trades_per_day,long_ratio,classification
0,2023-05-01,0x3998f134d6aaa2b6a5f723806d00fd2bbbbce891,0.000000,0.000000,159.000000,3,1.000000,Greed
1,2023-12-05,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,0.000000,0.000000,5556.203333,9,0.777778,Extreme Greed
2,2023-12-14,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,-205.434737,0.363636,10291.213636,11,0.454545,Greed
3,2023-12-15,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,-24.632034,0.000000,5304.975000,2,1.000000,Greed
4,2023-12-16,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,0.000000,0.000000,5116.256667,3,1.000000,Greed
...,...,...,...,...,...,...,...,...
2335,2025-05-01,0xa0feb3725a9335f49874d7cd8eaad6be45b27416,1449.529436,0.350694,1130.887083,288,0.649306,Neutral
2336,2025-05-01,0xb1231a4a2dd02f2276fa3c5e2a2f3436e6bfed23,102460.171640,0.780822,3462.110685,73,0.780822,Neutral
2337,2025-05-01,0xbaaaf6571ab7d571043ff1e313a9609a10637864,1.860320,1.000000,3.900000,1,0.000000,Neutral
2338,2025-05-01,0xbd5fead7180a9c139fa51a103cb6a2ce86ddb5c3,-113601.020138,0.008621,16681.467759,116,0.655172,Neutral


In [124]:
# Sort properly
daily_trader_metrics = daily_trader_metrics.sort_values(
    ['Account', 'date']
)

# Next-day PnL
daily_trader_metrics['next_day_pnl'] = daily_trader_metrics.groupby(
    'Account'
)['daily_pnl'].shift(-1)

# Binary target
daily_trader_metrics['next_day_profitable'] = (
    daily_trader_metrics['next_day_pnl'] > 0
).astype(int)

# Drop last day per trader (no next-day label)
model_df = daily_trader_metrics.dropna()


In [125]:
model_df

Unnamed: 0,date,Account,daily_pnl,win_rate,avg_trade_size_usd,trades_per_day,long_ratio,classification,next_day_pnl,next_day_profitable
511,2024-11-11,0x083384f897ee0f19899168e3b1bec365f52a9012,0.000000,0.000000,5089.718249,177,0.000000,Extreme Greed,0.000000,0
534,2024-11-17,0x083384f897ee0f19899168e3b1bec365f52a9012,0.000000,0.000000,7976.664412,68,0.000000,Extreme Greed,0.000000,0
538,2024-11-18,0x083384f897ee0f19899168e3b1bec365f52a9012,0.000000,0.000000,23734.500000,40,0.000000,Extreme Greed,-21227.000000,0
560,2024-11-22,0x083384f897ee0f19899168e3b1bec365f52a9012,-21227.000000,0.000000,28186.666667,12,1.000000,Extreme Greed,1603.100000,1
575,2024-11-26,0x083384f897ee0f19899168e3b1bec365f52a9012,1603.100000,0.444444,17248.148148,27,0.444444,Extreme Greed,-132271.000000,0
...,...,...,...,...,...,...,...,...,...,...
2296,2025-04-26,0xbee1707d6b44d4d52bfe19e41f8a828645437aab,18.390524,0.068966,1771.744828,29,1.000000,Greed,1709.194807,1
2305,2025-04-27,0xbee1707d6b44d4d52bfe19e41f8a828645437aab,1709.194807,0.390244,2153.859390,82,0.609756,Greed,4008.588908,1
2313,2025-04-28,0xbee1707d6b44d4d52bfe19e41f8a828645437aab,4008.588908,0.393023,1495.538419,430,0.381395,Neutral,8561.771838,1
2320,2025-04-29,0xbee1707d6b44d4d52bfe19e41f8a828645437aab,8561.771838,0.441242,1939.739989,902,0.478936,Greed,2520.773814,1


In [156]:

le = LabelEncoder()
model_df['sentiment_encoded'] = le.fit_transform(
    model_df['classification']
)

features = [
    'win_rate',
    'avg_trade_size_usd',
    'trades_per_day',
    'long_ratio',
    'sentiment_encoded'
]

X = model_df[features]
y = model_df['next_day_profitable']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['sentiment_encoded'] = le.fit_transform(


In [179]:
scaler.feature_names_in_

array(['win_rate', 'avg_trade_size_usd', 'trades_per_day', 'long_ratio',
       'sentiment_encoded'], dtype=object)

In [158]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [160]:
models={
    'Logistic Regression':LogisticRegression(),
    'SVC':SVC(),
    'DecisionTree Classifier':DecisionTreeClassifier(),
    'RandomForest Classifier':RandomForestClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    # "XGB Classifier":XGBClassifier(),
    # "CatBoost Classifier":CatBoostClassifier()
}

In [161]:
param = {

    "Logistic Regression": {
        "C": [0.01, 0.1, 1, 10],
        "solver": ["liblinear", "lbfgs"],
        "penalty": ["l2"],
        "max_iter": [1000]
    },

    "SVC": {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"],
        "gamma": ["scale", "auto"],
        "probability": [True]
    },

    "DecisionTree Classifier": {
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 5],
        "criterion": ["gini", "entropy"]
    },

    "RandomForest Classifier": {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    },

    "AdaBoostClassifier": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 1.0]
    },

    # "XGB Classifier" : {
    #     "n_estimators": [100, 200, 300],
    #     "max_depth": [3, 5, 7],
    #     "learning_rate": [0.01, 0.05, 0.1],
    #     "subsample": [0.7, 0.8, 1.0],
    #     "colsample_bytree": [0.7, 0.8, 1.0],
    #     "gamma": [0, 0.1, 0.3],
    #     "reg_alpha": [0, 0.1, 1],
    #     "reg_lambda": [1, 1.5, 2]
    # },
    # "CatBoost Classifier":{
    #     "iterations": [200, 400, 600],
    #     "depth": [4, 6, 8],
    #     "learning_rate": [0.03, 0.05, 0.1],
    #     "l2_leaf_reg": [3, 5, 7],
    #     "bagging_temperature": [0, 1, 3],
    #     "border_count": [32, 64, 128]
    # }
}


In [162]:


def eval_metrics(y_true, y_pred, y_prob=None):
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred)
    }

    # ROC-AUC only if probability scores are available
    if y_prob is not None:
        metrics["roc_auc"] = roc_auc_score(y_true, y_prob)

    return metrics


In [163]:
def evalute_model(X_train, X_test, y_train, y_test, models,param):
    report = {}

    for model_name, model in models.items():
        # Train
       
        gs=GridSearchCV(model,param_grid=param[model_name],cv=5,verbose=1,n_jobs=-1)
        gs.fit(X_train,y_train)
        
        best_model = gs.best_estimator_

        # Predictions
        y_pred = best_model.predict(X_test)

        # Probabilities (if supported)
        if hasattr(best_model, "predict_proba"):
            y_prob = best_model.predict_proba(X_test)[:, 1]
        else:
            y_prob = None

        # Evaluate metrics
        report[model_name] = {
            "best_params": gs.best_params_,
            "metrics": eval_metrics(y_test, y_pred, y_prob)
        }

    return report


In [164]:
model_report=evalute_model(X_train,X_test,y_train,y_test,models,param)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [165]:
model_report

{'Logistic Regression': {'best_params': {'C': 0.1,
   'max_iter': 1000,
   'penalty': 'l2',
   'solver': 'lbfgs'},
  'metrics': {'accuracy': 0.6774891774891775,
   'precision': 0.6838046272493573,
   'recall': 0.910958904109589,
   'f1_score': 0.7812041116005873,
   'roc_auc': 0.6311442385173247}},
 'SVC': {'best_params': {'C': 1,
   'gamma': 'scale',
   'kernel': 'rbf',
   'probability': True},
  'metrics': {'accuracy': 0.6861471861471862,
   'precision': 0.6860759493670886,
   'recall': 0.928082191780822,
   'f1_score': 0.7889374090247453,
   'roc_auc': 0.6553585817888798}},
 'DecisionTree Classifier': {'best_params': {'criterion': 'entropy',
   'max_depth': 5,
   'min_samples_leaf': 1,
   'min_samples_split': 2},
  'metrics': {'accuracy': 0.6731601731601732,
   'precision': 0.6900269541778976,
   'recall': 0.8767123287671232,
   'f1_score': 0.7722473604826546,
   'roc_auc': 0.6844681708299758}},
 'RandomForest Classifier': {'best_params': {'max_depth': 10,
   'min_samples_leaf': 2,


In [166]:
def select_best_model(results):
    """
    Selects the best model based on maximum F1-score
    """
    best_model_name = None
    best_f1 = -1
    best_model_details = None

    for model_name, model_info in results.items():
        f1 = model_info["metrics"]["f1_score"]

        if f1 > best_f1:
            best_f1 = f1
            best_model_name = model_name
            best_model_details = model_info

    return best_model_name, best_model_details


In [167]:
best_model_name, best_model_info = select_best_model(model_report)
best_model_info,best_model_name

({'best_params': {'learning_rate': 0.01, 'n_estimators': 200},
  'metrics': {'accuracy': 0.6883116883116883,
   'precision': 0.6859296482412061,
   'recall': 0.934931506849315,
   'f1_score': 0.7913043478260869,
   'roc_auc': 0.7073025785656728}},
 'AdaBoostClassifier')

In [168]:
parameter={

        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 1.0]
    }   



In [169]:
gs=GridSearchCV(AdaBoostClassifier(),param_grid=parameter,cv=5,n_jobs=-1,verbose=4)

In [170]:
model=gs.fit(X_train,y_train)

model.best_estimator_


Fitting 5 folds for each of 9 candidates, totalling 45 fits


0,1,2
,estimator,
,n_estimators,200
,learning_rate,0.01
,algorithm,'deprecated'
,random_state,


In [171]:
model=AdaBoostClassifier(**gs.best_params_)

In [172]:
model.fit(X_train,y_train)

0,1,2
,estimator,
,n_estimators,200
,learning_rate,0.01
,algorithm,'deprecated'
,random_state,


In [173]:
y_pred=model.predict(X_test)

In [174]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[ 45 125]
 [ 19 273]]
              precision    recall  f1-score   support

           0       0.70      0.26      0.38       170
           1       0.69      0.93      0.79       292

    accuracy                           0.69       462
   macro avg       0.69      0.60      0.59       462
weighted avg       0.69      0.69      0.64       462

0.6883116883116883


In [175]:
final_model = models[best_model_name].set_params( **best_model_info["best_params"] )

In [176]:
final_model.fit(X_train, y_train)

0,1,2
,estimator,
,n_estimators,200
,learning_rate,0.01
,algorithm,'deprecated'
,random_state,


In [177]:
y_pred=final_model.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[ 45 125]
 [ 19 273]]
              precision    recall  f1-score   support

           0       0.70      0.26      0.38       170
           1       0.69      0.93      0.79       292

    accuracy                           0.69       462
   macro avg       0.69      0.60      0.59       462
weighted avg       0.69      0.69      0.64       462

0.6883116883116883


In [178]:
import joblib

joblib.dump(final_model, "models/model.pkl")
joblib.dump(scaler, "models/scaler_model.pkl")
joblib.dump(le, "models/label_en.pkl")


['models/label_en.pkl']

In [155]:
importance = pd.DataFrame({
    'feature': features,
    'coefficient': final_model.coef_[0]
}).sort_values(by='coefficient', ascending=False)

print(importance)

AttributeError: 'AdaBoostClassifier' object has no attribute 'coef_'