<a href="https://colab.research.google.com/github/anirbanghoshsbi/.github.io/blob/master/work/stock/yeappppppy_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Load the data
df = pd.read_csv('stock_weekly_returns.csv')
df=df.dropna()
# Preprocess the data
df['date'] = pd.to_datetime(df['date'])
df['outperform'] = (df['weekly_return'] > 0).astype(int)
# Encode ticker as numeric
label_encoder = LabelEncoder()
df['ticker_encoded'] = label_encoder.fit_transform(df['ticker'])

# Select features for the model
features = ['ticker_encoded','dollar_volume', 'adj close', 'atr', 'bb_high', 'bb_low', 'bb_mid',
            'garman_klass_vol', 'macd', 'rsi', 'return_1m', 'return_2m',
            'return_3m', 'return_6m', 'return_9m', 'return_12m']

X = df[features]
y = df['outperform']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test_scaled)

# Get probability estimates
y_prob = rf_classifier.predict_proba(X_test_scaled)

# Create a DataFrame with test data, predictions, and probabilities
results_df = pd.DataFrame({
    'actual': y_test,
    'predicted': y_pred,
    'prob_underperform': y_prob[:, 0],
    'prob_outperform': y_prob[:, 1],

})



# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': rf_classifier.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)
# Add original features to the results DataFrame
results_df = pd.concat([results_df, X_test.reset_index(drop=True)], axis=1)

# Function to predict for new data
def predict_performance(new_data):
    new_data_scaled = scaler.transform(new_data)
    prediction = rf_classifier.predict_proba(new_data_scaled)
    return prediction
# Function to predict and rank stocks


Confusion Matrix:
[[129 189]
 [118 209]]

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.41      0.46       318
           1       0.53      0.64      0.58       327

    accuracy                           0.52       645
   macro avg       0.52      0.52      0.52       645
weighted avg       0.52      0.52      0.52       645


Feature Importance:
             feature  importance
3                atr    0.077616
15        return_12m    0.075650
14         return_9m    0.073477
10         return_1m    0.072165
11         return_2m    0.069896
12         return_3m    0.067243
1      dollar_volume    0.067157
13         return_6m    0.066174
7   garman_klass_vol    0.065962
8               macd    0.065389
9                rsi    0.064250
5             bb_low    0.049929
4            bb_high    0.049579
2          adj close    0.047265
6             bb_mid    0.046660
0     ticker_encoded    0.041589


In [10]:
import joblib


def save_model(model, scaler, label_encoder, filename='rf_model.joblib'):
    """
    Save the trained model, scaler, and label encoder to disk.

    Args:
    model (RandomForestClassifier): Trained Random Forest model
    scaler (StandardScaler): Fitted StandardScaler
    label_encoder (LabelEncoder): Fitted LabelEncoder
    filename (str): Name of the file to save the model (default: 'rf_model.joblib')
    """
    model_data = {
        'model': model,
        'scaler': scaler,
        'label_encoder': label_encoder
    }
    joblib.dump(model_data, filename)
    print(f"Model saved to {filename}")



In [11]:
save_model(rf_classifier, scaler, label_encoder)

Model saved to rf_model.joblib


In [None]:
# needed in predictions file (yet to be created)

'''
def load_model_and_predict(filename='rf_model.joblib'):
    """
    Load the model from disk and use it to make predictions.

    Args:
    filename (str): Name of the file containing the saved model (default: 'rf_model.joblib')

    Returns:
    function: A function that takes new data and returns predictions
    """
    model_data = joblib.load(filename)
    model = model_data['model']
    scaler = model_data['scaler']
    label_encoder = model_data['label_encoder']

    def predict(new_data):
        """
        Make predictions using the loaded model.

        Args:
        new_data (pd.DataFrame): New data to make predictions on

        Returns:
        np.array: Probability estimates for each class
        """
        # Ensure the new data has the same features as the training data
        required_features = ['ticker', 'dollar_volume', 'adj close', 'atr', 'bb_high', 'bb_low', 'bb_mid',
                             'garman_klass_vol', 'macd', 'rsi', 'return_1m', 'return_2m',
                             'return_3m', 'return_6m', 'return_9m', 'return_12m']

        if not all(feature in new_data.columns for feature in required_features):
            raise ValueError("New data does not contain all required features")

        # Encode the ticker
        new_data['ticker_encoded'] = label_encoder.transform(new_data['ticker'])

        # Select and order the features
        X = new_data[['ticker_encoded', 'dollar_volume', 'adj close', 'atr', 'bb_high', 'bb_low', 'bb_mid',
                      'garman_klass_vol', 'macd', 'rsi', 'return_1m', 'return_2m',
                      'return_3m', 'return_6m', 'return_9m', 'return_12m']]

        # Scale the features
        X_scaled = scaler.transform(X)

        # Make predictions
        predictions = model.predict_proba(X_scaled)

        return predictions

    print(f"Model loaded from {filename}")
    return predict

# Example usage:
# save_model(rf_classifier, scaler, label_encoder)
# predict_fn = load_model_and_predict()
# new_data = pd.DataFrame(...)  # Your new data here
# predictions = predict_fn(new_data)
'''

In [2]:
# Example usage with test data
outperform_ranked = predict_performance(X_test)
print("Top 10 stocks predicted to outperform, ranked by confidence:")
print(outperform_ranked)


Top 10 stocks predicted to outperform, ranked by confidence:
[[0.51 0.49]
 [0.52 0.48]
 [0.39 0.61]
 ...
 [0.23 0.77]
 [0.43 0.57]
 [0.45 0.55]]


In [3]:
results_df.dropna(inplace=True)

In [4]:
results_df['final_names']=label_encoder.inverse_transform(results_df['ticker_encoded'].astype(int))

In [5]:
results_df.head()

Unnamed: 0,actual,predicted,prob_underperform,prob_outperform,ticker_encoded,dollar_volume,adj close,atr,bb_high,bb_low,...,garman_klass_vol,macd,rsi,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m,final_names
346,0.0,0.0,0.52,0.48,36.0,2742.786011,158.556274,0.118887,5.075541,5.006788,...,-0.002633,0.346064,60.573719,0.046762,0.028319,0.012763,0.021649,0.011519,0.013826,POWERGRID.NS
393,1.0,1.0,0.4,0.6,17.0,1884.253881,862.075195,-1.736243,6.775482,6.735313,...,-0.006658,0.14043,57.43864,-0.011048,0.004808,0.000441,0.009445,0.010169,-0.004074,HCLTECH.NS
194,1.0,1.0,0.45,0.55,3.0,8360.270079,2859.040527,3.737271,8.129831,7.870972,...,4e-05,-2.181681,41.555046,0.070796,-0.030472,-0.034577,-0.00982,-0.02183,-0.008384,ASIANPAINT.NS
30,1.0,1.0,0.47,0.53,1.0,1901.757825,713.179871,-0.781763,6.643517,6.515207,...,9e-05,-0.479271,46.607396,-0.03232,0.008615,0.015651,-0.003144,-0.006092,-0.000109,ADANIPORTS.NS
102,0.0,1.0,0.43,0.57,27.0,3073.329832,662.193176,0.122516,6.616035,6.435828,...,-0.000312,-1.017297,46.692949,0.019779,0.004591,-0.026577,-0.006288,0.004288,-0.003906,JSWSTEEL.NS


In [6]:
results_df.to_csv('results.csv', index=False)

In [7]:
#results_df.sort_values(by='prob_outperform', ascending=False)[['final_names','date']].head(10)

Make Predictions

In [None]:
df=pd.read_csv('/content/nifty_50_weekly_data_lat.csv')

In [None]:
import pandas as pd

# Assuming your dataframe is called 'df' and the date column is named 'date'
df['date'] = pd.to_datetime(df['date'])  # Convert to datetime if not already
live_data = df[df['date'] > '2023-11-01'].sort_values('date')

In [None]:
live_data['date'] = pd.to_datetime(live_data['date'])
#live_data['outperform'] = (df['weekly_return'] > 0).astype(int)
# Encode ticker as numeric
label_encoder = LabelEncoder()
live_data['ticker_encoded'] = label_encoder.fit_transform(live_data['ticker'])
model_data = live_data.drop(['ticker', 'date'], axis=1)

In [None]:
model_data = model_data.reindex(columns=features)

In [None]:
final_output=predict_performance(model_data)

In [None]:
final_output

In [None]:
# Create a DataFrame with test data, predictions, and probabilities
results_live_findf = pd.DataFrame({
    'prob_underperform_live_dataset': final_output[:, 0],
    'prob_outperform_live_dataset': final_output[:, 1],
})

In [None]:
live_results = pd.concat([results_live_findf, live_data.reset_index(drop=True)], axis=1)

In [None]:
live_results['final_names']=label_encoder.inverse_transform(live_results['ticker_encoded'].astype(int))

In [None]:
live_results.tail()

In [None]:
live_results.to_csv('live_results.csv', index=False)