<a href="https://colab.research.google.com/github/anirbanghoshsbi/.github.io/blob/master/yeappppppy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Load the data
df = pd.read_csv('stock_weekly_returns.csv')
df=df.dropna()
# Preprocess the data
df['date'] = pd.to_datetime(df['date'])
df['outperform'] = (df['weekly_return'] > 0).astype(int)
# Encode ticker as numeric
label_encoder = LabelEncoder()
df['ticker_encoded'] = label_encoder.fit_transform(df['ticker'])

# Select features for the model
features = ['ticker_encoded','dollar_volume', 'adj close', 'atr', 'bb_high', 'bb_low', 'bb_mid',
            'garman_klass_vol', 'macd', 'rsi', 'return_1m', 'return_2m',
            'return_3m', 'return_6m', 'return_9m', 'return_12m']

X = df[features]
y = df['outperform']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test_scaled)

# Get probability estimates
y_prob = rf_classifier.predict_proba(X_test_scaled)

# Create a DataFrame with test data, predictions, and probabilities
results_df = pd.DataFrame({
    'actual': y_test,
    'predicted': y_pred,
    'prob_underperform': y_prob[:, 0],
    'prob_outperform': y_prob[:, 1],

})



# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': rf_classifier.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)
# Add original features to the results DataFrame
results_df = pd.concat([results_df, X_test.reset_index(drop=True)], axis=1)

# Function to predict for new data
def predict_performance(new_data):
    new_data_scaled = scaler.transform(new_data)
    prediction = rf_classifier.predict_proba(new_data_scaled)
    return prediction
# Function to predict and rank stocks


Confusion Matrix:
[[239 340]
 [244 427]]

Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.41      0.45       579
           1       0.56      0.64      0.59       671

    accuracy                           0.53      1250
   macro avg       0.53      0.52      0.52      1250
weighted avg       0.53      0.53      0.53      1250


Feature Importance:
             feature  importance
3                atr    0.076889
15        return_12m    0.073408
14         return_9m    0.070769
10         return_1m    0.070365
11         return_2m    0.069744
1      dollar_volume    0.069540
12         return_3m    0.069471
13         return_6m    0.068269
7   garman_klass_vol    0.067898
9                rsi    0.065739
8               macd    0.065369
5             bb_low    0.047971
4            bb_high    0.047876
2          adj close    0.047546
6             bb_mid    0.046419
0     ticker_encoded    0.042728


In [59]:
# Example usage with test data
outperform_ranked = predict_performance(X_test)
print("Top 10 stocks predicted to outperform, ranked by confidence:")
print(outperform_ranked)


Top 10 stocks predicted to outperform, ranked by confidence:
[[0.59 0.41]
 [0.41 0.59]
 [0.43 0.57]
 ...
 [0.37 0.63]
 [0.19 0.81]
 [0.41 0.59]]


In [60]:
results_df.dropna(inplace=True)

In [61]:
results_df['final_names']=label_encoder.inverse_transform(results_df['ticker_encoded'].astype(int))

In [62]:
results_df.head()

Unnamed: 0,actual,predicted,prob_underperform,prob_outperform,ticker_encoded,dollar_volume,adj close,atr,bb_high,bb_low,...,garman_klass_vol,macd,rsi,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m,final_names
611,1.0,1.0,0.41,0.59,43.0,4585.543232,103.137939,-1.470213,4.665219,4.604446,...,-0.001759,-0.121426,52.3173,0.006485,0.011707,0.003553,0.010417,0.001702,-0.000115,TATASTEEL.NS
476,1.0,1.0,0.29,0.71,44.0,9793.463234,3494.97876,-0.059875,8.176199,7.983771,...,-0.000656,2.458645,80.755006,0.045133,0.036404,0.039685,0.025698,0.010929,0.014312,TCS.NS
296,0.0,0.0,0.59,0.41,20.0,670.413266,2275.347412,-1.131612,7.812853,7.725985,...,-4.4e-05,-1.152993,30.837747,-0.017748,-0.026359,-0.020485,-0.014004,-0.016076,-0.011853,HEROMOTOCO.NS
351,0.0,1.0,0.29,0.71,18.0,11918.47636,1549.771851,-0.111646,7.408553,7.346865,...,-0.000507,-0.316099,42.026703,-0.040431,-0.020779,-0.014238,-0.001222,-0.000607,-0.000946,HDFCBANK.NS
80,0.0,0.0,0.5,0.5,36.0,1552.996438,150.519485,0.51473,5.067308,5.001979,...,-0.003662,-1.241449,47.224931,-0.006858,-0.000556,-0.00674,-0.00243,-0.003955,0.004945,POWERGRID.NS


In [63]:
results_df.to_csv('results.csv', index=False)

In [64]:
#results_df.sort_values(by='prob_outperform', ascending=False)[['final_names','date']].head(10)

In [65]:
df=pd.read_csv('/content/nifty_50_weekly_data_lat.csv')

In [66]:
import pandas as pd

# Assuming your dataframe is called 'df' and the date column is named 'date'
df['date'] = pd.to_datetime(df['date'])  # Convert to datetime if not already
live_data = df[df['date'] > '2023-11-01'].sort_values('date')

In [67]:
live_data['date'] = pd.to_datetime(live_data['date'])
#live_data['outperform'] = (df['weekly_return'] > 0).astype(int)
# Encode ticker as numeric
label_encoder = LabelEncoder()
live_data['ticker_encoded'] = label_encoder.fit_transform(live_data['ticker'])
model_data = live_data.drop(['ticker', 'date'], axis=1)

In [68]:
model_data = model_data.reindex(columns=features)

In [69]:
final_output=predict_performance(model_data)

In [70]:
final_output

array([[0.3 , 0.7 ],
       [0.49, 0.51],
       [0.32, 0.68],
       ...,
       [0.34, 0.66],
       [0.57, 0.43],
       [0.4 , 0.6 ]])

In [71]:
# Create a DataFrame with test data, predictions, and probabilities
results_live_findf = pd.DataFrame({
    'prob_underperform_live_dataset': final_output[:, 0],
    'prob_outperform_live_dataset': final_output[:, 1],
})

In [72]:
live_results = pd.concat([results_live_findf, live_data.reset_index(drop=True)], axis=1)

In [73]:
live_results['final_names']=label_encoder.inverse_transform(live_results['ticker_encoded'].astype(int))

In [74]:
live_results.tail()

Unnamed: 0,prob_underperform_live_dataset,prob_outperform_live_dataset,date,ticker,dollar_volume,adj close,atr,bb_high,bb_low,bb_mid,...,macd,rsi,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m,ticker_encoded,final_names
1545,0.37,0.63,2024-06-02,HDFCLIFE.NS,2947.169029,548.883362,-0.608634,6.360628,6.294372,6.3275,...,-0.662192,39.356375,-0.025394,-0.017203,0.001335,-0.014597,-0.015407,-0.01008,19,HDFCLIFE.NS
1546,0.45,0.55,2024-06-02,HEROMOTOCO.NS,2540.316606,5136.0,2.116908,8.600461,8.400306,8.500384,...,2.348703,64.501684,0.003458,0.004587,0.01737,0.033539,0.009375,0.008266,20,HEROMOTOCO.NS
1547,0.34,0.66,2024-06-02,HINDALCO.NS,9777.37679,695.150024,1.863935,6.567515,6.411884,6.4897,...,1.570485,66.793972,0.032529,0.029839,0.035764,0.020697,0.024261,0.021791,21,HINDALCO.NS
1548,0.57,0.43,2024-06-02,BRITANNIA.NS,1609.073331,5198.549805,1.821535,8.59893,8.484888,8.541909,...,1.362017,58.255785,-0.008043,0.010925,0.008593,0.018093,0.006337,0.005127,10,BRITANNIA.NS
1549,0.4,0.6,2024-06-02,WIPRO.NS,4899.042141,436.950012,-0.619205,6.156059,6.099268,6.127664,...,-0.521456,34.777793,-0.057587,-0.026434,-0.011115,-0.005903,-0.010409,-0.013682,49,WIPRO.NS


In [75]:
live_results.to_csv('live_results.csv', index=False)
