In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import f1_score,precision_recall_fscore_support

In [2]:
# Silence every warning category for the rest of the session (this also hides
# deprecation and undefined-metric warnings raised by the cells that follow).
warnings.filterwarnings(action='ignore')

# Function to classify return
def classify_return(return_value):
    """
    Map a percentage return onto one of five labelled buckets.

    Parameters:
        return_value (float): The percentage return to classify.

    Returns:
        str | float: The bucket label, or NaN when ``return_value`` is missing:
             - 'Above +5'   if value > 5
             - '+2.5 to +5' if 2.5 < value <= 5
             - '+0 to +2.5' if 0 <= value <= 2.5
             - '0 to -5'    if -5 <= value < 0
             - 'Below -5'   if value < -5
    """
    # A missing return fails every comparison below and would otherwise fall
    # through to 'Below -5'; propagate it as NaN so callers can drop it.
    if pd.isna(return_value):
        return np.nan
    if return_value > 5:
        return 'Above +5'
    if return_value > 2.5:
        return '+2.5 to +5'
    if return_value >= 0:
        return '+0 to +2.5'
    if return_value >= -5:
        return '0 to -5'
    return 'Below -5'


In [3]:
def process_group(old_data, price_column='پایانی*سهم*', test_size=0.2,
                  random_state=None, importance_threshold=0.8):
    """
    Train a random forest on return classes and report its hold-out scores.

    The percentage change of ``price_column`` is bucketed with
    ``classify_return`` to form the target; every numeric column of
    ``old_data`` (except an existing 'Return_Class' column) is used as a
    feature. ``old_data`` itself is not modified.

    Parameters:
        old_data (pd.DataFrame): Rows to train and evaluate on.
        price_column (str): Column whose percentage change defines the target.
        test_size (float | int): Passed to ``train_test_split``.
        random_state (int | None): Seed for the split and the forest; ``None``
            keeps the runs non-deterministic.
        importance_threshold (float): Cumulative feature importance at which
            feature selection stops.

    Returns:
        tuple: ``(accuracy, precision, recall, f1, selected_features)`` where
        precision/recall/f1 are weighted averages and ``selected_features`` is
        the list of column names, most important first, whose cumulative
        importance first reaches ``importance_threshold``.

    Raises:
        ValueError: If fewer than two rows have a defined return.
    """
    # NOTE(review): the features include `price_column` itself and the split
    # below is shuffled, so for time-ordered rows the scores are likely
    # optimistic -- confirm this is the intended evaluation.

    # The first row has no predecessor (NaN return) and a zero previous price
    # gives +/-inf; such rows have no meaningful class, so leave them out
    # before labelling instead of letting them land in a bucket.
    returns = old_data[price_column].pct_change() * 100
    has_return = np.isfinite(returns)
    labelled = old_data.loc[has_return]
    if len(labelled) < 2:
        raise ValueError('process_group needs at least two rows with a defined return')

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(returns[has_return].apply(classify_return))

    # Every numeric column is a feature; a pre-existing target column is not.
    numeric_columns = (
        labelled.select_dtypes(include=[np.number])
        .drop(columns=['Return_Class'], errors='ignore')
        .columns.tolist()
    )
    features = labelled[numeric_columns]

    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only so that no statistic of the
    # held-out rows reaches the model.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 yields the same scores as the default but without
    # emitting a warning for classes that are never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='weighted', zero_division=0
    )

    # Walk the features from most to least important until their summed
    # importance reaches the threshold.
    feature_importance = model.feature_importances_
    cumulative_importance = 0.0
    selected_features = []
    for idx in np.argsort(feature_importance)[::-1]:
        cumulative_importance += feature_importance[idx]
        selected_features.append(numeric_columns[idx])
        if cumulative_importance >= importance_threshold:
            break

    return accuracy, precision, recall, f1, selected_features


In [14]:
# Summary statistics for the columns of `data`.
# NOTE(review): no visible cell creates `data`; this relies on state from a
# cell that is not shown -- confirm a load cell exists for Restart & Run All.
data.describe()

Unnamed: 0,Date,حجم معاملات اوراق با درآمد ثابت بلوکی-بورس*بورس*,ارزش معاملات اوراق با درآمد ثابت بلوکی-بورس*بورس*,قیمت به درآمد-بورس*بورس*,ارزش بازار (دلار آزاد)-بورس*بورس*,درصد شناوری-بورس*بورس*,حجم معاملات سهام بلوکی-بورس*بورس*,سود خالص-آخرین فصل-بورس*بورس*,قیمت به سود -بورس*بورس*,بازده دارایی ها-بورس*بورس*,...,Kurtosis,Skewness,ROC20,Volume Momentum 20,TEMA20,Price Momentum 1M,PLRC12,Historical Volatility,Realized_Volatility,ATR14
count,3653,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,...,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0
mean,2018-08-31 20:10:11.004653824,438447.0,430557200000.0,2.281638,122230900000.0,0.186301,666339600.0,529651100000000.0,9.633698,0.091666,...,4.713439,0.098774,1.1336,481.480962,22093.879414,2220197.0,1200.416,0.062446,0.060827,914.102303
min,2013-09-01 00:00:00,0.0,0.0,1.096152,37352480000.0,0.1183,0.0,0.0,4.774093,0.026836,...,-3.0,-4.472136,-55.462773,-99.953354,0.0,0.0,-2698826.0,0.0,0.0,0.0
25%,2016-03-02 00:00:00,0.0,0.0,1.429423,85822430000.0,0.1604,34047000.0,106621100000000.0,7.222868,0.048358,...,0.937384,-0.978987,-3.230222,-60.477409,9371.750108,939900.0,-18022.9,0.024641,0.0248,252.142857
50%,2018-09-01 00:00:00,0.0,0.0,1.811765,101295200000.0,0.1717,142004600.0,162344100000000.0,8.128797,0.082551,...,2.942407,0.244355,0.221877,-3.466718,13762.069578,1389200.0,12176.03,0.044846,0.045092,529.214286
75%,2021-03-02 00:00:00,200000.0,200000000000.0,2.441286,157576600000.0,0.2088,429148700.0,962233900000000.0,10.0224,0.137861,...,6.916011,1.423527,4.878049,136.251709,37693.748442,3748900.0,71456.33,0.08341,0.08408,1322.142857
max,2023-08-31 00:00:00,26179000.0,25697810000000.0,9.78898,368146400000.0,0.2564,59132800000.0,2944121000000000.0,41.985589,0.171911,...,20.0,4.472136,51.25801,402177.037037,64767.80834,6564900.0,2538421.0,0.655362,0.495126,11193.0
std,,1293865.0,1262560000000.0,1.432441,55259450000.0,0.032111,2467628000.0,641021900000000.0,5.548645,0.043913,...,5.180535,1.896463,10.643188,7606.678763,16232.018587,1612738.0,438088.8,0.065931,0.056186,977.356056


In [15]:
# Expanding-window evaluation: for each calendar month, retrain on every row
# up to and including that month and store the resulting scores per period.
final_result = {}
old_data = None
for (year, month), new_data in data.groupby([data['Date'].dt.year, data['Date'].dt.month]):
    # groupby yields the months in ascending (year, month) order, so appending
    # each month's rows to the running history gives exactly "all rows up to
    # and including this month" without re-scanning `data` with masks.
    old_data = new_data if old_data is None else pd.concat([old_data, new_data])

    print(f'wonk for period : {year}-{month} ...')
    # Hand over a copy so the accumulated history is never altered by the callee.
    accuracy, precision, recall, f1, selected_features = process_group(old_data.copy())

    final_result[f'{year}-{month}'] = {'Accuracy': accuracy,
                                        'Precision': precision,
                                        'Recall' : recall,
                                        'F1': f1,
                                        'Selected Features': selected_features
                                        }
    print(f'Accuracy: {accuracy * 100:.2f}%')
# One column per period, one row per metric.
fff = pd.DataFrame(final_result)


wonk for period : 2013-9 ...
Accuracy: 66.67%
wonk for period : 2013-10 ...
Accuracy: 92.31%
wonk for period : 2013-11 ...
Accuracy: 94.74%
wonk for period : 2013-12 ...
Accuracy: 92.00%
wonk for period : 2014-1 ...
Accuracy: 87.10%
wonk for period : 2014-2 ...
Accuracy: 86.49%
wonk for period : 2014-3 ...
Accuracy: 88.37%
wonk for period : 2014-4 ...
Accuracy: 87.76%
wonk for period : 2014-5 ...
Accuracy: 87.27%
wonk for period : 2014-6 ...
Accuracy: 80.33%
wonk for period : 2014-7 ...
Accuracy: 89.55%
wonk for period : 2014-8 ...
Accuracy: 87.67%
wonk for period : 2014-9 ...
Accuracy: 86.08%
wonk for period : 2014-10 ...
Accuracy: 93.02%
wonk for period : 2014-11 ...
Accuracy: 92.39%
wonk for period : 2014-12 ...
Accuracy: 86.73%
wonk for period : 2015-1 ...
Accuracy: 85.58%
wonk for period : 2015-2 ...
Accuracy: 87.27%
wonk for period : 2015-3 ...
Accuracy: 87.07%
wonk for period : 2015-4 ...
Accuracy: 95.90%
wonk for period : 2015-5 ...
Accuracy: 88.28%
wonk for period : 2015-6 ...

KeyboardInterrupt: 

In [6]:
# Summary statistics for `data`; in the recorded output the mean and max of
# 'ROC20' and 'Volume Momentum 20' are inf.
data.describe()

Unnamed: 0,Date,حجم معاملات اوراق با درآمد ثابت بلوکی-بورس*بورس*,ارزش معاملات اوراق با درآمد ثابت بلوکی-بورس*بورس*,قیمت به درآمد-بورس*بورس*,ارزش بازار (دلار آزاد)-بورس*بورس*,درصد شناوری-بورس*بورس*,حجم معاملات سهام بلوکی-بورس*بورس*,سود خالص-آخرین فصل-بورس*بورس*,قیمت به سود -بورس*بورس*,بازده دارایی ها-بورس*بورس*,...,Kurtosis,Skewness,ROC20,Volume Momentum 20,TEMA20,Price Momentum 1M,PLRC12,Historical Volatility,Realized_Volatility,ATR14
count,3653,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,...,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0,3653.0
mean,2018-08-31 20:10:11.004653824,438447.0,430557200000.0,2.281638,122230900000.0,0.186301,666339600.0,529651100000000.0,9.633698,0.091666,...,4.713439,0.098774,inf,inf,22093.879414,2220197.0,1200.416,0.062446,0.060827,914.102303
min,2013-09-01 00:00:00,0.0,0.0,1.096152,37352480000.0,0.1183,0.0,0.0,4.774093,0.026836,...,-3.0,-4.472136,-55.462773,-99.953354,0.0,0.0,-2698826.0,0.0,0.0,0.0
25%,2016-03-02 00:00:00,0.0,0.0,1.429423,85822430000.0,0.1604,34047000.0,106621100000000.0,7.222868,0.048358,...,0.937384,-0.978987,-3.230222,-60.477409,9371.750108,939900.0,-18022.9,0.024641,0.0248,252.142857
50%,2018-09-01 00:00:00,0.0,0.0,1.811765,101295200000.0,0.1717,142004600.0,162344100000000.0,8.128797,0.082551,...,2.942407,0.244355,0.294365,-3.466718,13762.069578,1389200.0,12176.03,0.044846,0.045092,529.214286
75%,2021-03-02 00:00:00,200000.0,200000000000.0,2.441286,157576600000.0,0.2088,429148700.0,962233900000000.0,10.0224,0.137861,...,6.916011,1.423527,5.085475,138.873538,37693.748442,3748900.0,71456.33,0.08341,0.08408,1322.142857
max,2023-08-31 00:00:00,26179000.0,25697810000000.0,9.78898,368146400000.0,0.2564,59132800000.0,2944121000000000.0,41.985589,0.171911,...,20.0,4.472136,inf,inf,64767.80834,6564900.0,2538421.0,0.655362,0.495126,11193.0
std,,1293865.0,1262560000000.0,1.432441,55259450000.0,0.032111,2467628000.0,641021900000000.0,5.548645,0.043913,...,5.180535,1.896463,,,16232.018587,1612738.0,438088.8,0.065931,0.056186,977.356056


In [7]:
# Preview the first rows of `data`.
data.head()

Unnamed: 0,Date,تاریخ شمسی,حجم معاملات اوراق با درآمد ثابت بلوکی-بورس*بورس*,ارزش معاملات اوراق با درآمد ثابت بلوکی-بورس*بورس*,قیمت به درآمد-بورس*بورس*,ارزش بازار (دلار آزاد)-بورس*بورس*,درصد شناوری-بورس*بورس*,حجم معاملات سهام بلوکی-بورس*بورس*,سود خالص-آخرین فصل-بورس*بورس*,قیمت به سود -بورس*بورس*,...,Kurtosis,Skewness,ROC20,Volume Momentum 20,TEMA20,Price Momentum 1M,PLRC12,Historical Volatility,Realized_Volatility,ATR14
0,2013-09-01,1392/6/10,0,0,1.758179,89707530000.0,0.200076,0,140964434000000,6.740156,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
1,2013-09-02,1392/6/11,0,0,1.758179,89707530000.0,0.200076,0,140964434000000,6.740156,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
2,2013-09-03,1392/6/12,0,0,1.75582,88743630000.0,0.186337,0,140924110000000,6.729593,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
3,2013-09-04,1392/6/13,0,0,1.731691,86590310000.0,0.18737,0,140924110000000,6.636411,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
4,2013-09-05,1392/6/14,0,0,1.731691,86590310000.0,0.18737,0,140924110000000,6.636411,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0


In [9]:
import pandas as pd
import numpy as np

# Scan the numeric columns of `data` and report each one that holds NaN or
# +/-infinity values.
numeric_cols = data.select_dtypes(include=np.number).columns
for col in numeric_cols:
    column_values = data[col]
    has_nan = column_values.isnull().values.any()
    has_inf = np.isinf(column_values).any()
    if has_nan:
        print(f"Column '{col}' contains NaN values")
    if has_inf:
        print(f"Column '{col}' contains infinity values")



Column 'ROC20' contains infinity values
Column 'Volume Momentum 20' contains infinity values


In [10]:
# Turn +/-inf into NaN, then carry the last valid observation forward.
# DataFrame.ffill replaces fillna(method='ffill'), which is deprecated and no
# longer accepted by newer pandas releases.
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.ffill(inplace=True)

In [11]:
import pandas as pd
import numpy as np

# Repeat the scan of the numeric columns of `data`: print one line for each
# column that holds NaN values and one for each that holds +/-infinity.
numeric_cols = data.select_dtypes(include=np.number).columns
for col in numeric_cols:
    column_values = data[col]
    has_nan = column_values.isnull().values.any()
    has_inf = np.isinf(column_values).any()
    if has_nan:
        print(f"Column '{col}' contains NaN values")
    if has_inf:
        print(f"Column '{col}' contains infinity values")

In [17]:
# Check for NaN or infinity values in each numeric column
def check_for_nan_inf(data):
    """
    Print a line for every numeric column that holds NaN or infinite values.

    Parameters:
        data (pd.DataFrame): Frame to inspect; non-numeric columns are skipped.

    Returns:
        None. Findings are only printed; a clean frame prints nothing.
    """
    numeric_frame = data.select_dtypes(include=np.number)
    for col in numeric_frame.columns:
        series = data[col]
        has_nan = series.isnull().values.any()
        has_inf = np.isinf(series).any()
        if has_nan:
            print(f"Column '{col}' contains NaN values")
        if has_inf:
            print(f"Column '{col}' contains infinity values")

In [18]:
# Load one workbook from the input folder into `data2`.
data2 = pd.read_excel('./FINNAL_DATA/زاگرس.xlsx')

In [19]:
# Report the numeric columns of `data2` that contain NaN or infinite values.
check_for_nan_inf(data2)

Column 'Sharp Ratio 20' contains infinity values


In [23]:
def fix_inf_values(data2) :
    """
    Replace infinite and missing values in a frame, modifying it in place.

    +/-inf become NaN, NaN is filled with the previous row's value, and any
    NaN still left (rows with no earlier valid value) is set to 0.

    Parameters:
        data2 (pd.DataFrame): Frame to clean; it is mutated in place.

    Returns:
        pd.DataFrame: The same object that was passed in, after cleaning.
    """
    data2.replace([np.inf, -np.inf], np.nan, inplace=True)
    # DataFrame.ffill replaces fillna(method='ffill'), which is deprecated and
    # no longer accepted by newer pandas releases.
    data2.ffill(inplace=True)
    # Leading NaN have nothing to forward-fill from; default them to 0.
    data2.fillna(0, inplace=True)
    return data2

In [24]:
# Re-run the NaN/inf report on `data2`; the recorded run printed nothing.
# NOTE(review): no visible cell applies fix_inf_values to `data2` (execution
# counts 20-22 are not shown), so the clean result depends on hidden state.
check_for_nan_inf(data2)

In [25]:
# Clean every workbook in the input folder and write the result, under the
# same file name, to the output folder.
folder_path = './FINNAL_DATA'
output_path = './FINNAL_DATA2'
# to_excel does not create missing directories, so make sure the target exists.
os.makedirs(output_path, exist_ok=True)
file_list = sorted(os.listdir(folder_path))
for file in file_list:
    df_path = os.path.join(folder_path, file)
    # Skip sub-directories, Excel lock files ("~$...") and anything that is
    # not a workbook; read_excel would fail on them and abort the whole run.
    if (
        not os.path.isfile(df_path)
        or file.startswith('~$')
        or not file.lower().endswith(('.xlsx', '.xlsm', '.xls'))
    ):
        continue
    finnal_df = fix_inf_values(pd.read_excel(df_path))
    finnal_df.to_excel(os.path.join(output_path, file), index=False)

In [26]:
# Read one workbook back from the cleaned-output folder into `ppp`
# (presumably written by the batch clean-up loop -- confirm the file exists).
ppp = pd.read_excel('./FINNAL_DATA2/خودرو.xlsx')

In [28]:
# Report numeric columns of the reloaded frame that still contain NaN or
# infinite values; nothing is printed when there are none.
check_for_nan_inf(ppp)

In [30]:
# Total number of missing cells across all columns of `ppp`, numeric or not.
ppp.isna().sum().sum()

0