In [32]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [33]:
# Load data function
def load_data(category):
    data = pd.DataFrame()
    directory = f"./{category}/"
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            df = pd.read_csv(os.path.join(directory, file))
            data = pd.concat([data, df], axis=0)
    return data

# Load data for each category
large_cap_data = load_data("large_cap")
mid_cap_data = load_data("mid_cap")
small_cap_data = load_data("small_cap")

# Define features and target variable
features = ['Open', 'High', 'Low', 'Volume']
target = 'Close'

# Train and evaluate models
best_models = {}
for category, train_data in [("large_cap", large_cap_data), ("mid_cap", mid_cap_data), ("small_cap", small_cap_data)]:
    X_train, X_test, y_train, y_test = train_test_split(train_data[features], train_data[target], test_size=0.4, random_state=42)
    best_model_category = None
    best_mse = float('inf')
    for model_name, model in models.items():
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', model)
        ])
        print(f"Fitting model {model_name} for category {category}")
        grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=3, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)
        mse = -grid_search.best_score_  # GridSearchCV uses neg_mean_squared_error
        if mse < best_mse:
            best_mse = mse
            best_model_category = model_name
            best_models[category] = grid_search.best_estimator_
    print(f"Best model for {category}: {best_model_category} (MSE: {best_mse})")

# Evaluate models on test data
for category, model in best_models.items():
    X_test = test_data[features]
    y_test = test_data[target]
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"MSE for {category}: {mse}")


Fitting model Ridge for category large_cap
Fitting model RandomForest for category large_cap
Fitting model GradientBoosting for category large_cap
Best model for large_cap: Ridge (MSE: 515.2925068308336)
Fitting model Ridge for category mid_cap
Fitting model RandomForest for category mid_cap
Fitting model GradientBoosting for category mid_cap
Best model for mid_cap: Ridge (MSE: 2182.6444153483794)
Fitting model Ridge for category small_cap
Fitting model RandomForest for category small_cap
Fitting model GradientBoosting for category small_cap
Best model for small_cap: Ridge (MSE: 1254.4709329117445)
MSE for large_cap: 1164.812240943525
MSE for mid_cap: 1321.330825289709
MSE for small_cap: 1101.6917146382827


In [34]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv("small_cap/NBCC.NS_stock_data.csv")

# Now you can work with the data using pandas DataFrame operations
print(data.head())  # Display the first few rows of the data


         Date       Open       High        Low      Close  Adj Close    Volume
0  2017-01-02  80.349998  84.916664  80.349998  84.483330  78.137337  10800348
1  2017-01-03  85.933334  86.583336  84.000000  84.816666  78.445625   6187929
2  2017-01-04  85.483330  85.966667  81.699997  82.366669  76.179672   9914781
3  2017-01-05  82.833336  84.000000  80.133331  83.199997  76.950401   5573718
4  2017-01-06  83.316666  84.766663  83.216667  83.650002  77.366600   2417802
