In [3]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# ---------------------------------------------------------------------------
# Fit a multi-output random-forest pipeline on the fund table and persist it.
#   reads : data.csv   (current working directory)
#   writes: model.pkl  (fitted preprocessing + estimator pipeline)
# ---------------------------------------------------------------------------

# Numeric columns fed to the model, and the columns it is asked to predict.
features = [
    'min_sip', 'min_lumpsum', 'expense_ratio', 'fund_size_cr', 'fund_age_yr',
    'sortino', 'alpha', 'sd', 'sharpe', 'beta',
]
targets = ['category', 'sub_category', 'scheme_name', 'amc_name']

# Load the raw table and print the per-column count of missing values.
data = pd.read_csv('data.csv')
print(data.isna().sum())

# Overwrite every target column with integer codes. One fitted encoder is kept
# per column (keyed by column name) so codes can be mapped back to labels.
# NOTE(review): label_encoders is not written to disk in this cell; anything
# that loads model.pkl needs these encoders (or a re-fit on the same data) to
# interpret the outputs -- confirm whether they are saved elsewhere.
label_encoders = {column: LabelEncoder() for column in targets}
for target, encoder in label_encoders.items():
    data[target] = encoder.fit_transform(data[target])

# Input matrix / target matrix, then an 80/20 split with a fixed seed so the
# hold-out set is the same on every run.
X, y = data[features], data[targets]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing: mean-impute missing values in all feature columns.
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='mean'))])
preprocessor = ColumnTransformer([('num', numeric_transformer, features)])

# Full model: preprocessing followed by a seeded random-forest regressor that
# is fitted on all four encoded target columns at once.
# NOTE(review): the targets are label-encoded categories, so their integer
# codes have no numeric order; regressing on them (and scoring with MSE)
# treats that arbitrary order as meaningful. A classifier is probably the
# intended estimator -- confirm, and reconsider high-cardinality targets such
# as scheme_name before switching.
classifier = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor(random_state=42)),
])
classifier.fit(X_train, y_train)

# Score on the hold-out rows. With default arguments mean_squared_error
# returns a single value averaged over all target columns.
y_prediction = classifier.predict(X_test)
mse = mean_squared_error(y_test, y_prediction)
print("Mean Squared Error:", mse)

# Persist the fitted pipeline (kept as the last expression so the notebook
# still displays the list of written file names).
joblib.dump(classifier, 'model.pkl')


scheme_name      0
min_sip          0
min_lumpsum      0
expense_ratio    0
fund_size_cr     0
fund_age_yr      0
fund_manager     0
sortino          0
alpha            0
sd               0
beta             0
sharpe           0
risk_level       0
amc_name         0
rating           0
category         0
sub_category     0
returns_1yr      0
returns_3yr      0
returns_5yr      0
dtype: int64
Mean Squared Error: 13093.635777300615


['model.pkl']