In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import joblib

In [2]:
# Path of the training CSV that the loading cell reads; point it at your own file.
CSV_FILE = "PCA_train.csv"
# Column of that CSV used as the prediction target (split off as `y`).
TARGET_COLUMN = "label"

In [3]:
# Load the training table named in the configuration cell.
df = pd.read_csv(CSV_FILE)

# Fail early with an actionable message rather than a bare KeyError from drop().
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in {CSV_FILE!r}; "
        f"available columns: {list(df.columns)}"
    )

# Integer-encode every text-like column (the target included, when it is text)
# and keep the fitted encoder per column so the codes can be mapped back later.
label_encoders = {}
for col in df.columns:
    col_dtype = df[col].dtype
    # Comparing the dtype to 'object' alone misses pandas' dedicated string
    # dtypes, so such columns would be left unencoded. Check object, string and
    # categorical dtypes explicitly instead.
    is_text_like = (
        pd.api.types.is_object_dtype(col_dtype)
        or pd.api.types.is_string_dtype(col_dtype)
        or isinstance(col_dtype, pd.CategoricalDtype)
    )
    if not is_text_like:
        continue

    le = LabelEncoder()
    # Stringify value by value so a NaN entry always becomes the literal 'nan'
    # class, independent of how the installed pandas implements astype(str).
    values_as_text = [str(value) for value in df[col].to_numpy(dtype=object)]
    df[col] = le.fit_transform(values_as_text)
    label_encoders[col] = le

# Features are every column except the target; the target is kept as a Series.
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

In [4]:
# Three base learners, each seeded with the same fixed random_state.
clf1 = RandomForestClassifier(random_state=42, n_estimators=100)
clf2 = GradientBoostingClassifier(random_state=42, n_estimators=100)
clf3 = LogisticRegression(random_state=42, max_iter=2000)

# Combine the learners in a single soft-voting classifier under short names.
base_learners = [('rf', clf1), ('gb', clf2), ('lr', clf3)]
ensemble = VotingClassifier(estimators=base_learners, voting='soft')

In [5]:
# Two-step imblearn pipeline: SMOTE oversampling first, then the voting ensemble.
pipeline_steps = [
    ('smote', SMOTE(random_state=42)),
    ('ensemble', ensemble),
]
pipeline = Pipeline(steps=pipeline_steps)


In [6]:
# Stratified, shuffled 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Accuracy of the whole pipeline on each fold; n_jobs=-1 is passed through to
# scikit-learn to control parallelism.
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Report the per-fold values and their mean.
print(f'5-Fold CV Accuracies: {scores}')
print(f'Average CV Accuracy: {scores.mean():.4f}')

5-Fold CV Accuracies: [0.82727273 0.86520947 0.84699454 0.83060109 0.8579235 ]
Average CV Accuracy: 0.8456


In [7]:
# Fit the full pipeline (SMOTE + voting ensemble) on all rows and persist it.
pipeline.fit(X, y)
joblib.dump(pipeline, 'pca_model.joblib')
print("Model saved as 'pca_model.joblib'")

# Text columns were replaced by integer codes from `label_encoders` before
# training. Save those encoders next to the model so new data can be encoded
# the same way and predicted codes can be mapped back to the original labels.
# Skipped when no column needed encoding.
if label_encoders:
    joblib.dump(label_encoders, 'pca_label_encoders.joblib')
    print("Label encoders saved as 'pca_label_encoders.joblib'")

Model saved as 'pca_model.joblib'
