In [23]:
!pip install scikit-learn==1.3.2

Collecting scikit-learn==1.3.2
  Downloading scikit_learn-1.3.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.2-cp312-cp312-win_amd64.whl (9.1 MB)
   ---------------------------------------- 0.0/9.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.1 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.1 MB ? eta -:--:--
   -- ------------------------------------- 0.5/9.1 MB 1.2 MB/s eta 0:00:08
   ---- ----------------------------------- 1.0/9.1 MB 1.4 MB/s eta 0:00:06
   ----- ---------------------------------- 1.3/9.1 MB 1.4 MB/s eta 0:00:06
   -------- ------------------------------- 1.8/9.1 MB 1.6 MB/s eta 0:00:05
   ---------- ----------------------------- 2.4/9.1 MB 1.7 MB/s eta 0:00:04
   ------------ --------------------------- 2.9/9.1 MB 1.9 MB/s eta 0:00:04
   -------------- ------------------------- 3.4/9.1 MB 2.0 MB/s eta 0:00:03
   ------------------ --------------------- 4.2/9.1 MB 2.1 MB/s eta 0:00:03
   ---------

  You can safely remove it manually.


In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import joblib

In [2]:
CSV_FILE = 'Obfuscated_train.csv'         # <-- Change to your CSV file
TARGET_COLUMN = 'label'

In [3]:
df = pd.read_csv(CSV_FILE)
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object' or df[col].dtype.name == 'category':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

In [4]:
clf1 = RandomForestClassifier(n_estimators=100, random_state=42)
clf2 = GradientBoostingClassifier(n_estimators=100, random_state=42)
clf3 = LogisticRegression(max_iter=2000, random_state=42)
ensemble = VotingClassifier(estimators=[
    ('rf', clf1),
    ('gb', clf2),
    ('lr', clf3)
], voting='soft')

In [5]:
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('ensemble', ensemble)
])


In [6]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy', n_jobs=1)
print(f'5-Fold CV Accuracies: {scores}')
print(f'Average CV Accuracy: {scores.mean():.4f}')

5-Fold CV Accuracies: [0.99989334 1.         1.         1.         1.        ]
Average CV Accuracy: 1.0000


In [7]:
pipeline.fit(X, y)
joblib.dump(pipeline, 'obfuscated_model.joblib')
print("Model saved as 'obfuscated_model.joblib'")

Model saved as 'obfuscated_model.joblib'
