In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier

# Load CSV
df = pd.read_csv("C:\\Users\\abuba\\OneDrive\\Documents\\practice1.csv")
df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,target,category_1,category_2
0,-0.280283,0.757304,0.850983,-1.435563,-0.758349,-0.160653,3.302093,-2.722456,1.421218,-1.380652,-0.227744,-3.771355,0.412071,0.083452,0.605133,1,C,X
1,0.906834,0.483841,1.777295,1.313999,-1.652753,-1.912851,4.269466,-3.166588,3.232119,-1.451835,0.246883,-4.008159,-0.664701,0.613594,-0.042332,1,A,X
2,0.945312,-0.442589,0.089395,-2.642868,2.724612,-2.068376,1.235409,1.119473,-2.460589,-0.552569,4.843362,-0.522151,-1.873006,-1.625506,0.007728,1,C,X
3,1.302928,0.579062,0.727122,5.649452,-1.219417,-1.925262,-0.624623,0.550589,2.961265,-0.665032,-0.875058,2.213647,-1.431114,1.290764,0.467091,1,C,X
4,-0.302269,-0.068286,-0.128398,-0.217517,-0.78487,0.338758,-0.024687,-1.169681,0.505409,0.530867,-0.834478,-0.853072,-0.935831,-0.205742,-0.263424,0,A,Y


In [3]:
X = df.drop('target', axis=1)
y = df['target']

In [5]:
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [7]:
# Scaling numerical features, One-Hot Encoding categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

In [9]:
# Feature selection: Top 10 numerical features
feature_selector = SelectKBest(score_func=f_classif, k=10)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),     
    ('feature_selection', feature_selector),  
    ('classifier', model)              
])

In [11]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y, cv=kf, scoring='accuracy')

print("CV Accuracy scores:", cv_scores)
print("Average CV Accuracy:", cv_scores.mean())

CV Accuracy scores: [0.93  0.905 0.94  0.945 0.95 ]
Average CV Accuracy: 0.9339999999999999


In [13]:
pipeline.fit(X, y)  # Full data train

In [15]:
# Example: predicting first 5 rows
pipeline.predict(X.head())

array([1, 1, 1, 1, 0], dtype=int64)

In [17]:

importances = pipeline.named_steps['classifier'].feature_importances_
feature_names_after_selection = pipeline.named_steps['feature_selection'].get_feature_names_out()
feature_importance_df = pd.DataFrame({
    'feature': feature_names_after_selection,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(feature_importance_df)

  feature  importance
4      x8    0.241191
1      x4    0.185591
3      x7    0.143144
7     x13    0.129835
5     x10    0.113344
0      x3    0.078163
2      x6    0.055671
8     x14    0.025421
6     x12    0.023677
9     x15    0.003962


In [19]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report

y_pred = cross_val_predict(pipeline, X, y, cv=5)
print(confusion_matrix(y, y_pred))
print(classification_report(y, y_pred))

[[465  37]
 [ 26 472]]
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       502
           1       0.93      0.95      0.94       498

    accuracy                           0.94      1000
   macro avg       0.94      0.94      0.94      1000
weighted avg       0.94      0.94      0.94      1000

