In [6]:
# Standard library imports
import os

# Third party library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [7]:
# Load the blinded test set from its CSV file.
BLINDED_FILE_PATH = "../datasets/TASK_2/TASK_2/blinded_test_set.csv"
blinded_test_data = pd.read_csv(filepath_or_buffer=BLINDED_FILE_PATH)

In [8]:
# Load the training split from its CSV file into a DataFrame.
FILE_PATH = "../datasets/TASK_2/TASK_2/train_set.csv"
train_data = pd.read_csv(filepath_or_buffer=FILE_PATH)

In [9]:
# Load the labelled test split from its CSV file into a DataFrame.
TEST_PATH = "../datasets/TASK_2/TASK_2/test_set.csv"
test_data = pd.read_csv(filepath_or_buffer=TEST_PATH)

In [10]:
# Number of training rows per CLASS label, shown as a small table.
train_class_counts = train_data["CLASS"].value_counts()
train_class_counts.reset_index()

Unnamed: 0,CLASS,count
0,0,191
1,1,124


In [11]:
# Number of test rows per CLASS label, shown as a small table.
test_class_counts = test_data["CLASS"].value_counts()
test_class_counts.reset_index()

Unnamed: 0,CLASS,count
0,0,58
1,1,42


In [12]:
# Split the test frame into the target (CLASS) and the predictors
# (everything except the ID and CLASS columns).
y_test = test_data["CLASS"]
X_test = test_data.drop(["ID", "CLASS"], axis=1)

In [13]:
# Split the training frame into the target (CLASS) and the predictors
# (everything except the ID and CLASS columns).
y_train = train_data["CLASS"]
X_train = train_data.drop(["ID", "CLASS"], axis=1)

In [14]:
# Count test rows whose feature values exactly repeat an earlier test row.
duplicate_test_rows = X_test.duplicated()
duplicate_test_rows.sum()

np.int64(0)

In [15]:
# Number of distinct ID values that occur in both the train and the test split.
# NOTE(review): the recorded result (100) equals the test-set row count (58 + 42),
# so apparently every test ID also occurs in train -- confirm what ID identifies
# and whether this overlap leaks information between the splits.
shared_ids = set(train_data["ID"]) & set(test_data["ID"])
len(shared_ids)

100

In [16]:
# List the names of all numeric columns in the training frame.
numeric_train = train_data.select_dtypes(include="number")
numeric_train.columns

Index(['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5',
       'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10',
       ...
       'Feature_3230', 'Feature_3231', 'Feature_3232', 'Feature_3233',
       'Feature_3234', 'Feature_3235', 'Feature_3236', 'Feature_3237',
       'Feature_3238', 'CLASS'],
      dtype='object', length=3239)

In [17]:
# Show the dtype of every column in the training frame.
train_data.dtypes

ID               object
Feature_1       float64
Feature_2       float64
Feature_3       float64
Feature_4       float64
                 ...   
Feature_3235    float64
Feature_3236    float64
Feature_3237    float64
Feature_3238    float64
CLASS             int64
Length: 3240, dtype: object

In [18]:
# Compare the class balance of the train and test splits.
train_dist = train_data["CLASS"].value_counts(normalize=True)
test_dist = test_data["CLASS"].value_counts(normalize=True)

# Align on the class label with fill_value=0 so that a class missing from one
# split contributes its full proportion; a plain subtraction would give NaN
# for that class and .sum() would silently skip it.
dist_diff = train_dist.sub(test_dist, fill_value=0).abs().sum()

# 0.1 is the notebook's chosen tolerance on the summed absolute difference.
if dist_diff > 0.1:
    print(f"Warning: There is a large distribution difference: {dist_diff:.3f}")
else:
    print(f"Distribution difference is acceptable: {dist_diff:.3f}")


Distribution difference is acceptable: 0.053


In [19]:
# Count missing labels in the training target.
y_train.isnull().sum()

np.int64(0)

In [20]:
# For each training row, count how many feature values are missing, then
# tabulate how many rows share each missing-value count.
missing_per_row = X_train.isna().sum(axis="columns")
missing_per_row.value_counts().reset_index()


Unnamed: 0,index,count
0,0,199
1,23,116


In [21]:
import numpy as np

# Total number of +inf / -inf entries across all training features.
inf_flags = np.isinf(X_train.to_numpy())
inf_flags.sum()


np.int64(4)

In [22]:
# Per-feature count of infinite values; only features with at least one are shown.
inf_mask = np.isinf(X_train)
inf_summary = inf_mask.sum(axis="index")
inf_summary.loc[inf_summary.gt(0)]

Feature_72    2
Feature_90    2
dtype: int64

In [23]:
# Treat +/-inf as missing so the imputers handle them together with real NaNs.
X_clean = X_train.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [24]:
# Apply the same inf -> NaN conversion to the test features.
X_test = X_test.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [25]:
# Spot-check one training row (position 50): how many of its features are missing?
row_50_missing = X_train.iloc[50].isna()
row_50_missing.value_counts().reset_index()

Unnamed: 0,50,count
0,False,3215
1,True,23


In [26]:
from sklearn.impute import SimpleImputer, KNNImputer

# Variant 1: per-feature median imputation, learned on the training data and
# re-applied to the test data. (These two results are not referenced again in
# the visible cells; the KNN variant below is the one that gets scaled.)
imputer = SimpleImputer(strategy="median", missing_values=np.nan)
X_imputed = imputer.fit_transform(X_clean)
X_test_simple = imputer.transform(X_test)

# Variant 2: KNN imputation with 5 neighbours, likewise fitted on train only.
knn_imputer = KNNImputer(n_neighbors=5)
X_imputed_knn = knn_imputer.fit_transform(X_clean)
X_test_knn = knn_imputer.transform(X_test)

In [29]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# Standardise the KNN-imputed features. The scaler learns its statistics from
# the training data only and is then applied unchanged to the test data.
scalar = StandardScaler()
scalar.fit(X_imputed_knn)
X_scaled = scalar.transform(X_imputed_knn)
X_test = scalar.transform(X_test_knn)

In [28]:
# scalar.mean_, scalar.var_

In [30]:
# Wrap the scaled arrays back into DataFrames, restoring the original feature names.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X_train.columns)
X_test = pd.DataFrame(data=X_test, columns=X_train.columns)
X_scaled_df


Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_3229,Feature_3230,Feature_3231,Feature_3232,Feature_3233,Feature_3234,Feature_3235,Feature_3236,Feature_3237,Feature_3238
0,-0.756859,-0.756281,-0.768458,1.790937,-0.703377,-0.622357,0.757830,-0.722168,0.757830,0.038485,...,-1.903318,-1.903318,-0.795461,0.0,-0.386250,-1.156615,0.0,-0.208265,1.487482,-1.156615
1,-0.684660,-0.686690,-0.980408,0.279318,0.567037,0.393269,-0.768013,0.617284,-0.768013,-0.059407,...,-0.133632,-0.133632,-0.683039,0.0,-0.309647,-0.493571,0.0,-0.337385,0.542650,-0.493571
2,-0.381832,-0.380581,-0.235997,0.738313,-0.658969,-0.592319,0.689108,-0.672790,0.689108,-0.062952,...,-0.972867,-0.972867,-0.396858,0.0,-0.308825,-0.995803,0.0,-0.354606,1.137855,-0.995803
3,0.231255,0.232764,0.851293,0.408853,-1.186714,-0.927422,1.599928,-1.272312,1.599928,-0.063502,...,-0.486524,-0.486524,0.283390,0.0,-0.154057,-0.763600,0.0,-0.374634,0.852824,-0.763600
4,0.203521,0.203601,0.171085,-0.825213,0.093786,-0.031715,-0.308667,0.138226,-0.308667,-0.058745,...,0.680798,0.680798,0.173152,0.0,0.084921,0.703383,0.0,0.196797,-0.859371,0.703383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,0.433827,0.435696,0.886849,-0.217469,-0.847973,-0.717820,0.990873,-0.884254,0.990873,-0.060837,...,0.202267,0.202267,0.498488,0.0,-0.027807,-0.355315,0.0,0.162030,0.397758,-0.355315
311,-1.168752,-1.170949,-1.748648,1.717631,1.244825,1.018591,-1.401877,1.297774,-1.401877,-0.060507,...,-1.901386,-1.901386,-1.207844,0.0,-0.440121,-0.957524,0.0,0.652483,1.190021,-0.957524
312,0.035969,0.035381,0.101320,-0.395710,-0.196988,-0.259780,0.043164,-0.169541,0.043164,-0.058273,...,0.271247,0.271247,0.120722,0.0,-0.091668,-0.222419,0.0,-0.963045,0.088390,-0.222419
313,-0.472817,-0.471784,-0.331835,1.032608,-0.739118,-0.646287,0.814087,-0.762044,0.814087,-0.061786,...,-1.381913,-1.381913,-0.550115,0.0,-0.323534,-0.894995,0.0,0.392094,1.023034,-0.894995


In [31]:
# Keep the original feature names (the column Index of X_train) so that the
# boolean masks returned by the feature selectors can be mapped back to names.
feature_columns = X_train.columns

In [26]:
from sklearn.feature_selection import VarianceThreshold

# Drop features whose variance on the (already standardised) training data is
# at most 0.01, and apply the same column selection to the test data.
selector = VarianceThreshold(threshold=0.01)
X_var = selector.fit_transform(X_scaled_df)
X_test = selector.transform(X_test)

n_before, n_after = X_scaled_df.shape[1], X_var.shape[1]
print(f"Reduced from {n_before} to {n_after} features")

# Translate the boolean support mask into the names of the surviving features.
selected_mask = selector.get_support()
selected_features = [
    column for column, keep in zip(feature_columns, selected_mask) if keep
]

Reduced from 3238 to 3092 features


In [27]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate selection: keep the 100 features with the highest f_classif score
# against the training labels.
selector = SelectKBest(score_func=f_classif, k=100)  # Try k=100 to start
selector.fit(X_var, y_train)

X_kbest = selector.transform(X_var)
X_test = selector.transform(X_test)

# Names of the retained features, taken from the variance-filtered name list.
selected_mask = selector.get_support()
selected_k_features = [
    column for column, keep in zip(selected_features, selected_mask) if keep
]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Alternative feature-selection experiment: recursive feature elimination with
# a small random forest, keeping 10 of the standardised features.
estimator = RandomForestClassifier(n_estimators=30, random_state=42)
feature_selector = RFE(estimator=estimator, n_features_to_select=10)

# RFE is fitted on the full standardised feature set, so the test matrix must
# have the same columns. X_test has already been reduced by the earlier
# selectors at this point, so rebuild the full scaled test matrix from the
# fitted scaler instead of transforming X_test (which would raise a
# feature-count mismatch).
X_test_scaled_full = pd.DataFrame(scalar.transform(X_test_knn), columns=feature_columns)

# Results go into dedicated variables so the SelectKBest outputs
# (X_kbest / X_test / selected_k_features) consumed by later cells stay intact.
X_rfe = feature_selector.fit_transform(X_scaled_df, y_train)  # y = your target column (CLASS)
X_test_rfe = feature_selector.transform(X_test_scaled_full)

rfe_mask = feature_selector.get_support()
selected_rfe_features = [column for column, keep in zip(feature_columns, rfe_mask) if keep]

In [26]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Alternative feature-selection experiment: keep at most 100 features chosen
# by a random forest fitted on the standardised training features.
sfm_estimator = RandomForestClassifier(n_estimators=50, random_state=42)
sfm_selector = SelectFromModel(sfm_estimator, max_features=100)

# The selector is fitted on the full standardised feature set, so the test
# matrix must have the same columns. X_test has already been reduced by the
# earlier selectors, so rebuild the full scaled test matrix from the fitted
# scaler instead of transforming X_test (which would raise a feature-count
# mismatch).
X_test_scaled_full = pd.DataFrame(scalar.transform(X_test_knn), columns=feature_columns)

# Results go into dedicated variables so the SelectKBest outputs
# (X_kbest / X_test / selected_k_features) consumed by later cells stay intact.
X_sfm = sfm_selector.fit_transform(X_scaled_df, y_train)  # y = your target column (CLASS)
X_test_sfm = sfm_selector.transform(X_test_scaled_full)

sfm_mask = sfm_selector.get_support()
selected_sfm_features = [column for column, keep in zip(feature_columns, sfm_mask) if keep]



In [28]:
# Give the selected train/test matrices their feature names back.
X_kbest_df = pd.DataFrame(data=X_kbest, columns=selected_k_features)
X_test = pd.DataFrame(data=X_test, columns=selected_k_features)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Candidate models and the hyper-parameter grid to search for each one.
# Each entry maps a display name to:
#   'model'  - the unfitted estimator
#   'params' - the parameter grid for that estimator
models_config = {

    'Logistic Regression': {

        'model': LogisticRegression(random_state=42, max_iter=2000),
        'params': {
            'C': [0.001, 0.01, 0.1, 1, 5, 10, 50, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga']
        }
    },

    'Random Forest': {

        'model': RandomForestClassifier(random_state=42, n_estimators=100),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10, 15, 20],
            'min_samples_split': [2, 5, 10, 20],
            'min_samples_leaf': [1, 2, 4, 8],
            'max_features': ['sqrt', 'log2', 0.3, 0.5]
        }
    },

    'Gradient Boosting': {

        # Must be a GradientBoostingClassifier: the grid below uses
        # n_estimators / learning_rate / subsample, which a
        # DecisionTreeClassifier does not accept.
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 0.9, 1.0],
            'min_samples_split': [2, 5, 10]
        }

    }

}

In [32]:
# Expand the Logistic Regression grid into explicit (C, penalty, solver)
# combinations, keeping only penalty/solver pairs this notebook allows.
lr_params = models_config["Logistic Regression"]["params"]

# Solvers accepted for each penalty. Elastic-net is only available with the
# 'saga' solver in scikit-learn; any penalty not listed here is skipped.
supported_solvers = {
    "l1": ("liblinear", "saga"),
    "l2": ("liblinear", "saga"),
    "elasticnet": ("saga",),
}

if isinstance(lr_params, dict):
    param_combinations = []
    for C in lr_params["C"]:
        for penalty in lr_params["penalty"]:
            for solver in lr_params["solver"]:
                if solver not in supported_solvers.get(penalty, ()):
                    continue
                # Every value is wrapped in a list: GridSearchCV accepts a list
                # of grids, but each grid value must itself be a list/array.
                combination = {
                    'C': [C],
                    'penalty': [penalty],
                    'solver': [solver]
                }
                if penalty == "elasticnet":
                    # Elastic-net additionally needs the l1_ratio values to try.
                    combination['l1_ratio'] = list(lr_params["l1_ratio"])
                param_combinations.append(combination)
else:
    # The grid was already expanded by a previous run of this cell; keep it
    # so that re-running the cell is harmless.
    param_combinations = lr_params

models_config["Logistic Regression"]["params"] = param_combinations

In [30]:
# Display the current model / parameter-grid configuration for inspection.
models_config

{'Logistic Regression': {'model': LogisticRegression(max_iter=2000, random_state=42),
  'params': {'C': [0.001, 0.01, 0.1, 1, 5, 10, 50, 100],
   'penalty': ['l1', 'l2'],
   'solver': ['liblinear', 'saga']}},
 'Random Forest': {'model': RandomForestClassifier(random_state=42),
  'params': {'n_estimators': [100, 200, 300],
   'max_depth': [None, 5, 10, 15, 20],
   'min_samples_split': [2, 5, 10, 20],
   'min_samples_leaf': [1, 2, 4, 8],
   'max_features': ['sqrt', 'log2', 0.3, 0.5]}},
 'Gradient Boosting': {'model': GradientBoostingClassifier(random_state=42),
  'params': {'n_estimators': [100, 200],
   'learning_rate': [0.01, 0.1, 0.2],
   'max_depth': [3, 5, 7],
   'subsample': [0.8, 0.9, 1.0],
   'min_samples_split': [2, 5, 10]}}}

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score

# 5-fold stratified CV, shuffled with a fixed seed, shared by all models.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# name -> {'model': best fitted estimator, 'best_params': ..., 'cv_score': ...}
models = dict()

# NOTE(review): X_kbest_df was produced by SelectKBest fitted on all of
# y_train, so the CV scores below are computed on features that have already
# seen the validation folds' labels and are likely optimistic.
for name, config in models_config.items():
    print(f"Training {name}...")

    # Exhaustive search over the model's grid, scored by accuracy.
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=cv,
        n_jobs=-1,
        scoring='accuracy',
        verbose=0,
    )
    grid_search.fit(X_kbest_df, y_train)

    models[name] = {
        'model': grid_search.best_estimator_,
        'best_params': grid_search.best_params_,
        'cv_score': grid_search.best_score_
    }

    # Single quotes inside the f-strings: reusing the enclosing double quote
    # is a SyntaxError on Python versions before 3.12.
    print(f"Best CV Score: {models[name]['cv_score']:.4f}")
    print(f"Best Parameters: {models[name]['best_params']}")

Training Logistic Regression...


Best CV Score: 0.6635
Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'saga'}
Training Random Forest...
Best CV Score: 0.6444
Best Parameters: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Training Gradient Boosting...
Best CV Score: 0.6317
Best Parameters: {'learning_rate': 0.2, 'max_depth': 7, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.8}


In [34]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

# name -> test accuracy, CV score, hard predictions and (if available)
# predicted probability of the positive class.
results = {}

for name, model_info in models.items():
    model = model_info['model']
    cv_score = model_info['cv_score']

    y_pred = model.predict(X_test)

    # Not every estimator exposes predict_proba; store None when it is absent.
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_pred_proba = None

    accuracy = accuracy_score(y_test, y_pred)

    results[name] = dict(
        accuracy=accuracy,
        cv_score=cv_score,
        predictions=y_pred,
        probabilities=y_pred_proba,
    )

    print(f"\n{name}:")
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"CV Score: {cv_score:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    



Logistic Regression:
Test Accuracy: 0.6100
CV Score: 0.6635

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.84      0.72        58
           1       0.57      0.29      0.38        42

    accuracy                           0.61       100
   macro avg       0.60      0.57      0.55       100
weighted avg       0.60      0.61      0.57       100


Random Forest:
Test Accuracy: 0.5800
CV Score: 0.6444

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.81      0.69        58
           1       0.50      0.26      0.34        42

    accuracy                           0.58       100
   macro avg       0.55      0.54      0.52       100
weighted avg       0.56      0.58      0.55       100


Gradient Boosting:
Test Accuracy: 0.5700
CV Score: 0.6317

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.69      0.65 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with explicit settings and score it on the very data it
# was trained on, so the reports below describe training-set fit only.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
clf_rf.fit(X_kbest_df, y_train)

y_pred_rf = clf_rf.predict(X_kbest_df)
for report in (confusion_matrix(y_train, y_pred_rf), classification_report(y_train, y_pred_rf)):
    print(report)


[[191   0]
 [  0 124]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       191
           1       1.00      1.00      1.00       124

    accuracy                           1.00       315
   macro avg       1.00      1.00      1.00       315
weighted avg       1.00      1.00      1.00       315



In [39]:
# Check feature distributions
#
# For every selected feature, measure how far the test mean lies from the
# train mean in units of the train standard deviation, and count the features
# whose shift exceeds 2 in either direction.

train_selected = X_kbest_df[selected_k_features]
test_selected = X_test[selected_k_features]

train_std = train_selected.std()
mean_shift = train_selected.mean() - test_selected.mean()

# Features with zero (or undefined) spread are skipped: the standardised
# shift cannot be computed for them.
has_spread = train_std > 0
z_scores = mean_shift[has_spread] / train_std[has_spread]

# Use the absolute value so that a test mean far *above* the train mean is
# flagged as well, not only a test mean far below it.
feature_drift_count = int((z_scores.abs() > 2).sum())

if feature_drift_count > len(selected_k_features) * 0.2:
    print(f"Warning: {feature_drift_count} features show significant distribution drift")
else:
    print(f"Feature distributions look stable ({feature_drift_count} potential drift)")

Feature distributions look stable (0 potential drift)


In [36]:
# Check for overfitting
#
# Compare each tuned model's accuracy on the data it was trained on with its
# accuracy on the test split.

for name, model_info in models.items():
    model = model_info['model']
    train_pred = model.predict(X_kbest_df)
    train_acc = accuracy_score(y_train, train_pred)

    test_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, test_pred)

    # Signed gap: positive means the model does better on train than on test.
    # Taking the absolute value would also flag a model that does better on
    # test than on train, which is not overfitting.
    overfitting = train_acc - test_acc
    print(f"{name}: Train Accuracy: {train_acc:.3f}, Test Accuracy: {test_acc:.3f}, Overfitting: {overfitting:.3f}")

    # 0.1 is the notebook's chosen tolerance for the train/test gap.
    if overfitting > 0.1:
        print(f"Potential overfitting detected for {name} model")

Logistic Regression: Train Accuracy: 0.654, Test Accuracy: 0.610, Overfitting: 0.044
Random Forest: Train Accuracy: 1.000, Test Accuracy: 0.560, Overfitting: 0.440
Potential overfitting detected for Random Forest model
SVM: Train Accuracy: 0.698, Test Accuracy: 0.620, Overfitting: 0.078


In [39]:
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# Accuracy of a default decision tree under 10-fold stratified CV
# (shuffled, fixed seed) on the selected training features.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(estimator=clf, X=X_kbest_df, y=y_train, scoring="accuracy", cv=kf)

