In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, confusion_matrix

In [2]:
# --- 1. Load and Clean Training Data ---
train_df = pd.read_csv('../train_set.csv')

In [3]:
train_df

Unnamed: 0,ID,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_3230,Feature_3231,Feature_3232,Feature_3233,Feature_3234,Feature_3235,Feature_3236,Feature_3237,Feature_3238,CLASS
0,ID_1,18281.541667,18432.0,9409.650391,0.514708,0.011300,0.045369,2.803803,0.356658,1.803803,...,382.968383,2214.0,1.0,136.625113,0.061710,0.0,28.154838,4.174959,0.061710,0
1,ID_2,20010.083333,20100.0,8303.049072,0.417707,0.014959,0.080294,2.338398,0.429532,1.338398,...,452.986164,2548.5,1.0,232.564022,0.090548,0.0,27.934229,3.931950,0.090548,1
2,ID_3,27260.125000,27437.0,12189.649414,0.447160,0.011428,0.046402,2.782842,0.359345,1.782842,...,419.781765,3400.0,1.0,233.593529,0.068704,0.0,27.904807,4.085035,0.068704,1
3,ID_4,41938.125000,42138.0,17866.433594,0.426019,0.009908,0.034878,3.060655,0.326727,2.060655,...,439.023968,5424.0,1.0,427.429572,0.078803,0.0,27.870588,4.011726,0.078803,0
4,ID_5,41274.125000,41439.0,14315.041992,0.346828,0.013596,0.065680,2.478506,0.403469,1.478506,...,485.209184,5096.0,1.0,726.731554,0.142608,0.0,28.846909,3.571352,0.142608,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,ID_311,46787.916667,47002.0,18052.070312,0.385828,0.010883,0.042086,2.874885,0.347840,1.874885,...,466.276055,6064.0,1.0,585.547823,0.096561,0.0,28.787507,3.894684,0.096561,0
311,ID_312,8420.354167,8493.0,4292.039795,0.510004,0.016911,0.101797,2.145061,0.466555,1.145061,...,383.044821,987.0,1.0,69.155790,0.070369,0.0,29.625473,4.098452,0.070369,1
312,ID_313,37262.750000,37407.0,13950.793945,0.374390,0.012759,0.057837,2.585819,0.386725,1.585819,...,469.005263,4940.0,1.0,505.566802,0.102341,0.0,26.865256,3.815115,0.102341,0
313,ID_314,25081.833333,25251.0,11689.275391,0.466045,0.011197,0.044546,2.820962,0.354489,1.820962,...,403.597826,2944.0,1.0,215.172554,0.073089,0.0,29.180584,4.055504,0.073089,1


In [4]:
#Checking for nan and inf values:
has_nan = train_df.isna().any().any()
has_inf = train_df.isin([np.inf, -np.inf]).any().any()
print(f'Nan values: {has_nan}')
print(f'Infinite values: {has_inf}')

Nan values: True
Infinite values: True


In [5]:
#Finding out nan and inf columns:
# Columns with NaN values
nan_columns = train_df.columns[train_df.isna().any()]

# Columns with inf or -inf values
inf_columns = train_df.columns[train_df.isin([np.inf, -np.inf]).any()]

columns_with_nan_or_inf = nan_columns.union(inf_columns)

print("Columns with NaN or inf values:", list(columns_with_nan_or_inf))

Columns with NaN or inf values: ['Feature_1712', 'Feature_1713', 'Feature_1714', 'Feature_1715', 'Feature_1716', 'Feature_1717', 'Feature_1718', 'Feature_1719', 'Feature_1720', 'Feature_1721', 'Feature_1722', 'Feature_1723', 'Feature_1724', 'Feature_1725', 'Feature_1726', 'Feature_1727', 'Feature_1728', 'Feature_1729', 'Feature_1730', 'Feature_1731', 'Feature_1732', 'Feature_1733', 'Feature_1734', 'Feature_72', 'Feature_90']


In [6]:
# Replace inf/-inf with NaN
train_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Replace NaNs with column mean (or median, or 0)
train_df.fillna(train_df.median(numeric_only=True), inplace=True)

In [7]:
# List of columns to exclude
exclude_cols = ['ID', 'CLASS']

# Convert all other columns to float64
train_df = train_df.astype({col: 'float64' for col in train_df.columns if col not in exclude_cols})

In [8]:
train_df.columns[train_df.dtypes != 'float64'] 

Index(['ID', 'CLASS'], dtype='object')

In [9]:
#Handling duplicates:
train_df.drop_duplicates(inplace=True)

In [10]:
#Clipping outliers to 1% and 99% 
numeric_cols = train_df.select_dtypes(include=['float64', 'int64']).columns
# Clip outliers to 1st and 99th percentile
for col in numeric_cols:
    low, high = train_df[col].quantile([0.01, 0.99])
    train_df[col] = train_df[col].clip(low, high)

In [11]:
#Let's split the dataset:
X = train_df.drop(columns=['ID', 'CLASS'])
y = train_df['CLASS']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15)

In [12]:
print(X_train.shape,y_train.shape)

(267, 3238) (267,)


In [13]:
#Using Standard Scalar:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [14]:
# Remove features with low variance
selector = VarianceThreshold(threshold=0.01)  # or try 1e-4
X_train = selector.fit_transform(X_train)
print(X_train.shape)

(267, 3110)


In [15]:
#Using PCA dimensionality reduction]
# pca = PCA(n_components=200)
# X_train = pca.fit_transform(X_train)
# print(X_train.shape)

In [16]:
#Select K best features
kselector = SelectKBest(score_func=mutual_info_classif, k=200)  # or choose top N
X_train = kselector.fit_transform(X_train, y_train)
print(X_train.shape)

(267, 200)


In [17]:
#Include polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train = poly.fit_transform(X_train)
print(X_train.shape)

(267, 20100)


In [18]:
model = LogisticRegression(penalty='l2', C=0.01, solver='lbfgs', max_iter=1000)
model.fit(X_train,y_train)

In [19]:
print(X_val.shape, y_val.shape)

(48, 3238) (48,)


In [20]:
#Validation errors:
X_val = scaler.transform(X_val)
X_val = selector.transform(X_val)
# X_val = pca.transform(X_val)
X_val = kselector.transform(X_val)
X_val = poly.transform(X_val)
pred = model.predict(X_val)
print(pred)

[0 0 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0
 1 1 0 1 0 1 0 1 0 0 1]


In [21]:
print(accuracy_score(y_val,pred))

0.4166666666666667


In [22]:
#Test accuracy:
test_df = pd.read_csv('../test_set.csv')

df = test_df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(df.median(numeric_only=True))
df = df.drop_duplicates(subset=['ID'])
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
# Clip outliers to 1st and 99th percentile
for col in numeric_cols:
    low, high = df[col].quantile([0.01, 0.99])
    df[col] = df[col].clip(low, high)

In [23]:
X_test = df.drop(columns=['ID', 'CLASS'])
y_test = df['CLASS']

In [24]:
#Validation errors:
X_test = scaler.transform(X_test)
X_test = selector.transform(X_test)
# X_test = pca.transform(X_test)
X_test = kselector.transform(X_test)
X_test = poly.transform(X_test)
pred = model.predict(X_test)
print(pred)

[0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1
 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1
 0 0 1 1 0 0 0 1 0 1 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 1]


In [25]:
print(accuracy_score(y_test,pred))

0.6
