In [1]:
#Importing Necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [3]:
# Loading the datasets
train_features = pd.read_csv("C:\\Users\\HP\\Downloads\\train.csv")
train_labels = pd.read_csv("C:\\Users\\HP\\Downloads\\train_labels.csv")
test_features = pd.read_csv("C:\\Users\\HP\\Downloads\\test.csv")
sample_submission = pd.read_csv("C:\\Users\\HP\\Downloads\\sample_submission (1).csv")


In [4]:
# Cleaning Column Names
train_labels.columns = train_labels.columns.str.strip()

# Merging Labeled features with labels
labeled = pd.merge(train_labels, train_features, on="Id")

# Spliting into features and taregt variable
X_labeled = labeled.drop(columns=["Id", "Class_y"])
y_labeled = labeled["Class_y"]

In [5]:
# For the unlabeled data
unlabeled = train_features[~train_features["Id"].isin(train_labels["Id"])]
X_unlabeled = unlabeled.drop(columns=["Id"])

In [6]:
# Removing label like columns
label_cols = ['Class', 'Class_x', 'Class_y']
X_labeled = X_labeled.drop(columns=[col for col in label_cols if col in X_labeled.columns])
X_unlabeled = X_unlabeled.drop(columns=[col for col in label_cols if col in X_unlabeled.columns])


In [7]:
# For handling missing values using mean imputation
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_labeled_filled = pd.DataFrame(imputer.fit_transform(X_labeled), columns=X_labeled.columns)
X_unlabeled_filled = pd.DataFrame(imputer.transform(X_unlabeled), columns=X_unlabeled.columns)


In [8]:
# Aligning feature columns between labeled and unlabeled datasets
X_unlabeled_filled = X_unlabeled_filled[X_labeled_filled.columns]

# Selecting top 500 features using Classifier and mutual information
selector = SelectKBest(mutual_info_classif, k=500)
X_labeled_sel = selector.fit_transform(X_labeled_filled, y_labeled)
X_unlabeled_sel = selector.transform(X_unlabeled_filled)

In [9]:
# Sacling the features for getting the mean to 0 and variance to 1
scaler = StandardScaler()
X_labeled_scaled = scaler.fit_transform(X_labeled_sel)
X_unlabeled_scaled = scaler.transform(X_unlabeled_sel)

In [10]:
# Training the labeled data using xgboost classifer
xgb_base = xgb.XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss')
xgb_base.fit(X_labeled_scaled, y_labeled)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
# Generating pseudo-labels for unlabeled data
pseudo_probs = xgb_base.predict_proba(X_unlabeled_scaled)
pseudo_preds = xgb_base.predict(X_unlabeled_scaled)
conf_mask = np.max(pseudo_probs, axis=1) >= 0.95  # only retain predictions with >95% confidence


In [12]:
# Selecting confident pseudo-Labeled Data
X_pseudo = X_unlabeled_scaled[conf_mask]
y_pseudo = pseudo_preds[conf_mask]

# Combining labeled and pseudo unlabeled data
X_comb = np.vstack([X_labeled_scaled, X_pseudo])
y_comb = pd.concat([y_labeled, pd.Series(y_pseudo)], ignore_index=True)


In [13]:
# The final ensemble models
model_rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
model_xgb = xgb.XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss')
model_lr = LogisticRegression(penalty='l1', solver='saga', max_iter=1000, class_weight='balanced')


In [14]:
# Cross validation for evaulating ensemble methods
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1s = []

In [15]:
for train_idx, val_idx in skf.split(X_comb, y_comb):
    X_train, X_val = X_comb[train_idx], X_comb[val_idx]
    y_train, y_val = y_comb.iloc[train_idx], y_comb.iloc[val_idx]
    
    # Training all the three models
    model_rf.fit(X_train, y_train)
    model_xgb.fit(X_train, y_train)
    model_lr.fit(X_train, y_train)
    
    # Obtaining prediction probabilities from each model
    probs_rf = model_rf.predict_proba(X_val)
    probs_xgb = model_xgb.predict_proba(X_val)
    probs_lr = model_lr.predict_proba(X_val)
    
    # Average the probabilities and predicting the class
    avg_probs = (probs_rf + probs_xgb + probs_lr) / 3
    y_pred = np.argmax(avg_probs, axis=1)
    
    # for macro f1 score
    score = f1_score(y_val, y_pred, average='macro')
    f1s.append(score)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [16]:
# The cross validated scores
print("Cross-validated Macro F1 Scores:", f1s)
print("Average Ensemble Macro F1 Score:", np.mean(f1s))


Cross-validated Macro F1 Scores: [0.973103448275862, 1.0, 1.0, 1.0, 1.0]
Average Ensemble Macro F1 Score: 0.9946206896551724


In [17]:
# Retraining the final models on the data for test prediction
model_rf.fit(X_comb, y_comb)
model_xgb.fit(X_comb, y_comb)
model_lr.fit(X_comb, y_comb)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Preprocessing test data
X_test = test_features.drop(columns=["Id"])
X_test_filled = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
X_test_sel = selector.transform(X_test_filled)
X_test_scaled = scaler.transform(X_test_sel)
