In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_sample_weight



In [24]:
from google.colab import drive
drive.mount('/content/drive')
base_path = "/content/drive/MyDrive/Colab Notebooks"

# X is features y is prediction (This for me to remember)
# Set pathing for base and validation sets (CHANGE THESE TO YOUR PATHINGS)
pa_path = f"{base_path}/project_adult.csv"
pvi_path = f"{base_path}/project_validation_inputs.csv"
output_path = f"{base_path}/Group_01_MLP_PredictedOutputs.csv"

# Read the files in
pa = pd.read_csv(pa_path)
pvi = pd.read_csv(pvi_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
def preprocess_data_label(df):
    # Drop unnamed col
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # Handle missing values
    df = df.dropna()
    df = df[~df.isin(['?']).any(axis=1)]

    # Separate categorical and numeric columns
    cat_cols = df.select_dtypes(include=['object']).columns
    income_col = df['income'].copy()
    cat_cols = cat_cols.drop('income')
    num_cols = df.select_dtypes(exclude=['object']).columns

    # Encode target
    income_col = income_col.map({'<=50K': 0, '>50K': 1})

    # Fit encoders for categorical features
    encoders = {}
    for col in cat_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le

    # Fit scaler for numeric features
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    # Define feature order explicitly
    feature_order = num_cols.tolist() + cat_cols.tolist()

    # Final DataFrame (with income at the end)
    df = pd.concat([df[feature_order], income_col.rename('income')], axis=1)

    X = df[feature_order].values
    y = df['income'].values.ravel()

    # Return features, target, transformers, and the column order
    return X, y, encoders, scaler, feature_order, df


def preprocess_data_validation_label(df, encoders, scaler, feature_order):
    # Drop unnamed col
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # Handle missing values
    df = df.dropna()
    df = df[~df.isin(['?']).any(axis=1)]

    # Separate categorical and numeric columns
    cat_cols = [col for col in df.select_dtypes(include=['object']).columns if col in encoders]
    num_cols = df.select_dtypes(exclude=['object']).columns

    # Use existing encoders
    for col in cat_cols:
        le = encoders[col]
        df[col] = df[col].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)

    # Use the same scaler
    df[num_cols] = scaler.transform(df[num_cols])

    X_val_df = df[feature_order]
    # Reorder columns to match training
    X_val = X_val_df.values

    return X_val, X_val_df


In [26]:
# # Apply preprocessing

# Training set
X_label, y_label, encoders, scaler, feature_order, test1 = preprocess_data_label(pa)

# Validation/Test set
X_label_validation, test2 = preprocess_data_validation_label(pvi, encoders, scaler, feature_order)

# Split training and test and stratify on y to make sure the split is true and representitive (Would of done cv for algo approach but didnt ): )
X_train, X_test, y_train, y_test = train_test_split(
    X_label, y_label,
    test_size=0.2,
    random_state=42,
    stratify=y_label
)

In [27]:
print(test1.columns, test2.columns)

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass', 'education', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'native-country',
       'income'],
      dtype='object') Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass', 'education', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'native-country'],
      dtype='object')


In [48]:
param_grid = {
    "hidden_layer_sizes": [(64,), (128,), (64, 32), (128, 64)],
    #"hidden_layer_sizes": [(64,)],
    "activation": ["relu", "tanh"],
    #"activation": ["tanh"],
    "alpha": [1e-4, 1e-3, 1e-2],
    #"alpha": [1e-4],
    "learning_rate_init": [1e-3, 10e-3, 14e-3],
    "solver": ["adam"],
    "max_iter": [300],
    "early_stopping": [True],
    "random_state": [42],
}

clf = MLPClassifier()

gs = GridSearchCV(clf, param_grid=param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=1)
gs.fit(X_train, y_train)

best_model = gs.best_estimator_
print("Best params:", gs.best_params_)

print("Iterations used:", best_model.n_iter_)
print("Did Early Stopping trigger?:", best_model._no_improvement_count > 0)
print("Loss after last iteration:", best_model.loss_)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best params: {'activation': 'tanh', 'alpha': 0.01, 'early_stopping': True, 'hidden_layer_sizes': (64, 32), 'learning_rate_init': 0.01, 'max_iter': 300, 'random_state': 42, 'solver': 'adam'}
Iterations used: 21
Did Early Stopping trigger?: True
Loss after last iteration: 0.33458600251301657


In [49]:
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

Test Accuracy: 0.8418874172185431
Confusion Matrix:
 [[3353  277]
 [ 487  715]]
              precision    recall  f1-score   support

           0      0.873     0.924     0.898      3630
           1      0.721     0.595     0.652      1202

    accuracy                          0.842      4832
   macro avg      0.797     0.759     0.775      4832
weighted avg      0.835     0.842     0.837      4832



In [50]:
final_model = gs.best_estimator_
final_model.fit(X_label, y_label)

In [51]:
# Vorhersage {0,1}
y_pred_val_01 = final_model.predict(X_label_validation)

# Abbildung auf {1, -1}
y_pred_val_pm1 = np.where(y_pred_val_01 == 1, 1, -1)

group_num = "01"  # <- Gruppennummer einsetzen
out_csv = f"{output_path}\\Group_{group_num}_MLP_PredictedOutputs.csv"
pd.DataFrame({"prediction": y_pred_val_pm1}).to_csv(out_csv, index=False)
print("Saved:", out_csv)

Saved: /content/drive/MyDrive/Colab Notebooks/Group_01_MLP_PredictedOutputs.csv\Group_01_MLP_PredictedOutputs.csv
