In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

train = pd.read_csv("../data/aug_train.csv")

print(train.info())
print(train.size)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [3]:
print(train.columns)
print(train.head())
print(train.tail())

Index(['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target'],
      dtype='object')
   enrollee_id      city  city_development_index gender  \
0         8949  city_103                   0.920   Male   
1        29725   city_40                   0.776   Male   
2        11561   city_21                   0.624    NaN   
3        33241  city_115                   0.789    NaN   
4          666  city_162                   0.767   Male   

       relevent_experience enrolled_university education_level  \
0  Has relevent experience       no_enrollment        Graduate   
1   No relevent experience       no_enrollment        Graduate   
2   No relevent experience    Full time course        Graduate   
3   No relevent experience                 NaN        Graduate   
4  Has relevent experience    

In [4]:
print(train.isnull().sum())

enrollee_id                  0
city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64


In [5]:
# Fill categorical missing values
categorical_cols = ["gender", "enrolled_university", "education_level", 
                    "major_discipline", "company_size", "company_type"]
for col in categorical_cols:
    train[col].fillna("Unknown", inplace=True)

# Convert 'experience' and 'last_new_job' to numeric
def convert_experience(x):
    if x == ">20":
        return 21
    elif x == "<1":
        return 0
    try:
        return float(x)
    except:
        return np.nan

train["experience"] = train["experience"].apply(convert_experience)
train["last_new_job"] = train["last_new_job"].apply(convert_experience)

# Fill numeric missing values with median
numeric_cols = ["experience", "last_new_job"]
for col in numeric_cols:
    train[col].fillna(train[col].median(), inplace=True)

# Verify
print(train.isnull().sum())

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [6]:
categorical_cols = ["gender", "enrolled_university", "education_level", 
                    "major_discipline", "company_size", "company_type", "relevent_experience", "city"]

for col in categorical_cols:
    print(f"Unique values in {col}:")
    print(train[col].value_counts(dropna=False))
    print("-"*40)

Unique values in gender:
gender
Male       13221
Unknown     4508
Female      1238
Other        191
Name: count, dtype: int64
----------------------------------------
Unique values in enrolled_university:
enrolled_university
no_enrollment       13817
Full time course     3757
Part time course     1198
Unknown               386
Name: count, dtype: int64
----------------------------------------
Unique values in education_level:
education_level
Graduate          11598
Masters            4361
High School        2017
Unknown             460
Phd                 414
Primary School      308
Name: count, dtype: int64
----------------------------------------
Unique values in major_discipline:
major_discipline
STEM               14492
Unknown             2813
Humanities           669
Other                381
Business Degree      327
Arts                 253
No Major             223
Name: count, dtype: int64
----------------------------------------
Unique values in company_size:
company_size
Unkno

In [7]:
duplicates = train.duplicated().sum()
print(f"Number of exact duplicate rows: {duplicates}")

Number of exact duplicate rows: 0


In [8]:
# ---------------------------
# Columns
# ---------------------------
onehot_cols = ["gender", "relevent_experience", "enrolled_university", "major_discipline", "company_type"]
ordinal_cols = ["education_level", "company_size"]

# ---------------------------
# One-Hot Encoding (Nominal columns)
# ---------------------------
train_encoded = pd.get_dummies(train, columns=onehot_cols, drop_first=True)

# ---------------------------
# Ordinal Encoding (Columns with natural order)
# ---------------------------
education_order = ["Primary School", "High School", "Graduate", "Masters", "Phd"]
train_encoded["education_level"] = train["education_level"].map({k: i for i, k in enumerate(education_order)})

company_size_order = ["Unknown", "<10", "10-49", "50-99", "100-499", "500-999", 
                      "1000-4999", "5000-9999", "10000+"]
train_encoded["company_size"] = train["company_size"].map({k: i for i, k in enumerate(company_size_order)})

In [9]:
#aug_test.csv
test = pd.read_csv("../data/aug_test.csv")

print(test.info())
print(test.size)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             2129 non-null   int64  
 1   city                    2129 non-null   object 
 2   city_development_index  2129 non-null   float64
 3   gender                  1621 non-null   object 
 4   relevent_experience     2129 non-null   object 
 5   enrolled_university     2098 non-null   object 
 6   education_level         2077 non-null   object 
 7   major_discipline        1817 non-null   object 
 8   experience              2124 non-null   object 
 9   company_size            1507 non-null   object 
 10  company_type            1495 non-null   object 
 11  last_new_job            2089 non-null   object 
 12  training_hours          2129 non-null   int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 216.4+ KB
None
27677


In [10]:
print(test.isnull().sum())

enrollee_id                 0
city                        0
city_development_index      0
gender                    508
relevent_experience         0
enrolled_university        31
education_level            52
major_discipline          312
experience                  5
company_size              622
company_type              634
last_new_job               40
training_hours              0
dtype: int64


In [11]:
# Fill categorical missing values
categorical_cols = ["gender", "enrolled_university", "education_level", 
                    "major_discipline", "company_size", "company_type"]
for col in categorical_cols:
    test[col].fillna("Unknown", inplace=True)

# Convert 'experience' and 'last_new_job' to numeric
def convert_experience(x):
    if x == ">20":
        return 21
    elif x == "<1":
        return 0
    try:
        return float(x)
    except:
        return np.nan

test["experience"] = test["experience"].apply(convert_experience)
test["last_new_job"] = test["last_new_job"].apply(convert_experience)

# Fill numeric missing values with median
numeric_cols = ["experience", "last_new_job"]
for col in numeric_cols:
    test[col].fillna(test[col].median(), inplace=True)

# Verify
print(test.isnull().sum())


enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [12]:
# ---------------------------
# Columns
# ---------------------------
onehot_cols = ["gender", "relevent_experience", "enrolled_university", "major_discipline", "company_type"]
ordinal_cols = ["education_level", "company_size"]

# ---------------------------
# One-Hot Encoding (Nominal columns)
# ---------------------------
test_encoded = pd.get_dummies(test, columns=onehot_cols, drop_first=True)

# ---------------------------
# Ordinal Encoding (Columns with natural order)
# ---------------------------
education_order = ["Primary School", "High School", "Graduate", "Masters", "Phd"]
test_encoded["education_level"] = test["education_level"].map({k: i for i, k in enumerate(education_order)})

company_size_order = ["Unknown", "<10", "10-49", "50-99", "100-499", "500-999", 
                      "1000-4999", "5000-9999", "10000+"]
test_encoded["company_size"] = test["company_size"].map({k: i for i, k in enumerate(company_size_order)})

In [13]:
# ============================
# LOGISTIC REGRESSION (BASELINE)
# ============================

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ----------------------------
# Feature / Target split
# ----------------------------
X = train_encoded.drop(["target", "enrollee_id", "city"], axis=1)
y = train_encoded["target"]

# Align test columns (DO NOT generate predictions yet)
X_test = test_encoded.reindex(columns=X.columns, fill_value=0)

# Handle missing values
X = X.fillna(X.median())
X_test = X_test.fillna(X.median())  # avoid leakage

# ----------------------------
# Train / Validation split
# ----------------------------
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ----------------------------
# Train Logistic Regression
# ----------------------------
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_tr, y_tr)

# ----------------------------
# Validation predictions
# ----------------------------
log_val_preds = log_model.predict(X_val)

# ----------------------------
# Evaluation (THIS is what matters now)
# ----------------------------
print("Logistic Regression Performance (Validation Set)")
print("Accuracy :", accuracy_score(y_val, log_val_preds))
print("Precision:", precision_score(y_val, log_val_preds))
print("Recall   :", recall_score(y_val, log_val_preds))
print("F1 Score :", f1_score(y_val, log_val_preds))


Logistic Regression Performance (Validation Set)
Accuracy : 0.7716597077244259
Precision: 0.5854700854700855
Recall   : 0.2869109947643979
F1 Score : 0.38510189739985945


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# ============================
# MODEL 2: RANDOM FOREST
# ============================

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_tr, y_tr)

rf_val_preds = rf_model.predict(X_val)

print("Random Forest Performance (Validation Set)")
print("Accuracy :", accuracy_score(y_val, rf_val_preds))
print("Precision:", precision_score(y_val, rf_val_preds))
print("Recall   :", recall_score(y_val, rf_val_preds))
print("F1 Score :", f1_score(y_val, rf_val_preds))


Random Forest Performance (Validation Set)
Accuracy : 0.7794885177453027
Precision: 0.5654761904761905
Recall   : 0.4973821989528796
F1 Score : 0.5292479108635098


In [15]:
# ============================
# MODEL 3: XGBOOST
# ============================

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss",
    n_jobs=-1
)

# Train on training split
xgb_model.fit(X_tr, y_tr)

# Validation predictions
xgb_val_preds = xgb_model.predict(X_val)

# Evaluation
print("XGBoost Performance (Validation Set)")
print("Accuracy :", accuracy_score(y_val, xgb_val_preds))
print("Precision:", precision_score(y_val, xgb_val_preds))
print("Recall   :", recall_score(y_val, xgb_val_preds))
print("F1 Score :", f1_score(y_val, xgb_val_preds))


XGBoost Performance (Validation Set)
Accuracy : 0.7969728601252609
Precision: 0.5954692556634305
Recall   : 0.5780104712041885
F1 Score : 0.5866099893730075


In [16]:
# ============================
# FINAL MODEL (XGBOOST)
# ============================

# Retrain XGBoost on FULL training data
best_model = xgb_model
best_model.fit(X, y)


# ============================
# GENERATE FINAL PREDICTIONS
# ============================

final_test_preds = best_model.predict(X_test)

final_predictions = pd.DataFrame({
    "enrollee_id": test["enrollee_id"],
    "prediction": final_test_preds
})


# ============================
# SAVE OUTPUT
# ============================

import os
os.makedirs("outputs", exist_ok=True)

final_predictions.to_csv("outputs/final_predictions.csv", index=False)

print("✅ Final predictions saved to outputs/final_predictions.csv")
final_predictions.head()


✅ Final predictions saved to outputs/final_predictions.csv


Unnamed: 0,enrollee_id,prediction
0,32403,0
1,9858,0
2,31806,0
3,27385,0
4,27724,0
