In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

train = pd.read_csv("data/aug_train.csv")
print(train.info())
print(train.size)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [2]:
print(train.columns)
print(train.head())
print(train.tail())

Index(['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target'],
      dtype='object')
   enrollee_id      city  city_development_index gender  \
0         8949  city_103                   0.920   Male   
1        29725   city_40                   0.776   Male   
2        11561   city_21                   0.624    NaN   
3        33241  city_115                   0.789    NaN   
4          666  city_162                   0.767   Male   

       relevent_experience enrolled_university education_level  \
0  Has relevent experience       no_enrollment        Graduate   
1   No relevent experience       no_enrollment        Graduate   
2   No relevent experience    Full time course        Graduate   
3   No relevent experience                 NaN        Graduate   
4  Has relevent experience    

In [3]:
print(train.isnull().sum())

enrollee_id                  0
city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64


In [4]:
# Fill categorical missing values
categorical_cols = ["gender", "enrolled_university", "education_level", 
                    "major_discipline", "company_size", "company_type"]
for col in categorical_cols:
    train[col].fillna("Unknown", inplace=True)

# Convert 'experience' and 'last_new_job' to numeric
def convert_experience(x):
    if x == ">20":
        return 21
    elif x == "<1":
        return 0
    try:
        return float(x)
    except:
        return np.nan

train["experience"] = train["experience"].apply(convert_experience)
train["last_new_job"] = train["last_new_job"].apply(convert_experience)

# Fill numeric missing values with median
numeric_cols = ["experience", "last_new_job"]
for col in numeric_cols:
    train[col].fillna(train[col].median(), inplace=True)

# Verify
print(train.isnull().sum())

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [5]:
categorical_cols = ["gender", "enrolled_university", "education_level", 
                    "major_discipline", "company_size", "company_type", "relevent_experience", "city"]

for col in categorical_cols:
    print(f"Unique values in {col}:")
    print(train[col].value_counts(dropna=False))
    print("-"*40)

Unique values in gender:
gender
Male       13221
Unknown     4508
Female      1238
Other        191
Name: count, dtype: int64
----------------------------------------
Unique values in enrolled_university:
enrolled_university
no_enrollment       13817
Full time course     3757
Part time course     1198
Unknown               386
Name: count, dtype: int64
----------------------------------------
Unique values in education_level:
education_level
Graduate          11598
Masters            4361
High School        2017
Unknown             460
Phd                 414
Primary School      308
Name: count, dtype: int64
----------------------------------------
Unique values in major_discipline:
major_discipline
STEM               14492
Unknown             2813
Humanities           669
Other                381
Business Degree      327
Arts                 253
No Major             223
Name: count, dtype: int64
----------------------------------------
Unique values in company_size:
company_size
Unkno

In [6]:
duplicates = train.duplicated().sum()
print(f"Number of exact duplicate rows: {duplicates}")

Number of exact duplicate rows: 0


In [7]:
# ---------------------------
# Columns
# ---------------------------
onehot_cols = ["gender", "relevent_experience", "enrolled_university", "major_discipline", "company_type"]
ordinal_cols = ["education_level", "company_size"]

# ---------------------------
# One-Hot Encoding (Nominal columns)
# ---------------------------
train_encoded = pd.get_dummies(train, columns=onehot_cols, drop_first=True)

# ---------------------------
# Ordinal Encoding (Columns with natural order)
# ---------------------------
education_order = ["Primary School", "High School", "Graduate", "Masters", "Phd"]
train_encoded["education_level"] = train["education_level"].map({k: i for i, k in enumerate(education_order)})

company_size_order = ["Unknown", "<10", "10-49", "50-99", "100-499", "500-999", 
                      "1000-4999", "5000-9999", "10000+"]
train_encoded["company_size"] = train["company_size"].map({k: i for i, k in enumerate(company_size_order)})

In [None]:
#aug_test.csv
test = pd.read_csv("data/aug_test.csv")
print(test.info())
print(test.size)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             2129 non-null   int64  
 1   city                    2129 non-null   object 
 2   city_development_index  2129 non-null   float64
 3   gender                  1621 non-null   object 
 4   relevent_experience     2129 non-null   object 
 5   enrolled_university     2098 non-null   object 
 6   education_level         2077 non-null   object 
 7   major_discipline        1817 non-null   object 
 8   experience              2124 non-null   object 
 9   company_size            1507 non-null   object 
 10  company_type            1495 non-null   object 
 11  last_new_job            2089 non-null   object 
 12  training_hours          2129 non-null   int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 216.4+ KB
None
27677


In [9]:
print(test.isnull().sum())

enrollee_id                 0
city                        0
city_development_index      0
gender                    508
relevent_experience         0
enrolled_university        31
education_level            52
major_discipline          312
experience                  5
company_size              622
company_type              634
last_new_job               40
training_hours              0
dtype: int64


In [10]:
# Fill categorical missing values
categorical_cols = ["gender", "enrolled_university", "education_level", 
                    "major_discipline", "company_size", "company_type"]
for col in categorical_cols:
    test[col].fillna("Unknown", inplace=True)

# Convert 'experience' and 'last_new_job' to numeric
def convert_experience(x):
    if x == ">20":
        return 21
    elif x == "<1":
        return 0
    try:
        return float(x)
    except:
        return np.nan

test["experience"] = test["experience"].apply(convert_experience)
test["last_new_job"] = test["last_new_job"].apply(convert_experience)

# Fill numeric missing values with median
numeric_cols = ["experience", "last_new_job"]
for col in numeric_cols:
    test[col].fillna(test[col].median(), inplace=True)

# Verify
print(test.isnull().sum())


enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [11]:
# ---------------------------
# Columns
# ---------------------------
onehot_cols = ["gender", "relevent_experience", "enrolled_university", "major_discipline", "company_type"]
ordinal_cols = ["education_level", "company_size"]

# ---------------------------
# One-Hot Encoding (Nominal columns)
# ---------------------------
test_encoded = pd.get_dummies(test, columns=onehot_cols, drop_first=True)

# ---------------------------
# Ordinal Encoding (Columns with natural order)
# ---------------------------
education_order = ["Primary School", "High School", "Graduate", "Masters", "Phd"]
test_encoded["education_level"] = test["education_level"].map({k: i for i, k in enumerate(education_order)})

company_size_order = ["Unknown", "<10", "10-49", "50-99", "100-499", "500-999", 
                      "1000-4999", "5000-9999", "10000+"]
test_encoded["company_size"] = test["company_size"].map({k: i for i, k in enumerate(company_size_order)})

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

X_train = train_encoded.drop(["target", "enrollee_id", "city"], axis=1)
Y_train = train_encoded["target"]

#Align the columns of test_encoded dataframe with train_encoded dataframe
X_test = test_encoded.reindex(columns=X_train.columns, fill_value=0)

X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_train.median())  # use train median to avoid data leakage

# Start Logistics Regression
model = LogisticRegression(max_iter=1000,random_state=42)
model.fit(X_train,Y_train)

#Predict the test dataset
Y_pred = model.predict(X_test)

# ---------------------------
# 4. (Optional) Check performance on train data
# ---------------------------
train_pred = model.predict(X_train)
print("Training Accuracy:", accuracy_score(Y_train, train_pred))
print("\nClassification Report:\n", classification_report(Y_train, train_pred))

# ---------------------------
# 5. Save predictions to CSV
# ---------------------------
final_predictions = pd.DataFrame({
    "id": test["enrollee_id"],   # Assuming enrollee_id uniquely identifies rows
    "prediction": Y_pred
})

# Save to CSV
final_predictions.to_csv("final_predictions.csv", index=False)

print("✅ Final predictions saved to final_predictions.csv")
print(final_predictions.head())


Training Accuracy: 0.7691303893934649

Classification Report:
               precision    recall  f1-score   support

         0.0       0.79      0.94      0.86     14381
         1.0       0.58      0.26      0.36      4777

    accuracy                           0.77     19158
   macro avg       0.69      0.60      0.61     19158
weighted avg       0.74      0.77      0.74     19158

✅ Final predictions saved to final_predictions.csv
      id  prediction
0  32403         0.0
1   9858         0.0
2  31806         0.0
3  27385         0.0
4  27724         0.0


In [None]:
print(train_encoded.columns)
print(Y_train.shape)
print(Y_train.unique())


Index(['enrollee_id', 'city', 'city_development_index', 'education_level',
       'experience', 'company_size', 'last_new_job', 'training_hours',
       'target', 'gender_Male', 'gender_Other', 'gender_Unknown',
       'relevent_experience_No relevent experience',
       'enrolled_university_Part time course', 'enrolled_university_Unknown',
       'enrolled_university_no_enrollment', 'major_discipline_Business Degree',
       'major_discipline_Humanities', 'major_discipline_No Major',
       'major_discipline_Other', 'major_discipline_STEM',
       'major_discipline_Unknown', 'company_type_Funded Startup',
       'company_type_NGO', 'company_type_Other', 'company_type_Public Sector',
       'company_type_Pvt Ltd', 'company_type_Unknown'],
      dtype='object')
(19158,)
[1. 0.]
