<a href="https://colab.research.google.com/github/abakm/AL-ML_Assignment-1/blob/master/IntermediateCodingAssessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
# import statements
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [42]:
# Load Dataset
train_df = pd.read_csv('./train_LZdllcl.csv')
test_df =pd.read_csv('./test_2umaH9m.csv')
sub_df = pd.read_csv('./sample_submission_M0L0uXE.csv')


print(train_df.head())
print(test_df.head())

   employee_id         department     region         education gender  \
0        65438  Sales & Marketing   region_7  Master's & above      f   
1        65141         Operations  region_22        Bachelor's      m   
2         7513  Sales & Marketing  region_19        Bachelor's      m   
3         2542  Sales & Marketing  region_23        Bachelor's      m   
4        48945         Technology  region_26        Bachelor's      m   

  recruitment_channel  no_of_trainings  age  previous_year_rating  \
0            sourcing                1   35                   5.0   
1               other                1   30                   5.0   
2            sourcing                1   34                   3.0   
3               other                2   39                   1.0   
4               other                1   45                   3.0   

   length_of_service  KPIs_met >80%  awards_won?  avg_training_score  \
0                  8              1            0                  49   
1 

In [43]:
def create_map_dict(series, start_value=0):
    """Create a mapping dictionary from unique values to integers"""
    unique_values = series.dropna().unique()  # Remove NaN values
    return {value: idx + start_value for idx, value in enumerate(sorted(unique_values))}

In [44]:
# exploratory Analysis
print(f"Dataset Shape: {train_df.shape}")
print(f"Number of Rows: {train_df.shape[0]}")
print(f"Number of Columns: {train_df.shape[1]}")
print()

print("Column Names and Data Types:")
print(train_df.dtypes)
print()

print("First 5 rows:")
print(train_df.head())
print()

print("Last 5 rows:")
print(train_df.tail())
print()

print("Dataset Info:")
print(train_df.info())

Dataset Shape: (54808, 14)
Number of Rows: 54808
Number of Columns: 14

Column Names and Data Types:
employee_id               int64
department               object
region                   object
education                object
gender                   object
recruitment_channel      object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

First 5 rows:
   employee_id         department     region         education gender  \
0        65438  Sales & Marketing   region_7  Master's & above      f   
1        65141         Operations  region_22        Bachelor's      m   
2         7513  Sales & Marketing  region_19        Bachelor's      m   
3         2542  Sales & Marketing  region_23        Bachelor's      m   
4        48945         Technology  region_26        Bache

In [45]:
# creare mappings
combine_df = pd.concat([train_df, test_df], axis=0)
combine_df.dropna(inplace=True)
combine_df.drop_duplicates(inplace=True)

genders = create_map_dict(combine_df['gender'])
departments = create_map_dict(combine_df['department'])
regions = create_map_dict(combine_df['region'])
educations = create_map_dict(combine_df['education'])
recruitment_channels = create_map_dict(combine_df['recruitment_channel'])

In [46]:

# checking for null values
print(train_df.isnull().sum())


# Replace null values with mode values
train_df['education'] = train_df['education'].fillna(combine_df['education'].mode()[0])
train_df['previous_year_rating'] = train_df['previous_year_rating'].fillna(combine_df['previous_year_rating'].mode()[0])
train_df.drop_duplicates(inplace=True)



employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64


In [47]:

# checking for null values
print(test_df.isnull().sum())

# Replace null values with mode values
test_df['education'] = test_df['education'].fillna(combine_df['education'].mode()[0])
test_df['previous_year_rating'] = test_df['previous_year_rating'].fillna(combine_df['previous_year_rating'].mode()[0])
test_df.drop_duplicates(inplace=True)


employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64


In [48]:
# Maping



train_df['gender'] = train_df['gender'].map(genders)
train_df['department'] = train_df['department'].map(departments)
train_df['region'] = train_df['region'].map(regions)
train_df['education'] = train_df['education'].map(educations)
train_df['recruitment_channel'] = train_df['recruitment_channel'].map(recruitment_channels)


test_df['gender'] = test_df['gender'].map(genders)
test_df['department'] = test_df['department'].map(departments)
test_df['region'] = test_df['region'].map(regions)
test_df['education'] = test_df['education'].map(educations)
test_df['recruitment_channel'] = test_df['recruitment_channel'].map(recruitment_channels)


In [49]:
train_x = train_df.drop(["employee_id", "is_promoted"], axis=1)
train_y = train_df["is_promoted"]

train_x.shape

(54808, 12)

In [50]:
parameters = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 50, 60],
    "criterion": ["gini", "entropy"],
}


random_search = RandomizedSearchCV(RandomForestClassifier(), parameters, n_iter=18, cv=5, scoring="accuracy", verbose=1, n_jobs=1)
random_search.fit(train_x, train_y)

best_params = random_search.best_params_
best_score = random_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

n_estimators = best_params['n_estimators']
max_depth = best_params['max_depth']
criterion = best_params['criterion']

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'n_estimators': 200, 'max_depth': 60, 'criterion': 'entropy'}
Best Score: 0.9357028329579915


In [51]:
# create model
rc = RandomForestClassifier(criterion=criterion, max_depth=max_depth,n_estimators=n_estimators)

In [52]:
# fit the model
model = rc.fit(train_x, train_y)

In [53]:
# Predict the model
sub_rows = []
for employee_id in sub_df['employee_id']:
  row = test_df[test_df['employee_id'] == employee_id].drop(["employee_id"], axis=1)
  if row.empty:
    row = train_df[train_df['employee_id'] == employee_id].drop(["employee_id", "is_promoted"], axis=1)
  if row.empty:
    continue
  print(model.predict(row))
  sub_rows.append(dict(employee_id=employee_id, is_promoted=model.predict(row)[0]))
predict = pd.DataFrame(sub_rows)
predict.to_csv('submission.csv', index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]