In [1]:
# Import dependencies and load the dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from clean_data import clean_data


# Read the dataset from the working directory and preview its first rows
df = pd.read_csv(filepath_or_buffer='./dataset.csv')
df.head(n=5)

Unnamed: 0,ID,Gender,Own_car,Own_property,Work_phone,Phone,Email,Unemployed,Num_children,Num_family,Account_length,Total_income,Age,Years_employed,Income_type,Education_type,Family_status,Housing_type,Occupation_type,Target
0,5008804,1,1,1,1,0,0,0,0,2,15,427500.0,32.868574,12.435574,Working,Higher education,Civil marriage,Rented apartment,Other,1
1,5008806,1,1,1,0,0,0,0,0,2,29,112500.0,58.793815,3.104787,Working,Secondary / secondary special,Married,House / apartment,Security staff,0
2,5008808,0,0,1,0,1,1,0,0,1,4,270000.0,52.321403,8.353354,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,Sales staff,0
3,5008812,0,0,1,0,0,0,1,0,1,20,283500.0,61.504343,0.0,Pensioner,Higher education,Separated,House / apartment,Other,0
4,5008815,1,1,1,1,1,1,0,0,2,5,270000.0,46.193967,2.10545,Working,Higher education,Married,House / apartment,Accountants,0


In [2]:
# Count how many rows fall into each occupation category
df.loc[:, 'Occupation_type'].value_counts()

Occupation_type
Other                    2994
Laborers                 1724
Sales staff               959
Core staff                877
Managers                  782
Drivers                   623
High skill tech staff     357
Accountants               300
Medicine staff            291
Cooking staff             193
Security staff            182
Cleaning staff            146
Private service staff      86
Low-skill Laborers         53
Secretaries                46
Waiters/barmen staff       40
HR staff                   22
IT staff                   18
Realty agents              16
Name: count, dtype: int64

In [3]:
# Categorical (string) columns that are replaced by integer codes below
columns_to_encode = ["Income_type",
                     "Education_type",
                     "Family_status",
                     "Housing_type",
                     "Occupation_type"]

# LabelEncoder
# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# NOTE(review): the scikit-learn docs describe LabelEncoder as a target (y)
# encoder; the integer codes follow sorted label order and carry no real
# ranking — confirm the downstream model tolerates that.

# Fit a separate encoder per column and keep every one of them. A single
# encoder re-fitted inside the loop only remembers the classes of the last
# column, which makes the other columns impossible to decode afterwards.
label_encoders = {}
for column in columns_to_encode:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Keep `le` available for code that uses it: as before, it ends up being the
# encoder fitted on the last column in `columns_to_encode`.
le = label_encoders[columns_to_encode[-1]]

df.head(20)

Unnamed: 0,ID,Gender,Own_car,Own_property,Work_phone,Phone,Email,Unemployed,Num_children,Num_family,Account_length,Total_income,Age,Years_employed,Income_type,Education_type,Family_status,Housing_type,Occupation_type,Target
0,5008804,1,1,1,1,0,0,0,0,2,15,427500.0,32.868574,12.435574,4,1,0,4,12,1
1,5008806,1,1,1,0,0,0,0,0,2,29,112500.0,58.793815,3.104787,4,4,1,1,17,0
2,5008808,0,0,1,0,1,1,0,0,1,4,270000.0,52.321403,8.353354,0,4,3,1,15,0
3,5008812,0,0,1,0,0,0,1,0,1,20,283500.0,61.504343,0.0,1,1,2,1,12,0
4,5008815,1,1,1,1,1,1,0,0,2,5,270000.0,46.193967,2.10545,4,1,1,1,0,0
5,5008819,1,1,1,0,0,0,0,0,2,17,135000.0,48.674511,3.269061,0,4,1,1,8,0
6,5008825,0,1,0,0,0,0,0,0,2,25,130500.0,29.21073,3.019911,4,2,1,1,0,1
7,5008830,0,0,1,0,1,0,0,0,2,31,157500.0,27.463945,4.021985,4,4,1,1,8,1
8,5008834,0,0,1,0,0,0,0,1,2,44,112500.0,30.029364,4.435409,4,4,3,1,12,0
9,5008836,1,1,1,0,0,0,0,3,5,24,270000.0,34.741302,3.184186,4,4,1,1,8,0


In [4]:
# Pair every original label with its integer code. `le` was last fitted on
# the final entry of `columns_to_encode`, so this mapping describes only that column.
label_mapping = {label: code for label, code in zip(le.classes_, le.transform(le.classes_))}

In [5]:
# Display the label -> integer code mapping
label_mapping

{'Accountants': 0,
 'Cleaning staff': 1,
 'Cooking staff': 2,
 'Core staff': 3,
 'Drivers': 4,
 'HR staff': 5,
 'High skill tech staff': 6,
 'IT staff': 7,
 'Laborers': 8,
 'Low-skill Laborers': 9,
 'Managers': 10,
 'Medicine staff': 11,
 'Other': 12,
 'Private service staff': 13,
 'Realty agents': 14,
 'Sales staff': 15,
 'Secretaries': 16,
 'Security staff': 17,
 'Waiters/barmen staff': 18}

In [6]:

# Standardise the continuous features (per-column zero mean, unit variance).
# The fitted scaler is kept in `scaler` so the same transform can be applied
# to new data or undone with `scaler.inverse_transform`.
# NOTE(review): the scaler is fitted on every row; if a train/test split
# follows, fit on the training rows only so test statistics do not leak in.
scaler = StandardScaler()
scaled = scaler.fit_transform(df[['Total_income','Age','Years_employed']])
scaled

array([[ 2.48077331, -0.93895577,  1.06763396],
       [-0.69232071,  1.29114011, -0.40365469],
       [ 0.8942263 ,  0.73438157,  0.42394517],
       ...,
       [-0.91897028,  0.73226193, -0.15023731],
       [-0.23902156, -0.84898886, -0.32119691],
       [-0.69232071, -1.60240281, -0.37818344]])

In [7]:
# Drop the unique ID column and swap the raw continuous columns for their
# standardised versions (which end up as the last three columns).
scaled_columns = ['Total_income', 'Age', 'Years_employed']

# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# 0..n-1 index on scaled_df would misalign rows (and introduce NaNs) whenever
# df does not carry the default RangeIndex.
scaled_df = pd.DataFrame(scaled, columns=scaled_columns, index=df.index)
df = df.drop(columns=['ID'] + scaled_columns)
df = pd.concat([df, scaled_df], axis=1)
df.head()

Unnamed: 0,Gender,Own_car,Own_property,Work_phone,Phone,Email,Unemployed,Num_children,Num_family,Account_length,Income_type,Education_type,Family_status,Housing_type,Occupation_type,Target,Total_income,Age,Years_employed
0,1,1,1,1,0,0,0,0,2,15,4,1,0,4,12,1,2.480773,-0.938956,1.067634
1,1,1,1,0,0,0,0,0,2,29,4,4,1,1,17,0,-0.692321,1.29114,-0.403655
2,0,0,1,0,1,1,0,0,1,4,0,4,3,1,15,0,0.894226,0.734382,0.423945
3,0,0,1,0,0,0,1,0,1,20,1,1,2,1,12,0,1.030216,1.5243,-0.893221
4,1,1,1,1,1,1,0,0,2,5,4,1,1,1,0,0,0.894226,0.207298,-0.561231
