# Import the libraries

In [19]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from joblib import dump, load

# Get the dataset and analyze it

In [6]:
dataset = pd.read_csv('train.csv')

In [7]:
dataset.head()

Unnamed: 0,Index,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title
0,1,Sara,Mcguire,Female,tsharp@example.net,(971)643-6089x9160,17-08-21,"Editor, commissioning"
1,2,Alisha,Hebert,Male,vincentgarrett@example.net,+1-114-355-1841x78347,28-06-69,Broadcast engineer
2,3,Gwendolyn,Sheppard,Male,mercadojonathan@example.com,9017807728,25-09-15,Industrial buyer
3,4,Kristine,Mccann,Female,lindsay55@example.com,+1-607-333-9911x59088,27-07-78,Multimedia specialist
4,5,Bobby,Pittman,Female,blevinsmorgan@example.com,3739847538,17-11-89,Planning and development surveyor


In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Index          10000 non-null  int64 
 1   First Name     10000 non-null  object
 2   Last Name      10000 non-null  object
 3   Sex            10000 non-null  object
 4   Email          10000 non-null  object
 5   Phone          10000 non-null  object
 6   Date of birth  10000 non-null  object
 7   Job Title      10000 non-null  object
dtypes: int64(1), object(7)
memory usage: 625.1+ KB


In [10]:
dataset.isnull().sum()

Index            0
First Name       0
Last Name        0
Sex              0
Email            0
Phone            0
Date of birth    0
Job Title        0
dtype: int64

# Clean the dataset

In [11]:
dataset = dataset.drop(['Index', 'First Name', 'Last Name', 'Email', 'Phone'], axis=1)
dataset.head()

Unnamed: 0,Sex,Date of birth,Job Title
0,Female,17-08-21,"Editor, commissioning"
1,Male,28-06-69,Broadcast engineer
2,Male,25-09-15,Industrial buyer
3,Female,27-07-78,Multimedia specialist
4,Female,17-11-89,Planning and development surveyor


## Clean the dataset

In [12]:
dataset['Date of birth'] = pd.to_datetime(dataset['Date of birth'], errors='coerce')
dataset['Age'] = 2025 - dataset['Date of birth'].dt.year
dataset.drop('Date of birth', axis=1, inplace=True)
dataset.head()

Unnamed: 0,Sex,Job Title,Age
0,Female,"Editor, commissioning",4
1,Male,Broadcast engineer,-44
2,Male,Industrial buyer,10
3,Female,Multimedia specialist,47
4,Female,Planning and development surveyor,36


In [14]:
dataset['Target'] = dataset['Job Title'].apply(lambda x: 1 if 'Manager' in x else 0)
dataset.head()

Unnamed: 0,Sex,Job Title,Age,Target
0,Female,"Editor, commissioning",4,0
1,Male,Broadcast engineer,-44,0
2,Male,Industrial buyer,10,0
3,Female,Multimedia specialist,47,0
4,Female,Planning and development surveyor,36,0


In [None]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

# Split the dataset into train and test splits

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
oh_encoder = OneHotEncoder(handle_unknown='ignore')
X[['Sex', 'Job Title']] = oh_encoder.fit_transform()

Unnamed: 0,Sex,Job Title
0,Female,"Editor, commissioning"
1,Male,Broadcast engineer
2,Male,Industrial buyer
3,Female,Multimedia specialist
4,Female,Planning and development surveyor
...,...,...
9995,Male,"Scientist, physiological"
9996,Female,Warehouse manager
9997,Female,Lawyer
9998,Male,Accounting technician
