## Exploración Inicial

In [35]:

import pandas as pd

# Initial data exploration: load the dataset and print three plain-text summaries.
df = pd.read_csv('employee-attrition-dataset.csv')

# Missing-value count per column, sorted ascending.
print(df.isna().sum().sort_values())
# Data type of each column.
print(df.dtypes)
# Summary statistics (describe() covers the numeric columns by default).
print(df.describe())

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

## Preprocesamiento

In [39]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Column groups by dtype, captured before any encoding is applied.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column. dtype=int makes the new indicator columns
# integers directly, so the pre-existing numeric columns keep their own dtype
# instead of being cast to int along with the rest of the frame (a blanket
# cast would truncate float values and fail on NaN).
df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=int)

# Statistical summary of the encoded dataset.
print(df.describe())

# Features are every column except the encoded target indicator.
X = df.drop('Attrition_Yes', axis=1)
y = df['Attrition_Yes']

# 70/30 hold-out split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# target classes turn out to be imbalanced (this would change the split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize every feature column (the one-hot indicators included).
# The scaler is fit on the training split only and then reused on the test
# split, so no test-set statistics enter the fit.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# First rows of the encoded (unscaled) DataFrame.
print(df.head())

               Age    DailyRate  DistanceFromHome    Education  EmployeeCount  \
count  1470.000000  1470.000000       1470.000000  1470.000000         1470.0   
mean     36.923810   802.485714          9.192517     2.912925            1.0   
std       9.135373   403.509100          8.106864     1.024165            0.0   
min      18.000000   102.000000          1.000000     1.000000            1.0   
25%      30.000000   465.000000          2.000000     2.000000            1.0   
50%      36.000000   802.000000          7.000000     3.000000            1.0   
75%      43.000000  1157.000000         14.000000     4.000000            1.0   
max      60.000000  1499.000000         29.000000     5.000000            1.0   

       EmployeeNumber  EnvironmentSatisfaction   HourlyRate  JobInvolvement  \
count     1470.000000              1470.000000  1470.000000     1470.000000   
mean      1024.865306                 2.721769    65.891156        2.729932   
std        602.024335            