In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load the HR attrition dataset from the notebook's working directory.
df = pd.read_csv("IBM-HR-Analytics-Employee-Attrition-and-Performance-Revised.csv")

# Binary indicators: 1 where the source column equals 'Yes', 0 for anything else.
# They are derived before the text columns are replaced by integer codes below.
df['AttritionFlag'] = (df['Attrition'] == 'Yes').astype('int64')
df['OverTimeFlag'] = (df['OverTime'] == 'Yes').astype('int64')

# Fit one LabelEncoder per object-dtype column and keep it in `label_encoders`
# so the integer codes can be mapped back to the original labels later.
label_cols = df.select_dtypes(include='object').columns
label_encoders = {}
for col in label_cols:
    encoder = LabelEncoder().fit(df[col])
    label_encoders[col] = encoder
    df[col] = encoder.transform(df[col])

# Rescale every numeric column (including the encoded ones and the two flags)
# to the [0, 1] range; the fitted scaler is kept for later inverse transforms.
scaler = MinMaxScaler()
scaled_cols = df.select_dtypes(include=np.number).columns
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])


In [None]:
# Performance proxy: row-wise mean of job involvement and performance rating.
performance_inputs = df[['JobInvolvement', 'PerformanceRating']]
df['PerformanceScore'] = performance_inputs.mean(axis=1)

# Potential proxy: row-wise mean of total working years and education level.
potential_inputs = df[['TotalWorkingYears', 'Education']]
df['PotentialScore'] = potential_inputs.mean(axis=1)

def odiorne_category(row):
    """Return the category label for one row of scores.

    Parameters
    ----------
    row : mapping
        Must provide 'PerformanceScore' and 'PotentialScore' entries that can
        be compared against 0.5 (a DataFrame row or a plain dict both work).

    Returns
    -------
    str
        'Estrella' when both scores are at least 0.5, 'Empleado al límite'
        when only the performance score is, 'Empleado problema' when only the
        potential score is, and 'Prescindible' otherwise.
    """
    # The 0.5 cut-off is inclusive on both axes.
    high_performance = row['PerformanceScore'] >= 0.5
    high_potential = row['PotentialScore'] >= 0.5

    if high_performance:
        return 'Estrella' if high_potential else 'Empleado al límite'
    return 'Empleado problema' if high_potential else 'Prescindible'

# Label every row by passing it to odiorne_category (axis=1 applies the function row by row).
df['OdiorneCategory'] = df.apply(odiorne_category, axis=1)

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Columns fed to both the PCA projection and the clustering step.
attitudinal_vars = [
    'JobInvolvement',
    'JobSatisfaction',
    'EnvironmentSatisfaction',
    'WorkLifeBalance',
    'Education',
    'YearsAtCompany',
    'RelationshipSatisfaction',
    'TrainingTimesLastYear',
]
X_att = df[attitudinal_vars]

# Two-component projection of the selected columns.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_att)

# Four clusters fitted on the selected columns themselves (not on the PCA
# projection); the per-row cluster label is stored on the frame.
kmeans = KMeans(n_clusters=4, random_state=42)
df['Cluster'] = kmeans.fit(X_att).labels_

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score

# Columns that must not be used as predictors: the target in both encodings,
# the text category label, and the two columns written by this cell and the
# risk-level cell (they only exist when the notebook is re-run on the same
# kernel; errors='ignore' keeps the first run working).
non_feature_cols = [
    'Attrition', 'AttritionFlag', 'OdiorneCategory',
    'AttritionScore', 'RiesgoRotacion',
]
features = df.drop(columns=non_feature_cols, errors='ignore')
X = features
y = df['AttritionFlag']

# Hold out the test set BEFORE balancing. Upsampling first would place
# bootstrap copies of the same minority rows on both sides of the split, so
# the model would be evaluated on rows it was trained on.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Split the training portion by class (0 = majority, 1 = minority).
X_maj = X_train_raw[y_train_raw == 0]
y_maj = y_train_raw[y_train_raw == 0]
X_min = X_train_raw[y_train_raw == 1]
y_min = y_train_raw[y_train_raw == 1]

# Sample the minority class with replacement until it matches the majority
# class size; only training rows are duplicated, the test set keeps its
# original class proportions.
X_min_upsampled, y_min_upsampled = resample(X_min, y_min, replace=True, n_samples=len(y_maj), random_state=42)
X_balanced = pd.concat([X_maj, X_min_upsampled])
y_balanced = pd.concat([y_maj, y_min_upsampled])

X_train, y_train = X_balanced, y_balanced

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions on the untouched hold-out rows.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Probability of the positive class for every row of the frame.
# NOTE(review): this includes the rows the model was trained on, so their
# scores are optimistic compared with the hold-out rows.
df['AttritionScore'] = model.predict_proba(X)[:, 1]

In [None]:
def score_level(score):
    """Bucket a numeric score into a risk label.

    Returns 'Alto' for scores of at least 0.70, 'Medio' for scores of at
    least 0.40, and 'Bajo' for everything else.
    """
    # Thresholds are checked from highest to lowest; both bounds are inclusive.
    for threshold, label in ((0.70, 'Alto'), (0.40, 'Medio')):
        if score >= threshold:
            return label
    return 'Bajo'

# Convert each attrition score into its 'Alto' / 'Medio' / 'Bajo' label via score_level.
df['RiesgoRotacion'] = df['AttritionScore'].apply(score_level)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# The preprocessing cell min-max scaled every numeric column in place, so
# plotting `df` directly would show income and age on a 0-1 axis. Build a
# plotting copy with those columns mapped back to their original units using
# the scaler fitted in that cell.
df_plot = df.copy()
df_plot[scaled_cols] = scaler.inverse_transform(df[scaled_cols])

# Correlation heatmap over all numeric columns. Correlations do not change
# under min-max scaling, so the scaled frame is used as is.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(numeric_only=True), cmap='coolwarm', ax=ax)
ax.set_title("Mapa de Calor de Correlaciones")
plt.show()

# One boxplot per variable, split by category, all with the same layout.
boxplot_specs = [
    ('MonthlyIncome', "Ingreso mensual por categoría Odiorne"),
    ('Age', "Edad por categoría Odiorne"),
]
for column, title in boxplot_specs:
    fig, ax = plt.subplots()
    sns.boxplot(x='OdiorneCategory', y=column, data=df_plot, ax=ax)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)
    plt.show()

# Mean of the 0/1 overtime flag per category, i.e. the share of rows with overtime.
fig, ax = plt.subplots()
sns.barplot(x='OdiorneCategory', y='OverTimeFlag', data=df_plot, ax=ax)
ax.set_title("% de horas extra por categoría Odiorne")
ax.tick_params(axis='x', rotation=45)
plt.show()