In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample


In [23]:
df = pd.read_csv("employee.csv")
print(df)


    EmployeeID   Age  Gender Department    Salary  Experience       City  \
0            1  25.0    Male         IT   50000.0           2  Bangalore   
1            2  30.0  Female         HR   60000.0           5     Mumbai   
2            3  35.0    Male    Finance   75000.0           8      Delhi   
3            4   NaN  Female         IT   55000.0           3  Bangalore   
4            5  40.0    Male    Finance  120000.0          15     Mumbai   
5            6  28.0  Female         HR   58000.0           4        NaN   
6            7  45.0    Male         IT  200000.0          20      Delhi   
7            7  45.0    Male         IT  200000.0          20      Delhi   
8            8  23.0  Female         HR   45000.0           1  Bangalore   
9            9  38.0    Male    Finance       NaN          10     Mumbai   
10          10  29.0  Female         IT   62000.0           4      Delhi   

   Attrition  
0        Yes  
1         No  
2         No  
3        Yes  
4         No

1. Missing VAlue

In [24]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())
df['City'] = df['City'].fillna(df['City'].mode()[0])


In [25]:
df

Unnamed: 0,EmployeeID,Age,Gender,Department,Salary,Experience,City,Attrition
0,1,25.0,Male,IT,50000.0,2,Bangalore,Yes
1,2,30.0,Female,HR,60000.0,5,Mumbai,No
2,3,35.0,Male,Finance,75000.0,8,Delhi,No
3,4,33.8,Female,IT,55000.0,3,Bangalore,Yes
4,5,40.0,Male,Finance,120000.0,15,Mumbai,No
5,6,28.0,Female,HR,58000.0,4,Delhi,Yes
6,7,45.0,Male,IT,200000.0,20,Delhi,No
7,7,45.0,Male,IT,200000.0,20,Delhi,No
8,8,23.0,Female,HR,45000.0,1,Bangalore,Yes
9,9,38.0,Male,Finance,92500.0,10,Mumbai,No


2. Remove duplicates

In [26]:
df.drop_duplicates(inplace=True)


In [27]:
df

Unnamed: 0,EmployeeID,Age,Gender,Department,Salary,Experience,City,Attrition
0,1,25.0,Male,IT,50000.0,2,Bangalore,Yes
1,2,30.0,Female,HR,60000.0,5,Mumbai,No
2,3,35.0,Male,Finance,75000.0,8,Delhi,No
3,4,33.8,Female,IT,55000.0,3,Bangalore,Yes
4,5,40.0,Male,Finance,120000.0,15,Mumbai,No
5,6,28.0,Female,HR,58000.0,4,Delhi,Yes
6,7,45.0,Male,IT,200000.0,20,Delhi,No
8,8,23.0,Female,HR,45000.0,1,Bangalore,Yes
9,9,38.0,Male,Finance,92500.0,10,Mumbai,No
10,10,29.0,Female,IT,62000.0,4,Delhi,Yes


3. Handling Outliers (IQR Method)

In [28]:
Q1 = df['Salary'].quantile(0.25)
Q3 = df['Salary'].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

df = df[(df['Salary'] >= lower) & (df['Salary'] <= upper)]


In [29]:
df

Unnamed: 0,EmployeeID,Age,Gender,Department,Salary,Experience,City,Attrition
0,1,25.0,Male,IT,50000.0,2,Bangalore,Yes
1,2,30.0,Female,HR,60000.0,5,Mumbai,No
2,3,35.0,Male,Finance,75000.0,8,Delhi,No
3,4,33.8,Female,IT,55000.0,3,Bangalore,Yes
4,5,40.0,Male,Finance,120000.0,15,Mumbai,No
5,6,28.0,Female,HR,58000.0,4,Delhi,Yes
8,8,23.0,Female,HR,45000.0,1,Bangalore,Yes
9,9,38.0,Male,Finance,92500.0,10,Mumbai,No
10,10,29.0,Female,IT,62000.0,4,Delhi,Yes


4. Encoding Categorical Variables

In [31]:
#Label Encoding (Binary)
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df = df.copy()

for col in ['Gender', 'Attrition']:
    df.loc[:, col] = le.fit_transform(df[col])


In [32]:
df

Unnamed: 0,EmployeeID,Age,Gender,Department,Salary,Experience,City,Attrition
0,1,25.0,1,IT,50000.0,2,Bangalore,1
1,2,30.0,0,HR,60000.0,5,Mumbai,0
2,3,35.0,1,Finance,75000.0,8,Delhi,0
3,4,33.8,0,IT,55000.0,3,Bangalore,1
4,5,40.0,1,Finance,120000.0,15,Mumbai,0
5,6,28.0,0,HR,58000.0,4,Delhi,1
8,8,23.0,0,HR,45000.0,1,Bangalore,1
9,9,38.0,1,Finance,92500.0,10,Mumbai,0
10,10,29.0,0,IT,62000.0,4,Delhi,1


In [33]:
#One-Hot Encoding
df = pd.get_dummies(df, columns=['Department', 'City'], drop_first=True)


In [34]:
df

Unnamed: 0,EmployeeID,Age,Gender,Salary,Experience,Attrition,Department_HR,Department_IT,City_Delhi,City_Mumbai
0,1,25.0,1,50000.0,2,1,False,True,False,False
1,2,30.0,0,60000.0,5,0,True,False,False,True
2,3,35.0,1,75000.0,8,0,False,False,True,False
3,4,33.8,0,55000.0,3,1,False,True,False,False
4,5,40.0,1,120000.0,15,0,False,False,False,True
5,6,28.0,0,58000.0,4,1,True,False,True,False
8,8,23.0,0,45000.0,1,1,True,False,False,False
9,9,38.0,1,92500.0,10,0,False,False,False,True
10,10,29.0,0,62000.0,4,1,False,True,True,False


5. Scaling Numerical Features

In [35]:
df['Age']

Unnamed: 0,Age
0,25.0
1,30.0
2,35.0
3,33.8
4,40.0
5,28.0
8,23.0
9,38.0
10,29.0


In [36]:
df[['Age', 'Salary', 'Experience']].info()



<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, 0 to 10
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         9 non-null      float64
 1   Salary      9 non-null      float64
 2   Experience  9 non-null      int64  
dtypes: float64(2), int64(1)
memory usage: 288.0 bytes


In [37]:
df[['Age', 'Salary', 'Experience']].isnull().sum()


Unnamed: 0,0
Age,0
Salary,0
Experience,0


In [38]:
scaler = StandardScaler()

df[['Age', 'Salary', 'Experience']] = scaler.fit_transform(
    df[['Age', 'Salary', 'Experience']]
)


In [39]:
df

Unnamed: 0,EmployeeID,Age,Gender,Salary,Experience,Attrition,Department_HR,Department_IT,City_Delhi,City_Mumbai
0,1,-1.158769,1,-0.825745,-0.897226,1,False,True,False,False
1,2,-0.24073,0,-0.382061,-0.184723,0,True,False,False,True
2,3,0.677308,1,0.283465,0.52778,0,False,False,True,False
3,4,0.456979,0,-0.603903,-0.659725,1,False,True,False,False
4,5,1.595347,1,2.280041,2.190286,0,False,False,False,True
5,6,-0.607946,0,-0.470798,-0.422224,1,True,False,True,False
8,8,-1.525984,0,-1.047586,-1.134727,1,True,False,False,False
9,9,1.228132,1,1.059911,1.002782,0,False,False,False,True
10,10,-0.424338,0,-0.293324,-0.422224,1,False,True,True,False


6.Converting Data Types

In [40]:
df['EmployeeID'] = df['EmployeeID'].astype(str)
df['Experience'] = df['Experience'].astype(int)


In [41]:
df

Unnamed: 0,EmployeeID,Age,Gender,Salary,Experience,Attrition,Department_HR,Department_IT,City_Delhi,City_Mumbai
0,1,-1.158769,1,-0.825745,0,1,False,True,False,False
1,2,-0.24073,0,-0.382061,0,0,True,False,False,True
2,3,0.677308,1,0.283465,0,0,False,False,True,False
3,4,0.456979,0,-0.603903,0,1,False,True,False,False
4,5,1.595347,1,2.280041,2,0,False,False,False,True
5,6,-0.607946,0,-0.470798,0,1,True,False,True,False
8,8,-1.525984,0,-1.047586,-1,1,True,False,False,False
9,9,1.228132,1,1.059911,1,0,False,False,False,True
10,10,-0.424338,0,-0.293324,0,1,False,True,True,False


7. Feature Extraction (Sklearn)
Example: Text Feature Extraction

In [42]:
text_data = ["IT Department", "HR Team", "Finance Team", "IT Support"]

vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(text_data)

print(vectorizer.get_feature_names_out())
print(X_text.toarray())


['department' 'finance' 'hr' 'it' 'support' 'team']
[[1 0 0 1 0 0]
 [0 0 1 0 0 1]
 [0 1 0 0 0 1]
 [0 0 0 1 1 0]]


8. Removing Unnecessary Columns

In [43]:
df.drop(columns=['EmployeeID'], inplace=True)


In [44]:
df

Unnamed: 0,Age,Gender,Salary,Experience,Attrition,Department_HR,Department_IT,City_Delhi,City_Mumbai
0,-1.158769,1,-0.825745,0,1,False,True,False,False
1,-0.24073,0,-0.382061,0,0,True,False,False,True
2,0.677308,1,0.283465,0,0,False,False,True,False
3,0.456979,0,-0.603903,0,1,False,True,False,False
4,1.595347,1,2.280041,2,0,False,False,False,True
5,-0.607946,0,-0.470798,0,1,True,False,True,False
8,-1.525984,0,-1.047586,-1,1,True,False,False,False
9,1.228132,1,1.059911,1,0,False,False,False,True
10,-0.424338,0,-0.293324,0,1,False,True,True,False


9. Handling Imbalanced Data (Sklearn)
Check imbalance

In [45]:
print(df['Attrition'].value_counts())


Attrition
1    5
0    4
Name: count, dtype: int64


Upsampling Minority Class

In [46]:
majority = df[df['Attrition'] == 0]
minority = df[df['Attrition'] == 1]

minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=len(majority),
    random_state=42
)

df_balanced = pd.concat([majority, minority_upsampled])


In [47]:
df_balanced

Unnamed: 0,Age,Gender,Salary,Experience,Attrition,Department_HR,Department_IT,City_Delhi,City_Mumbai
1,-0.24073,0,-0.382061,0,0,True,False,False,True
2,0.677308,1,0.283465,0,0,False,False,True,False
4,1.595347,1,2.280041,2,0,False,False,False,True
9,1.228132,1,1.059911,1,0,False,False,False,True
8,-1.525984,0,-1.047586,-1,1,True,False,False,False
10,-0.424338,0,-0.293324,0,1,False,True,True,False
5,-0.607946,0,-0.470798,0,1,True,False,True,False
10,-0.424338,0,-0.293324,0,1,False,True,True,False


In [48]:
print(df_balanced['Attrition'].value_counts())


Attrition
0    4
1    4
Name: count, dtype: int64
