# Final Project (Preprocessing)

In [9]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [10]:
# Load the raw salary table; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv("Salary_Data.csv")
# Preview the first rows to confirm the columns parsed as expected.
data.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


**Data Preprocessing For Visuals**

In [11]:
def column_encoder(data, columns):
    """Clean the salary frame and return the result as a new DataFrame.

    Despite its name this function performs no encoding. It:

    1. collapses spelling variants in ``Education Level`` (if present) to
       ``"Bachelor's"``, ``"Master's"`` and ``"PhD"``;
    2. drops the ``Job Title`` column (if present);
    3. drops every row that has a missing value in any of ``columns`` or in
       ``Age``, ``Years of Experience`` or ``Salary``;
    4. casts ``Age``, ``Years of Experience`` and ``Salary`` to ``int``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is left unmodified.
    columns : iterable of str
        Additional column labels in which missing values are not allowed.
        Any iterable is accepted (list, tuple, ...); duplicates are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaning steps applied.

    Raises
    ------
    KeyError
        If a label in ``columns``, or one of ``Age``, ``Years of Experience``
        or ``Salary``, is not a column of ``data``.
    """
    # Work on a copy so the column assignment below never writes onto the
    # frame owned by the caller.
    data = data.copy()

    if 'Education Level' in data.columns:
        data['Education Level'] = data['Education Level'].replace({
            "Bachelor's Degree": "Bachelor's",
            "Master's Degree": "Master's",
            "Phd": "PhD",
            "phD": "PhD",
        })

    if 'Job Title' in data.columns:
        data = data.drop(columns=['Job Title'])

    numeric_columns = ['Age', 'Years of Experience', 'Salary']

    # dict.fromkeys de-duplicates while keeping order (callers may already
    # list 'Salary'), and unpacking accepts any iterable, not just a list.
    required_columns = list(dict.fromkeys([*columns, *numeric_columns]))
    data = data.dropna(subset=required_columns)

    # Safe only after the dropna above: NaN cannot be cast to int.
    # Note that astype(int) truncates any fractional part.
    data = data.astype({name: int for name in numeric_columns})

    return data

# Columns that must be non-null: column_encoder passes them (together with
# Age, Years of Experience and Salary) as the dropna subset. Nothing is
# encoded at this stage.
encode_column = ["Gender","Education Level","Salary"]
# Rebind `data` to the cleaned frame returned by column_encoder.
data = column_encoder(data, encode_column)
# Preview the cleaned rows.
data.head()

Unnamed: 0,Age,Gender,Education Level,Years of Experience,Salary
0,32,Male,Bachelor's,5,90000
1,28,Female,Master's,3,65000
2,45,Male,PhD,15,150000
3,36,Female,Bachelor's,7,60000
4,52,Male,Master's,20,200000


In [12]:
# Save the cleaned (not yet encoded or scaled) frame for the visualisation step.
visual_output_path = "Visual_Salary_Data.csv"
data.to_csv(visual_output_path, index=False)
# Report the path that was actually written so the message cannot drift
# away from the real file name.
print(f"Preprocessed data saved as '{visual_output_path}'")

Preprocessed data saved as 'Preprocessed_Salary_Data.csv'


**Additional Data Preprocessing For Prediction Models**

In [13]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_data(data, encode_columns, standardize_columns):
    """Label-encode and standardise columns of ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is modified in place: the transformed values
        are assigned back onto its columns.
    encode_columns : iterable of str
        Columns replaced by the integer codes produced by a
        ``LabelEncoder`` fitted on that column. May be empty.
    standardize_columns : iterable of str
        Columns replaced by the output of a single ``StandardScaler`` fitted
        on them together. May be empty, in which case no scaling is done.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``data``.

    Notes
    -----
    The fitted encoders and scaler are not returned, so the transformation
    cannot be inverted or re-applied to new rows by the caller.
    """
    # Materialise the labels: a tuple passed to ``data[...]`` would be looked
    # up as one single key, and a generator could only be consumed once.
    encode_columns = list(encode_columns)
    standardize_columns = list(standardize_columns)

    for column in encode_columns:
        # Each column gets its own freshly fitted encoder.
        data[column] = LabelEncoder().fit_transform(data[column])

    # StandardScaler rejects input with zero features, so skip the step
    # entirely when there is nothing to scale.
    if standardize_columns:
        data[standardize_columns] = StandardScaler().fit_transform(
            data[standardize_columns]
        )

    return data

# Categorical columns to turn into integer codes.
encode_columns = ["Gender", "Education Level"]
# Numeric columns to standardise; note that this includes Salary.
standardize_columns = ["Age", "Years of Experience", "Salary"]

# preprocess_data assigns the transformed columns onto the frame it receives
# and returns that same frame, so `processed_data` and `data` refer to the
# same object after this call.
processed_data = preprocess_data(data, encode_columns, standardize_columns)

# Because of the in-place assignment above, this preview of `data` shows the
# encoded and standardised values.
data.head()


Unnamed: 0,Age,Gender,Education Level,Years of Experience,Salary
0,-0.213129,1,0,-0.510515,-0.479849
1,-0.738393,0,2,-0.840497,-0.953461
2,1.49398,1,3,1.139392,0.656819
3,0.312135,0,0,-0.180534,-1.048183
4,2.413192,1,2,1.964345,1.604042


In [14]:
# Write the frame returned by preprocess_data (encoded and standardised)
# explicitly, instead of relying on `data` having been modified in place.
processed_output_path = "Preprocessed_Salary_Data.csv"
processed_data.to_csv(processed_output_path, index=False)
print(f"Preprocessed data saved as '{processed_output_path}'")

Preprocessed data saved as 'Preprocessed_Salary_Data.csv'
