# 1. Setup and Dependencies

In [2]:
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler


# 2. Data Loading and Initial Analysis

In [5]:
# Load the raw salary data and print a quick structural overview of it:
# dimensions, column names, a sample of rows, dtypes, summary statistics,
# and the per-column count of missing values.
expanded_df = pd.read_csv('salary.csv')

print("Expanded Dataset Details:")
print("-" * 50)
print(f"Number of rows: {expanded_df.shape[0]}")
print(f"Number of columns: {expanded_df.shape[1]}")
print("\nColumns in the dataset:")
print(expanded_df.columns.tolist())
print("\nFirst few rows of the dataset:")
print(expanded_df.head())
print("\nDataset information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
expanded_df.info()
print("\nNumerical columns description:")
print(expanded_df.describe())
print("\nMissing values count:")
print(expanded_df.isnull().sum())

Expanded Dataset Details:
--------------------------------------------------
Number of rows: 6684
Number of columns: 9

Columns in the dataset:
['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience', 'Salary', 'Country', 'Race', 'Senior']

First few rows of the dataset:
    Age  Gender  Education Level          Job Title  Years of Experience  \
0  32.0    Male                1  Software Engineer                  5.0   
1  28.0  Female                2       Data Analyst                  3.0   
2   NaN    Male                3            Manager                 15.0   
3  36.0  Female                1    Sales Associate                  7.0   
4  52.0    Male                2           Director                 20.0   

     Salary Country      Race  Senior  
0   90000.0      UK     White       0  
1   65000.0     USA  Hispanic       0  
2  150000.0  Canada     White       1  
3   60000.0     USA  Hispanic       0  
4  200000.0     USA     Asian       0  

Dataset inform

# 3. Data Cleaning - Handling Missing Values

In [6]:
# Fill the gaps in the Age column with the column median, then re-check
# how many missing values remain in every column.
age_column = expanded_df['Age']
median_age = age_column.median()
expanded_df['Age'] = age_column.fillna(median_age)

remaining_missing = expanded_df.isnull().sum()
print("\nMissing values after imputation:")
print(remaining_missing)


Missing values after imputation:
Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
Country                0
Race                   0
Senior                 0
dtype: int64


# 4. Feature Engineering - Encoding Categorical Variables

In [9]:
# Turn the categorical columns into numeric features.
# Work on a copy so the cleaned frame from the previous step stays intact.
df_encoded = expanded_df.copy()

# Gender is replaced by integer codes produced by LabelEncoder.
le = LabelEncoder()
df_encoded['Gender'] = le.fit_transform(df_encoded['Gender'])

# One-hot encode the nominal columns; drop='first' omits one level per column
# so the indicator columns are not perfectly collinear.
categorical_columns = ['Job Title', 'Country', 'Race']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = encoder.fit_transform(df_encoded[categorical_columns])

# Let the fitted encoder report its own output column names ("<column>_<category>").
# This stays correct whatever `drop` strategy is configured, unlike slicing
# encoder.categories_ by hand.
feature_names = encoder.get_feature_names_out(categorical_columns).tolist()

# Reuse the source index: pd.concat(axis=1) aligns on index labels, so a fresh
# RangeIndex here would misalign rows (and introduce NaNs) whenever
# df_encoded does not carry the default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=feature_names,
    index=df_encoded.index,
)

final_df = pd.concat([
    df_encoded[['Age', 'Education Level', 'Years of Experience', 'Salary', 'Senior', 'Gender']],
    encoded_df
], axis=1)

print("Shape of encoded dataset:", final_df.shape)
final_df

Shape of encoded dataset: (6684, 147)


Unnamed: 0,Age,Education Level,Years of Experience,Salary,Senior,Gender,Job Title_Account Manager,Job Title_Accountant,Job Title_Administrative Assistant,Job Title_Advertising Coordinator,...,Country_USA,Race_Asian,Race_Australian,Race_Black,Race_Chinese,Race_Hispanic,Race_Korean,Race_Mixed,Race_Welsh,Race_White
0,32.0,1,5.0,90000.0,0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,28.0,2,3.0,65000.0,0,0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,32.0,3,15.0,150000.0,1,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,36.0,1,7.0,60000.0,0,0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,52.0,2,20.0,200000.0,0,1,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,49.0,3,20.0,200000.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6680,32.0,0,3.0,50000.0,0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6681,30.0,1,4.0,55000.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6682,46.0,2,14.0,140000.0,0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# 5. Scaling - Min-Max Normalization of Salary

In [12]:
# Rescale the Salary column with min-max scaling and write the result back
# into final_df, then show the first few scaled values.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test-set salaries influence the scaling range - confirm intended.
scaler = MinMaxScaler()

salary_scaled = scaler.fit_transform(final_df[['Salary']])
final_df['Salary'] = salary_scaled

salary_preview = final_df[['Salary']].head()
print("\nFirst few rows after scaling Salary (now between 0 and 1):")
print(salary_preview)


First few rows after scaling Salary (now between 0 and 1):
     Salary
0  0.359103
1  0.258963
2  0.599439
3  0.238935
4  0.799720


# 6. Data Splitting - Training and Testing Sets

In [13]:
# Separate the target (Salary) from the predictors and hold out 20% of the
# rows for testing; the fixed random_state makes the split repeatable.
y = final_df['Salary']
X = final_df.drop(columns='Salary')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition.
for label, value in (
    ("Training set shape:", X_train.shape),
    ("Testing set shape:", X_test.shape),
    ("\nNumber of training samples:", len(y_train)),
    ("Number of testing samples:", len(y_test)),
):
    print(label, value)

Training set shape: (5347, 146)
Testing set shape: (1337, 146)

Number of training samples: 5347
Number of testing samples: 1337
