IMPORTING NECESSARY LIBRARIES

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

LOADING THE DATASET

In [78]:
salary = pd.read_csv('salary.csv')
salary

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...,...
602,602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
603,603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
604,604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
605,605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M


CHECKING FOR INFORMATION ABOUT THE DATASET

In [79]:
salary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          607 non-null    int64 
 1   work_year           607 non-null    int64 
 2   experience_level    607 non-null    object
 3   employment_type     607 non-null    object
 4   job_title           607 non-null    object
 5   salary              607 non-null    int64 
 6   salary_currency     607 non-null    object
 7   salary_in_usd       607 non-null    int64 
 8   employee_residence  607 non-null    object
 9   remote_ratio        607 non-null    int64 
 10  company_location    607 non-null    object
 11  company_size        607 non-null    object
dtypes: int64(5), object(7)
memory usage: 57.0+ KB


CHECKING FOR DUPLICATES

In [80]:
salary.drop_duplicates()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...,...
602,602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
603,603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
604,604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
605,605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M


CHECKING FOR MISSING VALUES

In [81]:
salary.isnull().sum()

Unnamed: 0            0
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [82]:
salary.head()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


DETECTING AND REMOVING OUTLIERS USUNG IQR METHOD

In [83]:
# Function to detect outliers using IQR method
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] < lower_bound) | (data[column] > upper_bound)]

CHECKING FOR OUTLIERS IN THE SALARY COLUMN

In [84]:
# Check for outliers in salary column
outliers = detect_outliers_iqr(salary, "salary_in_usd")
print("Outliers detected:\n", outliers)




Outliers detected:
      Unnamed: 0  work_year experience_level employment_type  \
25           25       2020               EX              FT   
33           33       2020               MI              FT   
63           63       2020               SE              FT   
97           97       2021               MI              FT   
157         157       2021               MI              FT   
225         225       2021               EX              CT   
252         252       2021               EX              FT   
482         482       2022               EX              FT   
519         519       2022               SE              FT   
523         523       2022               SE              FT   

                              job_title  salary salary_currency  \
25             Director of Data Science  325000             USD   
33                   Research Scientist  450000             USD   
63                       Data Scientist  412000             USD   
97               F

REMOVING THE OUTLIERS

In [85]:
salary = salary[~salary.index.isin(outliers.index)]


In [86]:
salary.info()

<class 'pandas.core.frame.DataFrame'>
Index: 597 entries, 0 to 606
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          597 non-null    int64 
 1   work_year           597 non-null    int64 
 2   experience_level    597 non-null    object
 3   employment_type     597 non-null    object
 4   job_title           597 non-null    object
 5   salary              597 non-null    int64 
 6   salary_currency     597 non-null    object
 7   salary_in_usd       597 non-null    int64 
 8   employee_residence  597 non-null    object
 9   remote_ratio        597 non-null    int64 
 10  company_location    597 non-null    object
 11  company_size        597 non-null    object
dtypes: int64(5), object(7)
memory usage: 60.6+ KB


INDENTIFICATION OF CATEFORICAL COLUMN

In [87]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['experience_level', 'employment_type', 'job_title', 'company_location', 'company_size']

ENCODING CATEGORICAL COLUMN

In [88]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
for col in categorical_cols:
    salary[col] = encoder.fit_transform(salary[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salary[col] = encoder.fit_transform(salary[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salary[col] = encoder.fit_transform(salary[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salary[col] = encoder.fit_transform(salary[col])
A value is trying to be set on a copy of a slice from a Da

In [89]:
salary.head()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,2,2,21,70000,EUR,79833,DE,0,12,0
1,1,2020,3,2,40,260000,USD,260000,JP,0,29,2
2,2,2020,3,2,7,85000,GBP,109024,GB,50,18,1
3,3,2020,2,2,46,20000,USD,20000,HN,0,20,2
4,4,2020,3,2,37,150000,USD,150000,US,50,48,0


In [90]:
from sklearn.model_selection import train_test_split

# Define target variable (y) and features (X)
X = salary.drop(columns=['salary_in_usd'])  # Features
y = salary['salary_in_usd']  # Target variable

# Split into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)


print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")


Training set size: 417
Testing set size: 180


In [91]:
X_train

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,employee_residence,remote_ratio,company_location,company_size
278,278,2021,3,2,21,180000,TRY,TR,50,46,0
606,606,2022,2,2,1,200000,USD,IN,100,48,0
547,547,2022,3,2,16,130000,USD,US,100,48,1
440,440,2022,2,2,12,40000,EUR,GR,100,19,1
388,388,2022,3,2,16,155000,USD,US,100,48,1
...,...,...,...,...,...,...,...,...,...,...,...
74,74,2021,1,2,28,235000,USD,US,100,48,0
110,110,2021,3,2,37,80000,EUR,DE,50,12,0
277,277,2021,3,2,1,55000,USD,ES,100,16,0
442,442,2022,2,2,16,75000,GBP,GB,100,18,1


In [92]:
X_test

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,employee_residence,remote_ratio,company_location,company_size
113,113,2021,0,3,1,12000,USD,PK,100,48,1
488,488,2022,2,1,21,100000,USD,CA,100,48,1
139,139,2021,0,2,21,80000,USD,US,100,48,1
80,80,2021,3,2,13,67000,EUR,DE,100,12,0
403,403,2022,3,2,12,81666,USD,US,0,48,1
...,...,...,...,...,...,...,...,...,...,...,...
234,234,2021,2,2,15,180000,USD,US,100,48,0
48,48,2020,2,2,21,105000,USD,US,100,48,0
96,96,2021,0,3,1,12000,USD,BR,100,48,2
294,294,2022,2,2,16,170000,USD,US,100,48,1


In [93]:
y_train

278     20171
606    200000
547    130000
440     43966
388    155000
        ...  
74     235000
110     94564
277     55000
442     98158
106    187442
Name: salary_in_usd, Length: 417, dtype: int64

In [94]:
y_test

113     12000
488    100000
139     80000
80      79197
403     81666
        ...  
234    180000
48     105000
96      12000
294    170000
112    103160
Name: salary_in_usd, Length: 180, dtype: int64