# <div style="text-align: center; background-color: #595964; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">📊EDA | Visualization |Data Science Job Salaries</div>

<h3 style="text-align: left;background-color: #00BFFF; font-family:Times New Roman; color: white; padding: 14px; line-height: 1; border-radius:10px"> About Dataset📁</h3>

<h4>The Data Science Job Salaries dataset contains the following <mark>9 columns</mark>:</h4>


* <b> <mark>1. work_year</mark></b>: The year the salary was paid.
* <b> <mark>2. experience_level</mark></b>: The experience level in the job during the year
* <b> <mark>3. employment_type</mark></b>: The type of employment for the role
* <b> <mark>4. job_title</mark></b>: The role worked in during the year.
* <b> <mark>5. salary</mark></b>: The total gross salary amount paid.
* <b> <mark>6. salary_currency</mark></b>: The currency of the salary paid as an ISO 4217 currency code.
* <b> <mark>7. salary_in_usd</mark></b>: The salary converted to USD.
* <b> <mark>8. company_location</mark></b>: The country of the employer's main office or contracting branch
* <b> <mark>9. company_size</mark></b>: The median number of people that worked for the company during the year

<a id="1"></a>
# <div style="text-align: center; background-color: #00BFFF; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">1. Import Necessary Libraries</div>

In [None]:
!pip install ydata-profiling

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import missingno as mno
import plotly.offline as pyo 
import plotly.figure_factory as ff
import plotly.io as pio
color_pal = sns.color_palette()
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# dropped the old names, so pick whichever spelling this installation provides.
_dark_palette_style = next(
    (name for name in ('seaborn-v0_8-dark-palette', 'seaborn-dark-palette')
     if name in plt.style.available),
    None,
)
if _dark_palette_style is not None:
    plt.style.use(_dark_palette_style)
plt.style.use('dark_background')
import warnings
warnings.filterwarnings('ignore')
# Applied last: the seaborn theme is what finally governs the Matplotlib defaults
sns.set_theme(style='darkgrid', palette='colorblind')
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()

#Model
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

<a id="2"></a>
# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">2. 📊EDA </div>

In [None]:
# Absolute path to the dataset CSV. Written with a single leading slash: a doubled
# leading slash ("//...") has implementation-defined meaning in POSIX path resolution.
df = pd.read_csv('/kaggle/input/data-science-salary-2021-to-2023/Data Science Salary 2021 to 2023.csv')

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Keep the column index in a variable and display it
cols = df.columns
cols

In [None]:
# Data type of every column
df.dtypes

In [None]:
# Summary statistics of the numeric columns, one row per column
df.describe().transpose()

In [None]:
# Summary of the string (object) columns: count, unique, top, freq -- one row per column
df.describe(include='object').transpose()

In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Number of distinct values in each column
unique_values = df.nunique()
unique_values

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">3. Null values</div>


In [None]:
# Missing-value count per column
df.isnull().sum()

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">4. Duplicate rows</div>
 

In [None]:
# Rows that repeat an earlier row exactly (the first occurrence is not flagged)
is_repeat = df.duplicated(keep='first')
duplicate_rows = df.loc[is_repeat]

# How many such repeats there are
num_duplicates = len(duplicate_rows)

# Report the count, then show the repeated rows themselves
print(f"Number of duplicate rows: {num_duplicates}")
duplicate_rows

In [None]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# This mutates `df` in place; the original row index labels are kept (no reset).
df.drop_duplicates(keep='first', inplace=True)

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">5. Data visualisation</div>
 


In [None]:
# ProfileReport comes from ydata-profiling (installed by the pip cell at the top of the notebook)
from ydata_profiling import ProfileReport

# Build a profiling report object for `df`.
# NOTE(review): the report appears to be computed when it is rendered, not here -- confirm.
profile = ProfileReport(df)

In [None]:
# Render the profiling report inline. (A bare `profile` expression only displays when it
# is the last line of a cell, so the previous stray `profile` line was a no-op.)
profile.to_notebook_iframe()

In [None]:
# Median salary (USD) for each work year
yearly_median_salary = df.groupby('work_year')['salary_in_usd'].median().reset_index()

# Line plot of the yearly medians
fig_yearly_trends = px.line(
    yearly_median_salary,
    x='work_year',
    y='salary_in_usd',
    labels={'work_year': 'Year', 'salary_in_usd': 'Median salary (USD)'},
    title='Yearly Salary Trends',
    height=650,
)

# Years are whole numbers: one tick per year avoids fractional ticks such as 2021.5
fig_yearly_trends.update_xaxes(dtick=1)

# Display the plot
fig_yearly_trends.show()

In [None]:
# Total salary (USD) paid per job title
monthly_trends = (
    df.groupby('job_title')['salary_in_usd']
    .sum()
    .reset_index()
)

# Line plot: job titles on x, summed salaries on y
fig_monthly_trends = px.line(
    monthly_trends,
    x='job_title',
    y='salary_in_usd',
    height=800,
    title='Salary Trends by Job',
    labels={'job_title': 'Job'},
)

fig_monthly_trends.show()

In [None]:
# Total salary (USD) paid per employment type
monthly_trends = (
    df.groupby('employment_type')['salary_in_usd']
    .sum()
    .reset_index()
)

# Line plot: employment types on x, summed salaries on y
fig_monthly_trends = px.line(
    monthly_trends,
    x='employment_type',
    y='salary_in_usd',
    height=800,
    title='Salary Trends by employment type',
    labels={'employment_type': 'employment'},
)

fig_monthly_trends.show()

In [None]:
# The ten most frequent job titles (value_counts orders by frequency, descending)
top10_job_title = df['job_title'].value_counts().head(10)

# One bar per title, coloured by title and annotated with its count
fig = px.bar(
    x=top10_job_title.index,
    y=top10_job_title.values,
    text=top10_job_title.values,
    color=top10_job_title.index,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    template='plotly_dark',
    title='Top 10 Job Titles',
)

# Final title, size, axis labels and font (this title replaces the one set above)
fig.update_layout(
    font=dict(size=17, family="Franklin Gothic"),
    xaxis_title="Job Titles",
    yaxis_title="Count",
    height=650,
    title_text='Job Distribution',
)

fig.show()

In [None]:
# Stacked bars: every row contributes its salary to the bar of its company size,
# with the segments coloured by experience level
fig = px.bar(
    df,
    x="company_size",
    y="salary_in_usd",
    color="experience_level",
)

# Title, size, axis labels and font
fig.update_layout(
    font=dict(size=17, family="Franklin Gothic"),
    xaxis_title="Company Size",
    yaxis_title="Salary",
    height=650,
    title_text='Salary Distribution with Company Size',
)

fig.show()



In [None]:
# The five highest salaries (USD); the Series index holds the original row labels
top5_salary = df['salary_in_usd'].sort_values(ascending=False).head(5)

# Use a text label per bar (job title + row label) instead of the raw numeric row index:
# a numeric x/colour spreads the bars along a continuous axis and makes Plotly ignore
# the discrete colour sequence.
top5_labels = [f"{df.loc[row, 'job_title']} (row {row})" for row in top5_salary.index]

fig = px.bar(
    x=top5_labels,
    y=top5_salary.values,
    color=top5_labels,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    text=top5_salary.values,  # print the salary on each bar
    title='Top 5 Salaries',
    template='plotly_dark'
)

# Axis titles now describe what is actually plotted (x: which row, y: its salary)
fig.update_layout(
    height=650,
    xaxis_title="Job Title (row)",
    yaxis_title="Salary (USD)",
    showlegend=False,  # the legend would only repeat the x labels
    font=dict(size=17, family="Franklin Gothic")
)

# Display the plot
fig.show()

In [None]:
# Mean salary (USD) per job title, highest first
average_salary = (
    df.groupby('job_title')['salary_in_usd']
    .mean()
    .reset_index()
    .sort_values(by='salary_in_usd', ascending=False)
)

# Ten best-paid titles on average
top_10_expensive_salary = average_salary.head(10)

# Single bar trace: titles on x, mean salaries on y
fig = go.Figure(
    data=[
        go.Bar(
            x=top_10_expensive_salary['job_title'],
            y=top_10_expensive_salary['salary_in_usd'],
        )
    ]
)

# Dark styling, axis labels and size
fig.update_layout(
    template='plotly_dark',
    paper_bgcolor='#595964',
    font=dict(color='white'),
    title='Top 10 Jobs by Salary',
    xaxis_title='Job',
    yaxis_title='Salary',
    height=650,
)

# Render through plotly.offline's notebook integration
pyo.init_notebook_mode(connected=True)
pyo.iplot(fig)

In [None]:
# Number of rows per employment type (ordered by frequency, descending)
type_grouped = df['employment_type'].value_counts()

# Readable names for the employment-type codes.
# NOTE(review): assumes the dataset uses the codes FT/PT/CT/FL -- confirm. Any other
# value is shown unchanged.
EMPLOYMENT_TYPE_LABELS = {
    'FT': 'Full-Time',
    'PT': 'Part-Time',
    'CT': 'Contract',
    'FL': 'Freelance',
}

# Derive each label from the matching count's own code. A fixed label list paired with
# frequency-sorted counts mislabels bars whenever the frequency order differs from the
# list order, and fails outright when the number of categories is not four.
e_type = [EMPLOYMENT_TYPE_LABELS.get(code, str(code)) for code in type_grouped.index]

# One bar per employment type, coloured by type and annotated with its count
fig = px.bar(
    x=e_type,
    y=type_grouped.values,
    color=e_type,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    template='plotly_dark',
    text=type_grouped.values
)

# Title, size, axis labels and font
fig.update_layout(
    title_text='Employment Type Distribution',
    height=650,
    xaxis_title="Employment Type",
    yaxis_title="Count",
    font=dict(size=17, family="Franklin Gothic"),
)

# Adjust the width of the bars
fig.update_traces(width=0.5)

# Display the plot
fig.show()

In [None]:
# Expand the experience-level codes into readable labels (one pass instead of four).
# Later cells filter on these expanded labels, so the column is updated in `df` itself.
EXPERIENCE_LEVEL_LABELS = {
    'EN': 'Entry-level/Junior',
    'MI': 'Mid-level/Intermediate',
    'SE': 'Senior-level/Expert',
    'EX': 'Executive-level/Director',
}
df['experience_level'] = df['experience_level'].replace(EXPERIENCE_LEVEL_LABELS)

# Number of rows per experience level
ex_level = df['experience_level'].value_counts()

# Explicit two-column frame so the plot does not depend on how pandas names the
# value_counts result (that naming differs between pandas versions)
ex_level_df = ex_level.rename_axis('experience_level').reset_index(name='count')

# Horizontal bars: counts on x, experience levels on y. (Previously the counts were
# passed for both axes, so the level names never appeared on an axis.)
fig = px.bar(
    ex_level_df,
    x='count',
    y='experience_level',
    color='experience_level',
    orientation='h',
)

# Update the layout of the plot
fig.update_layout(
    title_text='Experience Level Distribution',
    height=500,
    xaxis_title="Count",
    yaxis_title="Experience Level",
    font=dict(size=17, family="Franklin Gothic")
)

# Display the plot
fig.show()

In [None]:
# Notched box plots of salary per work year, one box per experience level
fig = px.box(
    df,
    x="work_year",
    y="salary_in_usd",
    color="experience_level",
    notched=True,
)

# Title, size, axis labels and font
fig.update_layout(
    font=dict(size=17, family="Franklin Gothic"),
    xaxis_title="Work Year",
    yaxis_title="Salary",
    height=500,
    title_text='Salary Box Distribution',
)

fig.show()

In [None]:
# Tukey fences on salary_in_usd: anything further than 1.5 * IQR beyond the
# quartiles is treated as an outlier
salary_usd = df['salary_in_usd']
Q1, Q3 = salary_usd.quantile(0.25), salary_usd.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

is_outlier = (salary_usd < lower_bound) | (salary_usd > upper_bound)
outliers = df[is_outlier]

print("Number of outliers:", len(outliers))

# Keep only the rows inside the fences (both bounds inclusive); rebinds `df`
df = df[(salary_usd >= lower_bound) & (salary_usd <= upper_bound)]

In [None]:
# Histogram over company size with salary_in_usd as the aggregated y value,
# split by experience level; every column is passed as hover data
fig = px.histogram(
    df,
    x="company_size",
    y='salary_in_usd',
    color="experience_level",
    hover_data=df.columns,
)

# Title, size, x-axis label and font
fig.update_layout(
    font=dict(size=17, family="Franklin Gothic"),
    xaxis_title="Company Size",
    height=500,
    title_text='Salary Histogram Distribution',
)

fig.show()

In [None]:
# Single-level sunburst: share of rows per experience level
fig2 = px.sunburst(
    df,
    path=['experience_level'],
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig2.update_layout(height=500, title_text='Distribution of experience_level')
fig2.show()

In [None]:
# Single-level sunburst: share of rows per company size
fig2 = px.sunburst(
    df,
    path=['company_size'],
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig2.update_layout(height=500, title_text='Distribution of company size')
fig2.show()

In [None]:
# Single-level sunburst: share of rows per work year
fig2 = px.sunburst(
    df,
    path=['work_year'],
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig2.update_layout(height=500, title_text='Distribution of work year')
fig2.show()

In [None]:
# The ten most frequent company locations (value_counts orders by frequency, descending)
top10_company_location = df['company_location'].value_counts().head(10)

# One bar per location, coloured by location and annotated with its count
fig = px.bar(
    x=top10_company_location.index,
    y=top10_company_location.values,
    text=top10_company_location.values,
    color=top10_company_location.index,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    template='plotly_dark',
    title='Top 10 company_location',
)

# Axis labels and font
fig.update_layout(
    font=dict(size=17, family="Franklin Gothic"),
    xaxis_title="company_location",
    yaxis_title="Count",
)

fig.show()

In [None]:
# Number of rows per company size
size = df['company_size'].value_counts()

# One bar per size, coloured by size and annotated with its count
fig = px.bar(
    x=size.index,
    y=size.values,
    text=size.values,
    color=size.index,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    template='plotly_dark',
    title='Size of Company',
)

# Axis labels and font
fig.update_layout(
    font=dict(size=17, family="Franklin Gothic"),
    xaxis_title="Company Size",
    yaxis_title="Count",
)

fig.show()

In [None]:
# Numeric-only view of the data (64-bit integer and float columns).
# The former first line built a list of distinct dtypes but discarded it: only the last
# expression of a cell is displayed, so it produced no output and has been removed.
df_num = df.select_dtypes(include=['float64', 'int64'])
df_num.head()

In [None]:
# One histogram per column of `df_num` (50 bins each, small tick labels).
# The trailing semicolon suppresses the textual repr of the returned axes array.
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">6. salary visualisation</div>

In [None]:
# One point per row: job title against salary, coloured by experience level
fig = px.scatter(
    df,
    x="job_title",
    y="salary_in_usd",
    color="experience_level",
)

# Title, size, axis labels and font
fig.update_layout(
    font=dict(size=17, family="Franklin Gothic"),
    xaxis_title="Job Title",
    yaxis_title="Salary",
    height=800,
    title_text='Salary Job Distribution',
)

fig.show()

In [None]:
# Salary histogram split by experience level; every column is passed as hover data
fig = px.histogram(
    df,
    x='salary_in_usd',
    color="experience_level",
    hover_data=df.columns,
)

# Title, size, x-axis label and font
fig.update_layout(
    font=dict(size=17, family="Franklin Gothic"),
    xaxis_title="Salary",
    height=500,
    title_text='Salary Histogram Distribution',
)

fig.show()

In [None]:
# One point per row: company location against salary, coloured by experience level
fig = px.scatter(
    df,
    x="company_location",
    y="salary_in_usd",
    color="experience_level",
)

# Title, size, axis labels and font
fig.update_layout(
    font=dict(size=17, family="Franklin Gothic"),
    xaxis_title="company_location",
    yaxis_title="Salary",
    height=650,
    title_text='company_location Salary Distribution',
)

fig.show()

In [None]:
# Mean salary (USD) per work year, computed with one groupby over the years that are
# actually present. This replaces four hard-coded per-year subsets whose means were taken
# with `.mean('salary_in_usd')` -- that call passed the column name as the `numeric_only`
# flag -- and a frame that was filled one column at a time.
mean_by_year = df.groupby('work_year')['salary_in_usd'].mean()
year_labels = mean_by_year.index.astype(str).tolist()
mean_values = mean_by_year.to_numpy()
colors = ['white', 'red', 'blue', 'yellow']

# Single-row frame with one column per year (same layout as before)
year_salary = pd.DataFrame([mean_values], columns=year_labels)

# One bar per year, annotated with the mean in thousands
fig1 = px.bar(
    x=year_labels,
    y=mean_values,
    color=year_labels,
    color_discrete_sequence=colors,
    title='Mean Salary by Work Year',
    text=np.round(mean_values / 1000, 2),
    template='plotly_dark',
    height=500,
)

# Customize the appearance of the Plotly figure
fig1.update_traces(width=0.3)
fig1.update_layout(
    xaxis_title="Work Year",
    yaxis_title="Mean Salary (k)",
    font=dict(size=17, family="Franklin Gothic"))

# Display the Plotly figure
fig1.show()

In [None]:
# Salary rows split by experience level
exlevel_salary = df[['experience_level', 'salary_in_usd']]
level_of_row = exlevel_salary['experience_level']
entry_salary = exlevel_salary[level_of_row == 'Entry-level/Junior']
executive_salary = exlevel_salary[level_of_row == 'Executive-level/Director']
mid_salary = exlevel_salary[level_of_row == 'Mid-level/Intermediate']
senior_salary = exlevel_salary[level_of_row == 'Senior-level/Expert']

# Salary series, labels and colours, ordered from entry level to executive
group_labels = ['Entry-level/Junior', 'Mid-level/Intermediate', 'Senior-level/Expert', 'Executive-level/Director']
hist_data = [
    subset['salary_in_usd']
    for subset in (entry_salary, mid_salary, senior_salary, executive_salary)
]
colors = ['white', 'yellow', 'blue', 'red']

# Mean salary of each group, in the same order as `group_labels`
lst = [salaries.mean() for salaries in hist_data]

# One bar per experience level, annotated with the mean in thousands
fig1 = go.Figure(
    data=px.bar(
        x=group_labels,
        y=lst,
        color=group_labels,
        text=np.round([mean_salary / 1000 for mean_salary in lst], 2),
        color_discrete_sequence=colors,
        template='plotly_dark',
        title='6.2.(2) Mean Salary by Experience Level',
        height=500,
    )
)

# Bar width, axis labels and font
fig1.update_traces(width=0.4)
fig1.update_layout(
    font=dict(size=17, family="Franklin Gothic"),
    xaxis_title="Experience Level",
    yaxis_title="Mean Salary (k)",
)

fig1.show()

In [None]:
# Extract subsets of the data based on company size
c_size = df[['company_size','salary_in_usd']]
small = c_size.loc[c_size['company_size'] == 'S']
mid = c_size.loc[c_size['company_size'] == 'M']
large = c_size.loc[c_size['company_size'] == 'L']

# Salary series, labels and colours for the three groups
hist_data = [small['salary_in_usd'], mid['salary_in_usd'], large['salary_in_usd']]
group_labels = ['Company Size: Small', 'Company Size: Mid', 'Company Size: Large']
colors = ['white', 'red', 'blue']

# Calculate mean salaries for each company size group
lst = [small['salary_in_usd'].mean(),
       mid['salary_in_usd'].mean(),
       large['salary_in_usd'].mean()]

# One bar per company size, annotated with the mean in thousands.
# (The chart is drawn with Plotly only; the former plt.figure()/plt.show() pair just
# rendered an extra, empty Matplotlib figure and has been removed.)
fig1 = go.Figure(data=px.bar(x=group_labels,
                             y=lst,
                             color=group_labels,
                             color_discrete_sequence=colors,
                             title='6.3.(2) Mean Salary by Company Size',
                             text=np.round([num / 1000 for num in lst], 2),
                             template='plotly_dark',
                             height=500))

# Customize the appearance of the Plotly figure
fig1.update_traces(width=0.3)
fig1.update_layout(
    xaxis_title="Company Size",
    yaxis_title="Mean Salary (k)",
    font=dict(size=17, family="Franklin Gothic"))

# Display the Plotly figure
fig1.show()

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">7. Categorical</div>

In [None]:
# Columns holding strings (dtype object): these are the categorical features
categorical_cols = df.select_dtypes(include='object').columns.tolist()
num_cols = categorical_cols  # previous (misleading) name, kept so existing references still work

# One fitted encoder per column, kept so the integer codes can be mapped back to the
# original labels later via label_encoders[col].inverse_transform(...) / .classes_.
# Re-fitting a single shared encoder would retain only the last column's mapping.
label_encoders = {}
le = LabelEncoder()

for col in categorical_cols:
    le = LabelEncoder()
    # LabelEncoder numbers the labels in sorted order, so the codes are arbitrary
    # identifiers and carry no ordinal meaning (e.g. for experience_level).
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Every former object column now holds integer codes
df.head()

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">8. Corr Matrix
</div>

In [None]:
# Pairwise correlation between all columns of `df`
Corr_Matrix = df.corr()

# Annotated heatmap on an explicit square axes, colour scale centred at zero
fig_corr, ax_corr = plt.subplots(figsize=(10, 10))
sns.heatmap(Corr_Matrix, annot=True, cmap='coolwarm', center=0, ax=ax_corr)
plt.show()

In [None]:
print('Top 5 features most positively correlated with salary_in_usd')
# Drop the target's correlation with itself (always 1.0) so only real features are ranked
Corr_Matrix['salary_in_usd'].drop('salary_in_usd').sort_values(ascending=False).head(5)

In [None]:
print('Top 5 Most Negatively Correlated to salary_in_usd ')
# Ascending sort puts the most negative correlations first
salary_corr = Corr_Matrix.loc[:, 'salary_in_usd']
salary_corr.sort_values().head(5)

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">9. Splitting the dataset

</div>

In [None]:
# Target: salary in USD. Features: every other column except the two salary amounts.
y = df['salary_in_usd']
X = df.drop(columns=['salary_in_usd', 'salary'])

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each part
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_part.shape)

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">10. Model Building and Analysis

</div>

In [None]:
# Candidate regressors, each with a fixed seed
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}
best_model = None       # class name of the best model so far
best_estimator = None   # the fitted best model itself
# R2 can be negative (worse than predicting the mean), so start below any possible score;
# starting at 0 would leave "no best model" whenever every model scores <= 0.
best_r2 = -np.inf

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate on the held-out test set
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Actual vs. predicted salaries, with the original row label as a column
    submit = pd.DataFrame({'Actual_salary': y_test, 'Predict_salary': y_pred}).reset_index()

    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__
        best_estimator = model

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print(submit.head(5))

    print('----------------------------------------')
# The selection metric is R2, not accuracy (these are regression models)
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">11. feature importances

</div>


In [None]:
# Use the model that was reported as best, looked up by its class name, rather than
# whatever the loop variable `model` was last bound to (the last model trained).
# Falls back to `model` if no best model was recorded.
importance_model = next(
    (fitted for fitted in models.values() if fitted.__class__.__name__ == best_model),
    model,
)
importances = importance_model.feature_importances_

feature_names = X.columns

feature_importance_dict = dict(zip(feature_names, importances))

# (feature, importance) pairs, most important first
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.2f}")

plt.figure(figsize=(12, 7))
plt.barh(*zip(*sorted_feature_importance), alpha=0.9, color='teal')
# barh draws the first item at the bottom; flip the axis so the top feature is on top
plt.gca().invert_yaxis()
plt.title(f'Feature Importance ({importance_model.__class__.__name__})', fontsize=15)
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

In [None]:
# Predict with the model that was reported as best, looked up by its class name, rather
# than whatever the loop variable `model` was last bound to. Falls back to `model` if no
# best model was recorded.
residual_model = next(
    (fitted for fitted in models.values() if fitted.__class__.__name__ == best_model),
    model,
)
y_pred = residual_model.predict(X_test)

# Residuals: actual minus predicted
residuals = y_test - y_pred

# Residuals against predictions; the dashed line marks zero error
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title(f'Residual Plot ({residual_model.__class__.__name__})')
plt.show()

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">12. Neural network classification model


</div>


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Seed the Python, NumPy and TensorFlow random generators so weight initialisation and
# batch shuffling repeat from run to run
tf.keras.utils.set_random_seed(42)

# Predict the experience level from every other column
features = df.drop('experience_level', axis=1)
target = df['experience_level']

# Number of output classes, taken from the data instead of being hard-coded.
# Assumes `target` holds integer codes 0..n_classes-1, as produced by the
# label-encoding cell earlier in the notebook.
n_classes = target.nunique()

# Split the data into training and testing sets.
# NOTE: this rebinds X_train/X_test/y_train/y_test (and `model` below) from the
# regression section above.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardize the features (scaler is fitted on the training part only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert target labels to one-hot encoded vectors
y_train_one_hot = tf.one_hot(y_train, depth=n_classes)
y_test_one_hot = tf.one_hot(y_test, depth=n_classes)

# Small fully connected classifier with one softmax output per class
model = Sequential([
    Dense(64, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dense(32, activation='relu'),
    Dense(n_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train_one_hot, epochs=10, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_scaled, y_test_one_hot)
print(f"Test accuracy: {accuracy:.4f}")