In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Location of the processed job-offers CSV (kept exactly as configured).
csv_path = '...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\data\\processed\\job_offers_2024_02_11.csv'

# Load the offers into `df`, the DataFrame the following cells read from.
# NOTE(review): sep and decimal are both ',' — confirm that decimal values
# are quoted in the file, otherwise they would be split into separate fields.
df = pd.read_csv(csv_path, sep=',', decimal=',', encoding='utf-8')

# 1. task: How many jobs for a particular position were found?

In [None]:
# Count how many offers were found for each job title.
# Result: one row per job with the number of rows in a 'count' column.
job_counts = df.groupby('job').size().reset_index(name='count')

# Bar chart with one bar per job title
plt.figure(figsize=(5, 5))
bars = plt.bar(job_counts['job'], job_counts['count'], color='skyblue')
plt.xlabel('Job')
plt.ylabel('Number of Occurrences')
plt.title('Number of Occurrences for Each Job')

# Write the count just above each bar, horizontally centred on the bar
for bar, value in zip(bars, job_counts['count']):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1, f'{value}', ha='center', va='bottom')

plt.xticks(rotation=45, ha='right')
# tight_layout() has to run before show(): once the figure has been shown,
# adjusting the layout no longer changes what was rendered.
plt.tight_layout()
plt.show()

# 2. task: What is the average salary for each job?

In [None]:
import seaborn as sns

# Convert 'salary_avg' column to numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Sort the DataFrame by 'salary_avg' in ascending order
df_sorted = df.sort_values(by='salary_avg', ascending=True)

# barplot draws one bar per job at the mean of 'salary_avg', so sorting the rows
# alone does not order the bars by the plotted value. Compute the per-job means
# and pass them as an explicit category order (lowest average first).
job_order = (
    df_sorted.groupby('job')['salary_avg']
    .mean()
    .sort_values(ascending=True)
    .index
    .tolist()
)

# Horizontal bar chart of the mean salary per job, without error bars.
# errorbar=None is used instead of the deprecated ci=None, matching the
# other seaborn barplot call in this notebook.
plt.figure(figsize=(10, 5))
bars = sns.barplot(x='salary_avg', y='job', data=df_sorted, order=job_order,
                   color='skyblue', errorbar=None)

# Add the value inside each bar: the text is right-aligned so that it ends
# just before the bar's end instead of starting there and running past it.
for bar in bars.patches:
    plt.text(bar.get_width() - 0.1, bar.get_y() + bar.get_height() / 2,
             f'{bar.get_width():,.2f}',
             va='center', ha='right', color='black')

plt.xlabel('Average Salary')
plt.ylabel('Job Title')
plt.title('Average Salary for Each Job Title')
plt.show()

# 3. task: Which city has the most job offers?

In [None]:
import seaborn as sns

# Offers per city: number of non-null 'job' entries within each 'location_city'
job_offers_per_city = df.groupby('location_city')['job'].count()

# Cities with the most offers come first
job_offers_per_city_sorted = job_offers_per_city.sort_values(ascending=False)

# Horizontal bar chart: cities on the y-axis, offer counts on the x-axis
plt.figure(figsize=(12, 8))
offer_counts = job_offers_per_city_sorted.values
city_names = job_offers_per_city_sorted.index
bars = sns.barplot(x=offer_counts, y=city_names, color='skyblue')

# Print each count just past the end of its bar, vertically centred on the bar
for patch, n_offers in zip(bars.patches, offer_counts):
    label_x = patch.get_width() + 0.1
    label_y = patch.get_y() + patch.get_height() / 2
    bars.text(label_x, label_y, f'{n_offers}', ha='left', va='center', color='black', fontsize=8)

bars.set_xlabel('Number of Job Offers')
bars.set_ylabel('City')
bars.set_title('Number of Job Offers in Each City')
plt.show()

Which city has the most jobs for each particular job category?


In [None]:
# Count the offers for each (location_city, job) combination.
# The result is a Series indexed by the pair (location_city, job).
jobs_per_city_per_job = df.groupby(['location_city', 'job'])['job'].count()

# For each job title: the (location_city, job) index label with the highest
# count, and that highest count itself.
most_jobs_per_job_in_city = jobs_per_city_per_job.groupby(level='job').idxmax()
most_jobs_count_per_job_in_city = jobs_per_city_per_job.groupby(level='job').max()

plt.figure(figsize=(10, 5))

# One bar per job title, labelled with the job and its top city.
# idxmax yields (location_city, job) tuples; the second element repeats the
# job title already available as the Series key, so it is ignored here.
for job, (city, _same_job) in most_jobs_per_job_in_city.items():
    bar_name = f'{job} - {city}'
    top_count = most_jobs_count_per_job_in_city[job]
    plt.bar(bar_name, top_count, color='orange')
    # Count label slightly above the bar top
    plt.text(bar_name, top_count + 0.3,
             f'{top_count}',
             va='center', ha='center', color='black')

plt.xlabel('Job - City')
plt.ylabel('Number of Jobs')
plt.title('City with the Most Jobs for Each Job Title')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better visibility
plt.show()

# 4. task: Where is the highest salary rate offered?

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import cm

# Convert 'salary_avg' column to numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Group by 'location_city' and calculate the average salary for each city
average_salary_per_city = df.groupby('location_city')['salary_avg'].mean()

# City with the highest average salary and the corresponding value
city_with_highest_salary = average_salary_per_city.idxmax()
highest_salary = average_salary_per_city.max()

# Number of unique cities (one bar, and one colour, per city)
num_cities = len(average_salary_per_city)

# Sequential colormap resampled to one colour per city.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('viridis', num_cities)

plt.figure(figsize=(12, 6))

# Bar chart of cities ordered from highest to lowest average salary;
# colours are assigned by position in that ordering.
ax = average_salary_per_city.sort_values(ascending=False).plot(
    kind='bar', color=[cmap(i) for i in range(num_cities)], edgecolor='black')
plt.xlabel('City')
plt.ylabel('Average Salary')
plt.title('Average Salary Per City')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better visibility

# Add the numeric value above each bar (shifted up by 10 points from the bar top)
for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.show()


In [None]:
import seaborn as sns

# Ensure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Mean salary for every (job, location_city) pair
average_salary_per_city_per_job = df.groupby(['job', 'location_city'])['salary_avg'].mean()

plt.figure(figsize=(12, 6))

# Grouped bars: one group per job, one bar per city; error bars are disabled
salary_ax = sns.barplot(data=df, x='job', y='salary_avg', hue='location_city',
                        errorbar=None, capsize=0.1)
salary_ax.set_xlabel('Job')
salary_ax.set_ylabel('Average Salary')
salary_ax.set_title('Highest Average Salary Per Job Title and Corresponding City')
salary_ax.legend(title='City', bbox_to_anchor=(1, 1))
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better visibility

# Vertical value label at the top of every bar that has a defined height
for patch in salary_ax.patches:
    bar_height = patch.get_height()
    if pd.isna(bar_height):
        continue  # nothing to label for a missing value
    salary_ax.text(patch.get_x() + patch.get_width() / 2., bar_height,
                   f'{bar_height:,.2f}',
                   ha='center', va='center', color='black', rotation=90)

plt.show()


# 5. task: What is the difference between the offer for a regular and a senior?

In [None]:
# Ensure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Split the offers by the 'is_senior' flag (0 = regular, 1 = senior)
seniority_flag = df['is_senior']
regular_data = df[seniority_flag == 0]
senior_data = df[seniority_flag == 1]

# Overall mean salary of each group
overall_avg_salary_regular = regular_data['salary_avg'].mean()
overall_avg_salary_senior = senior_data['salary_avg'].mean()

# Report how much more (or less) the senior average is than the regular one
difference = overall_avg_salary_senior - overall_avg_salary_regular
print(f"Difference in average salary (Senior - Regular): {difference:.2f} PLN.")

# Two-bar comparison chart
plt.figure(figsize=(8, 6))
position_labels = ['Regular', 'Senior']
position_means = [overall_avg_salary_regular, overall_avg_salary_senior]
bars = plt.bar(position_labels, position_means, color=['skyblue', 'orange'])

# Rounded value just above each bar
for rect in bars:
    yval = rect.get_height()
    label_x = rect.get_x() + rect.get_width()/2
    plt.text(label_x, yval + 0.01, round(yval, 2), ha='center', va='bottom')

plt.xlabel('Position')
plt.ylabel('Average Salary')
plt.title('Average Salary Comparison between Regular and Senior Positions')
plt.show()


In [None]:
# Ensure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Mean salary for every (job, is_senior) pair
average_salary_per_seniority = df.groupby(['job', 'is_senior'])['salary_avg'].mean()

# Text summary per job title; a missing (job, seniority) pair is reported as 0
for job in df['job'].unique():
    avg_salary_regular = average_salary_per_seniority.get((job, 0), 0)
    avg_salary_senior = average_salary_per_seniority.get((job, 1), 0)

    print(f"For job title '{job}':")
    print(f"  Average salary for Regular: {avg_salary_regular:.2f} PLN")
    print(f"  Average salary for Senior: {avg_salary_senior:.2f} PLN")
    print()

import numpy as np

# Job titles in order of first appearance, and one x position per title
job_titles = df['job'].unique()
indices = np.arange(len(job_titles))

# Grouped bar chart: regular and senior bars side by side for each job title
plt.figure(figsize=(8, 5))
bar_width = 0.35

# Bar heights looked up once per group (0 where the pair is missing)
regular_heights = [average_salary_per_seniority.get((title, 0), 0) for title in job_titles]
senior_heights = [average_salary_per_seniority.get((title, 1), 0) for title in job_titles]

bars_regular = plt.bar(indices, regular_heights, bar_width, label='Regular', color='skyblue')
bars_senior = plt.bar(indices + bar_width, senior_heights, bar_width, label='Senior', color='orange')

# Value label just above every bar, regular group first and then senior
for bar_group, heights in ((bars_regular, regular_heights), (bars_senior, senior_heights)):
    for bar, height in zip(bar_group, heights):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
                 f'{height:.2f}', ha='center', va='bottom')

plt.xlabel('Job Title')
plt.ylabel('Average Salary')
plt.title('Average Salary Comparison between Regular and Senior Positions for Each Job Title')
# Centre each tick between the two bars of its group
plt.xticks(indices + bar_width / 2, job_titles, rotation=45, ha='right')
plt.legend()
plt.show()

# 6. task: The lowest possible cost to hire a team with one of each job: data analyst, data engineer, data scientist

In [None]:
# Ensure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Offers for each of the three roles
job_column = df['job']
data_analyst_data = df[job_column == 'data analyst']
data_engineer_data = df[job_column == 'data engineer']
data_scientist_data = df[job_column == 'data scientist']

# Cheapest offer (minimum 'salary_avg') within each role
lowest_cost_data_analyst = data_analyst_data['salary_avg'].min()
lowest_cost_data_engineer = data_engineer_data['salary_avg'].min()
lowest_cost_data_scientist = data_scientist_data['salary_avg'].min()

# Chart inputs: display names and the matching minimum salaries, in the same order
roles = ['Data Analyst', 'Data Engineer', 'Data Scientist']
lowest_costs = [lowest_cost_data_analyst, lowest_cost_data_engineer, lowest_cost_data_scientist]

# The team cost is the sum of the three per-role minimums
total_lowest_cost = sum(lowest_costs)

# Display the result
print(f"The lowest possible cost to hire a team is: {total_lowest_cost:.2f} PLN.")

# One bar per role
plt.figure(figsize=(8, 6))
bars = plt.bar(roles, lowest_costs, color=['skyblue', 'orange', 'green'])

# Value label just above each bar
for rect, cost in zip(bars, lowest_costs):
    label_x = rect.get_x() + rect.get_width() / 2
    plt.text(label_x, rect.get_height() + 0.3, f'{cost:.2f}', ha='center', va='bottom')

plt.xlabel('Role')
plt.ylabel('Lowest Salary')
plt.title('Lowest Possible Cost to Hire a Team for Each Role')
plt.show()

In [None]:
import shutil
import os

def move_notebook_to_notebooks(notebook_path, notebooks_folder):
    """Move a notebook file into the given folder, keeping its file name.

    Args:
        notebook_path: Path of the notebook file to move.
        notebooks_folder: Destination folder. It is created (including any
            missing parent folders) if it does not exist yet.

    Raises:
        FileNotFoundError: If ``notebook_path`` does not exist.
    """
    notebook_filename = os.path.basename(notebook_path)

    # Without the destination folder the move would fail, so make sure it exists.
    os.makedirs(notebooks_folder, exist_ok=True)

    # Move the notebook file to the "notebooks" subfolder
    shutil.move(notebook_path, os.path.join(notebooks_folder, notebook_filename))
    print(f"Notebook '{notebook_filename}' moved to the 'notebooks' subfolder.")

# Example usage with specific paths
notebook_path = "...\\SESSION 6 WORKSHOP\\TESTING\\5.0_visualization.ipynb"
notebooks_folder = "...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\notebooks"

# Only move when the source file is still there: once the notebook has been
# moved, re-running this cell would otherwise fail because the source is gone.
if os.path.exists(notebook_path):
    move_notebook_to_notebooks(notebook_path, notebooks_folder)
else:
    print(f"Nothing to move: '{notebook_path}' does not exist.")