# Data analysis
Using the data prepared for analysis from the data/processed/ directory, answer the following questions before you give the HR team your recommendation:

1. How many jobs for a particular position were found?
2. What is the average salary for each job?
3. Which city has the most job offers?
4. Where is the highest rate offered?
5. What is the difference between the offer for a regular and a senior?
6. What is the lowest possible cost to hire a team consisting of a data analyst, data engineer and data scientist?
7. Is it possible for the team to be formed in a single city? If so, where would it be cheapest?

In addition, visualize subsections 1-5 using the chart types of your choice.

Use the experience gained while working on the data from 'NoFluffJobs' to consider what simplifications have been made and how they potentially could influence the full picture of the analysis.

In [None]:
import pandas as pd

# Path to the processed job-offers CSV used throughout this notebook
processed_csv_path = '...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\data\\processed\\job_offers_2024_02_11.csv'

# Parsing options: comma as field separator, comma as decimal mark, UTF-8 text
read_options = dict(sep=',', decimal=',', encoding='utf-8')
df = pd.read_csv(processed_csv_path, **read_options)

# Report the table dimensions, then preview the first two rows as the cell output
rows_and_columns = df.shape
print("Step 0: Original Table - Shape:", rows_and_columns)
df.head(2)

In [None]:
# Print the complete DataFrame as plain text, leaving out the index column
print("Final dataFrame:")
full_table_text = df.to_string(index=False)
print(full_table_text)

# 1. How many jobs for a particular position

In [None]:
# Number of rows (offers) per job title, as a frame with columns 'job' and 'count'
offers_by_job = df.groupby('job')
job_counts = offers_by_job.size().reset_index(name='count')

# Keep the frame as the last expression so the notebook renders it
job_counts

# 2. Average salary for each job

In [None]:
# Make sure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Mean of 'salary_avg' for every job title, rounded to two decimals
salary_by_job = df.groupby('job')['salary_avg']
average_salary_per_job = salary_by_job.mean().round(2)

# Show the per-job averages
print(average_salary_per_job)

# 3. Which city has the most job offers + own calculations

In [None]:
# Count the offers (non-null 'job' values) listed for every city
offers_by_city = df.groupby('location_city')
job_offers_per_city = offers_by_city['job'].count()

# Largest count and the city it belongs to
number_of_job_offers_in_most_city = job_offers_per_city.max()
city_with_most_job_offers = job_offers_per_city.idxmax()

# Report the winner
print(f"The city with the most job offers is {city_with_most_job_offers} with {number_of_job_offers_in_most_city} job offers.")


3.a Which city has the most jobs per particular job category

In [None]:
# Offer count for every (location_city, job) combination
jobs_per_city_per_job = df.groupby(['location_city', 'job'])['job'].count()

# Within each job title: the index label of the largest count, and the count itself
counts_grouped_by_job = jobs_per_city_per_job.groupby('job')
most_jobs_per_job_in_city = counts_grouped_by_job.idxmax()
most_jobs_count_per_job_in_city = counts_grouped_by_job.max()

# Report the top city per job title
for job, top_index_label in most_jobs_per_job_in_city.items():
    # Each label is a (location_city, job) pair; only the city part is needed here
    city = top_index_label[0]
    print(f"For job title '{job}', the city with the most jobs is '{city}' with {most_jobs_count_per_job_in_city[job]} jobs.")

3.b Counting type of location for each job category (specific location/remote)

In [None]:
# Count the offers for every (job, location_remote) combination
jobs_count_per_location = df.groupby(['job', 'location_remote'])['job'].count()

# Report remote and on-site offers separately for each job title
for (job, location_remote), count in jobs_count_per_location.items():
    if location_remote:
        print(f"For job title '{job}', there are {count} remote job(s).")
        continue

    # Cities are only needed for the on-site branch, so look them up here
    in_this_group = (df['job'] == job) & (df['location_remote'] == location_remote)
    # Convert to str before joining: a missing city (NaN) is a float and
    # would make ', '.join(...) raise TypeError
    cities = [str(city) for city in df.loc[in_this_group, 'location_city'].unique()]

    print(f"For job title '{job}', there are {count} job(s) available in specific cities: {', '.join(cities)}")

In [None]:
# Offer count for every (job, location_remote) combination
jobs_count_per_location = df.groupby(['job', 'location_remote'])['job'].count()

# Per job title, describe the non-remote offers and how often each city appears
for (job, location_remote), count in jobs_count_per_location.items():
    if location_remote:
        # Remote groups are skipped in this report
        continue

    is_this_group = (df['job'] == job) & (df['location_remote'] == location_remote)
    specific_city_data = df.loc[is_this_group, 'location_city']

    # Distinct city names, converted to strings so they can be joined
    cities = list(map(str, specific_city_data.unique()))

    # How many offers of this job title each city has
    city_occurrences = specific_city_data.value_counts()

    print(f"For job title '{job}', there are {count} job(s) available in specific cities: {', '.join(cities)}")

    # Per-city breakdown
    print(f"City occurrences for job title '{job}':")
    for city, occurrences in city_occurrences.items():
        print(f"- {city}: {occurrences} times")
    print()

# 4. Where is the highest rate offered? (calculated from salary_avg)

In [None]:
# Make sure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Mean 'salary_avg' of all offers in each city
salary_by_city = df.groupby('location_city')['salary_avg']
average_salary_per_city = salary_by_city.mean()

# Highest city average and the city it belongs to
highest_salary = average_salary_per_city.max()
city_with_highest_salary = average_salary_per_city.idxmax()

# Report the result
print(f"The city with the highest average salary across all jobs is '{city_with_highest_salary}' with a salary of {highest_salary:.2f} PLN.")

Calculation of the highest rate for each job title

In [None]:
# Make sure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Mean 'salary_avg' for every (job, location_city) combination
average_salary_per_city_per_job = df.groupby(['job', 'location_city'])['salary_avg'].mean()

# Within each job title: the index label of the best-paying city, and its average
averages_grouped_by_job = average_salary_per_city_per_job.groupby('job')
city_with_highest_salary_per_job = averages_grouped_by_job.idxmax()
highest_salary_per_job = averages_grouped_by_job.max()

# Report the best-paying city per job title
for job, city_index in city_with_highest_salary_per_job.items():
    # Each label is a (job, location_city) pair; the city is the second element
    _job_level, city = city_index
    print(f"For job name '{job}', the city with the highest average salary is '{city}' with a salary of {highest_salary_per_job[job]:.2f} PLN")

# 5. What is the difference between the offer for a regular and a senior?
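The first cell below compares the overall average salary of regular and senior offers. The following cell groups by both the 'job' and 'is_senior' columns and calculates the average salary for each combination, so its output shows the average salary for regular and senior positions for each job title.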

In [None]:
import pandas as pd

# Make sure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Split the offers by the 'is_senior' flag (0 = regular, 1 = senior)
is_regular_offer = df['is_senior'] == 0
is_senior_offer = df['is_senior'] == 1
regular_data = df[is_regular_offer]
senior_data = df[is_senior_offer]

# Mean salary of each group across all job titles
overall_avg_salary_senior = senior_data['salary_avg'].mean()
overall_avg_salary_regular = regular_data['salary_avg'].mean()

# Report both averages
print(f"Overall average salary for Regular positions: {overall_avg_salary_regular:.2f} PLN.")
print(f"Overall average salary for Senior positions: {overall_avg_salary_senior:.2f} PLN.")

# Senior premium: how much more the average senior offer pays
difference = overall_avg_salary_senior - overall_avg_salary_regular
print(f"Difference in average salary (Senior - Regular): {difference:.2f} PLN.")


# ... and a further breakdown for each job title

In [None]:
# Make sure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Mean 'salary_avg' for every (job, is_senior) combination
salary_by_job_and_seniority = df.groupby(['job', 'is_senior'])['salary_avg']
average_salary_per_seniority = salary_by_job_and_seniority.mean()

# Report regular and senior averages per job title
for job in df['job'].unique():
    # A combination with no offers falls back to 0 and is printed as 0.00
    avg_salary_senior = average_salary_per_seniority.get((job, 1), 0)
    avg_salary_regular = average_salary_per_seniority.get((job, 0), 0)

    print(f"For job title '{job}':")
    print(f"  Average salary for Regular: {avg_salary_regular:.2f} PLN")
    print(f"  Average salary for Senior: {avg_salary_senior:.2f} PLN")
    print()

# 6. What is the lowest possible cost to hire a team consisting of a data analyst, data engineer and data scientist?

In [None]:
# Make sure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# One sub-table per team role
data_analyst_data = df.loc[df['job'] == 'data analyst']
data_engineer_data = df.loc[df['job'] == 'data engineer']
data_scientist_data = df.loc[df['job'] == 'data scientist']

# Cheapest 'salary_avg' found for each role
lowest_cost_data_analyst = data_analyst_data['salary_avg'].min()
lowest_cost_data_engineer = data_engineer_data['salary_avg'].min()
lowest_cost_data_scientist = data_scientist_data['salary_avg'].min()

# Team cost = cheapest analyst + cheapest engineer + cheapest scientist
total_lowest_cost = (
    lowest_cost_data_analyst
    + lowest_cost_data_engineer
    + lowest_cost_data_scientist
)

# Report the total
print(f"The lowest possible cost to hire a team is: {total_lowest_cost:.2f} PLN.")

# 7. Is it possible for the team to be formed in a single city? If so, where would it be cheapest?

In [None]:
# Make sure 'salary_avg' is numeric; values that cannot be parsed become NaN
df['salary_avg'] = pd.to_numeric(df['salary_avg'], errors='coerce')

# Filter data for each role
data_analyst_data = df[df['job'] == 'data analyst']
data_engineer_data = df[df['job'] == 'data engineer']
data_scientist_data = df[df['job'] == 'data scientist']

# Find the city of the single cheapest offer for each role
city_lowest_cost_data_analyst = data_analyst_data.loc[data_analyst_data['salary_avg'].idxmin(), 'location_city']
city_lowest_cost_data_engineer = data_engineer_data.loc[data_engineer_data['salary_avg'].idxmin(), 'location_city']
city_lowest_cost_data_scientist = data_scientist_data.loc[data_scientist_data['salary_avg'].idxmin(), 'location_city']

# Display the per-role result
print(f"The city with the lowest salary for a Data Analyst is: {city_lowest_cost_data_analyst}")
print(f"The city with the lowest salary for a Data Engineer is: {city_lowest_cost_data_engineer}")
print(f"The city with the lowest salary for a Data Scientist is: {city_lowest_cost_data_scientist}")

# The per-role cities above do not say whether ONE city can host the whole team.
# To answer that, build a city x role table of the cheapest offer and keep only
# the cities that have an offer for every role.
team_roles = ['data analyst', 'data engineer', 'data scientist']
team_offers = df[df['job'].isin(team_roles)]

# Rows without a city are left out by groupby's default handling of missing keys.
# reindex guarantees one column per role even if a role has no city-based offer.
cheapest_offer_per_city = (
    team_offers.groupby(['location_city', 'job'])['salary_avg']
    .min()
    .unstack('job')
    .reindex(columns=team_roles)
)

# A city qualifies only if it has a priced offer for all three roles
single_city_candidates = cheapest_offer_per_city.dropna()

if single_city_candidates.empty:
    print("It is not possible to form the whole team in a single city.")
else:
    # Team cost per city = sum of the cheapest offer for each role in that city
    team_cost_per_city = single_city_candidates.sum(axis=1)
    cheapest_team_city = team_cost_per_city.idxmin()
    cheapest_team_cost = team_cost_per_city.min()
    candidate_names = ', '.join(str(city) for city in single_city_candidates.index)
    print(f"The whole team can be formed in a single city in: {candidate_names}")
    print(f"The cheapest single-city team is in {cheapest_team_city} with a total cost of {cheapest_team_cost:.2f} PLN.")


# Move the notebook to the notebooks folder

In [None]:
import shutil
import os

def move_notebook_to_notebooks(notebook_path, notebooks_folder):
    """Move a notebook file into the given notebooks folder.

    The file keeps its base name. The destination folder is created first if
    it does not exist, so the move does not fail on a missing directory.

    Args:
        notebook_path: Path of the notebook file to move.
        notebooks_folder: Folder that should receive the notebook.

    Errors raised by ``os.makedirs`` or ``shutil.move`` (for example when the
    source file is missing) are not caught here.
    """
    notebook_filename = os.path.basename(notebook_path)

    # shutil.move does not create missing parent directories of the target
    os.makedirs(notebooks_folder, exist_ok=True)

    # Move the notebook file to the "notebooks" subfolder
    shutil.move(notebook_path, os.path.join(notebooks_folder, notebook_filename))
    print(f"Notebook '{notebook_filename}' moved to the 'notebooks' subfolder.")

# Source notebook and target folder for this workshop (hard-coded paths)
notebooks_folder = "\\SESSION 6 WORKSHOP\\Phyton_Workshop\\notebooks"
notebook_path = "\\SESSION 6 WORKSHOP\\WEBscraping\\4.0_data_analysis.ipynb"

# Running this cell moves the notebook file on disk
move_notebook_to_notebooks(
    notebook_path=notebook_path,
    notebooks_folder=notebooks_folder,
)