# Transforming data

Data in the data/interim directory is ready for further processing. Because the format is tabular, we can process it with Pandas. Before proceeding to data analysis, this data must be cleaned:
1. From the dataset, discard offers that do not have the amount specified in PLN,
2. Change the names of job offers and cities (name and location_city columns) to lower case,
3. Add a new salary_avg column as an average of the salary_high and salary_low columns,
4. Unify the names of cities:
    - e.g. wroclove, wroclaw, wrocław should be changed to wrocław,
    - e.g. krakow, kraków +1, kraków +2 should be changed to kraków,
    - all other cases that appear should also be cleaned.

5. NoFluffJobs does not share information on the country of employment, so a location_country column needs to be created or completed:
    - if the work is remote, set N/A,
    - otherwise, unless the city name suggests otherwise, set PL,
Check whether the location_country column is filled correctly

6. Add a new column is_senior, that informs whether the position is a senior one or not. To do so use the name of the position: e.g. Senior Data Analyst -> is_senior = 1,

7. Save the results to the data\processed\ directory, using the ; separator, UTF-8 encoding and without index (index=False). Take the same file name as in task 2 i.e. job_offers_yyyy_mm_dd.csv.

# Hints:
To unify the names of the cities, use the unique() method and manually correct the rows where the names are incorrect.
To check the name of the position for the word "Senior", use regular expressions and the re.match method.

In [None]:
import pandas as pd
# Load the interim job-offers table into `df`.
interim_csv_path = '...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\data\\interim\\job_offers_2024_02_11.csv'
read_options = {
    'sep': ',',          # adjust the separator to match the interim file
    # NOTE(review): the decimal mark is the same character as the separator - confirm this is intended.
    'decimal': ',',
    'encoding': 'utf-8',
}
df = pd.read_csv(interim_csv_path, **read_options)

print("Step 0: Original Table - Shape:", df.shape)
df.head(2)

In [None]:
# Print the whole table as plain text, without the row index
print("DataFrame:")
full_table_text = df.to_string(index=False)
print(full_table_text)

Extract information from the location dictionary column and create new columns

In [None]:
import ast
# An abstract syntax tree (AST) is a hierarchical representation of the syntactic structure of a Python program.
# The 'ast' module allows you to parse Python source code into an AST, manipulate the AST, and generate new Python code from the modified AST.
# Below it is used for ast.literal_eval, which turns the text of a Python literal (here: a dict) into the corresponding object.

# Create new columns based on the information in the "location" and "salary" columns

### Function to extract location_city
def extract_city(location):
    """Return the lower-cased city stored in a serialized location dict.

    Args:
        location: Cell value of the ``location`` column; expected to be the
            text of a dict literal such as ``"{'city': 'Warszawa'}"``.

    Returns:
        The string under the ``'city'`` key in lower case, or ``''`` when the
        cell is not a string, cannot be parsed into a dict, or holds no
        string city value.
    """
    if not isinstance(location, str):
        return ''
    try:
        location_dict = ast.literal_eval(location)
    except (ValueError, SyntaxError):
        # Empty or malformed cell: treat it like a missing location instead of
        # aborting the whole column-wide .apply().
        return ''
    if not isinstance(location_dict, dict):
        return ''
    city = location_dict.get('city')
    # The key can be present with None (or another non-string) as its value;
    # calling .lower() on that would raise AttributeError.
    return city.lower() if isinstance(city, str) else ''

# Apply extract_city to every 'location' cell to create the 'location_city' column
df['location_city'] = df['location'].apply(extract_city)



##### Function to extract location_country information
def extract_country(location):
    """Return the lower-cased country stored in a serialized location dict.

    Args:
        location: Cell value of the ``location`` column; expected to be the
            text of a dict literal such as ``"{'country': 'PL'}"``.

    Returns:
        The string under the ``'country'`` key in lower case, or ``''`` when
        the cell is not a string, cannot be parsed into a dict, or holds no
        string country value.
    """
    if not isinstance(location, str):
        return ''
    try:
        location_dict = ast.literal_eval(location)
    except (ValueError, SyntaxError):
        # Empty or malformed cell: treat it like a missing location instead of
        # aborting the whole column-wide .apply().
        return ''
    if not isinstance(location_dict, dict):
        return ''
    country = location_dict.get('country')
    # The key can be present with None (or another non-string) as its value;
    # calling .lower() on that would raise AttributeError.
    return country.lower() if isinstance(country, str) else ''

# Apply extract_country to every 'location' cell to create the 'location_country' column
df['location_country'] = df['location'].apply(extract_country)



##### Function to extract remote information
def extract_remotely(location):
    """Return the ``'remotely'`` entry of a serialized location dict.

    Non-string cells, and dicts without the key, yield ``''``. The stored
    value is returned as-is (no case conversion).
    """
    if not isinstance(location, str):
        return ''
    parsed = ast.literal_eval(location)
    return parsed.get('remotely', '')

# Derive the 'location_remote' column from the serialized 'location' dicts
remote_values = df['location'].apply(extract_remotely)
df['location_remote'] = remote_values

print("Shape:", df.shape)
df.head()

Extract information from the salary column and create new columns

In [None]:
##### Function to extract salary_low information
def extract_low_salary(salary):
    """Return the lower salary bound stored in a serialized salary dict.

    Args:
        salary: Cell value of the ``salary`` column; expected to be the text
            of a dict literal such as ``"{'low': '10000', 'high': '15000'}"``.

    Returns:
        The ``'low'`` value as a float, or ``None`` when the cell is not a
        string, cannot be parsed into a dict, or the value is missing or not
        a plain non-negative number (e.g. ``'N/A'``).
    """
    if not isinstance(salary, str):
        return None
    try:
        salary_dict = ast.literal_eval(salary)
    except (ValueError, SyntaxError):
        # Empty or malformed cell: no usable salary information.
        return None
    if not isinstance(salary_dict, dict):
        return None

    low_value = salary_dict.get('low')
    # bool is a subclass of int; it is never a salary.
    if isinstance(low_value, bool):
        return None
    # The amount may already be numeric; str.isdigit() would raise on it.
    if isinstance(low_value, (int, float)):
        return float(low_value)
    if isinstance(low_value, str):
        text = low_value.strip()
        # Accept plain digits with at most one decimal point ("12500" or "12500.50").
        if text.replace('.', '', 1).isdecimal():
            return float(text)
    return None

# Apply the function to create 'salary_low' column
df['salary_low'] = df['salary'].apply(extract_low_salary)


##### Function to extract salary_high information
def extract_high_salary(salary):
    """Return the upper salary bound stored in a serialized salary dict.

    Args:
        salary: Cell value of the ``salary`` column; expected to be the text
            of a dict literal such as ``"{'low': '10000', 'high': '15000'}"``.

    Returns:
        The ``'high'`` value as a float, or ``None`` when the cell is not a
        string, cannot be parsed into a dict, or the value is missing or not
        a plain non-negative number (e.g. ``'N/A'``).
    """
    if not isinstance(salary, str):
        return None
    try:
        salary_dict = ast.literal_eval(salary)
    except (ValueError, SyntaxError):
        # Empty or malformed cell: no usable salary information.
        return None
    if not isinstance(salary_dict, dict):
        return None

    high_value = salary_dict.get('high')
    # bool is a subclass of int; it is never a salary.
    if isinstance(high_value, bool):
        return None
    # The amount may already be numeric; str.isdigit() would raise on it.
    if isinstance(high_value, (int, float)):
        return float(high_value)
    if isinstance(high_value, str):
        text = high_value.strip()
        # Accept plain digits with at most one decimal point ("15000" or "15000.50").
        if text.replace('.', '', 1).isdecimal():
            return float(text)
    return None

# Apply the function to create 'salary_high' column
df['salary_high'] = df['salary'].apply(extract_high_salary)


##### Function to extract salary_currency information
def extract_currency(salary):
    """Return the ``'currency'`` entry of a serialized salary dict.

    Non-string cells, and dicts without the key, yield ``''``. The stored
    value is returned as-is (no case conversion).
    """
    if not isinstance(salary, str):
        return ''
    parsed = ast.literal_eval(salary)
    return parsed.get('currency', '')

# Derive the 'salary_currency' column from the serialized 'salary' dicts
currency_values = df['salary'].apply(extract_currency)
df['salary_currency'] = currency_values

print("Shape:", df.shape)
df.head(2)

# 1. Discard offers without the amount specified in PLN

In [None]:
# Step 1: keep only the offers whose salary is quoted in PLN.
#
# Alternative kept for reference - filter on the raw text of the original
# 'salary' column instead of the extracted 'salary_currency' column:
# df = df[df['salary'].str.contains("'currency': 'PLN'")]

# .copy() detaches the filtered frame from the frame it was sliced from, so
# the column assignments made in the following steps (e.g.
# df['location_city'] = ...) act on this frame directly and do not raise
# pandas' SettingWithCopyWarning.
df = df[df['salary_currency'] == 'PLN'].copy()

# Display the filtered table after Step 1
print("Step 1: Discard offers without PLN currency:", df.shape)
df.head(2)

# 2. Change names of job offers and cities to lowercase

In [None]:
# df['name'] = df['name'].str.lower()
# df['salary_currency'] = df['salary_currency'].str.lower()

# Lower-case the offer names and the currency codes in place.
# ('location_city' is already lower-cased when it is extracted by extract_city.)
for text_column in ('name', 'salary_currency'):
    df.loc[:, text_column] = df[text_column].str.lower()

# Report the shape and preview the table after Step 2
print("Step 2: Change names to lowercase - Shape:", df.shape)
display(df.head(2))


# 3. Add a new salary_avg column

In [None]:
# df['salary_low'] = df['salary_low'].astype(float)
# df['salary_high'] = df['salary_high'].astype(float)
# df['salary_avg'] = (df['salary_low'] + df['salary_high']) / 2

# With loc.

# Make sure both salary bounds are floats before averaging them
for bound_column in ('salary_low', 'salary_high'):
    df.loc[:, bound_column] = df[bound_column].astype(float)

# salary_avg is the midpoint of the two bounds
bounds_total = df['salary_low'] + df['salary_high']
df.loc[:, 'salary_avg'] = bounds_total / 2

# Preview the table after Step 3
print("Step 3: Add salary_avg column:")
df.head(2)

# 5. Correct information in Location_city. Remote set to N/A

In [None]:
def _remote_marker(city):
    """Return 'remote' when the value mentions 'zdalnie' (any case), else the value unchanged."""
    if 'zdalnie' in str(city).lower():
        return 'remote'
    return city


# First pass: every city value containing 'zdalnie' becomes 'remote'
df['location_city'] = df['location_city'].apply(_remote_marker)

# Second pass: 'remote' becomes the 'N/A' placeholder
df['location_city'] = df['location_city'].replace(to_replace='remote', value='N/A')

# Define a function to set location_country based on location_city
def set_location_country(row):
    """Return the country value for one row.

    Rows whose 'location_city' is 'N/A' get 'N/A'; every other row keeps its
    current 'location_country' value.

    NOTE(review): the task description asks for 'PL' on non-remote rows unless
    the city suggests otherwise; this function does not set it - confirm
    whether that is handled elsewhere.
    """
    is_remote = row['location_city'] == 'N/A'
    return 'N/A' if is_remote else row['location_country']

# Apply the function row by row (axis=1) to update location_country
df['location_country'] = df.apply(set_location_country, axis=1)

# Preview the first rows of the updated table
df.head()


# 4. Unify the names of cities by manual correction

In [None]:
# Snapshot of the distinct city names (as strings) before any correction
unique_cities = df['location_city'].unique().astype(str)

# Interactive review: each distinct name is shown once and the user may type a
# replacement; an empty answer leaves that name untouched.
for city in unique_cities:
    print(f"Current city name: {city}")
    replacement = input("Enter the corrected city name (or press Enter to keep it unchanged): ").strip()
    if not replacement:
        continue
    rows_with_city = df['location_city'] == city
    df.loc[rows_with_city, 'location_city'] = replacement

# Announce the result of Step 4
print("Step 4: Unify city names:")

# Distinct city names after the corrections
unique_cities = df['location_city'].unique()
unique_cities


# 6. Add a new column is_senior

In [None]:
import re

############    LAMBDA
# data['is_senior'] = data['name'].apply(lambda x: 1 if re.match(r'.*\bSenior\b.*', x) else 0)

########### column
# def is_senior_position(position_name):
#     return 1 if re.match(r'.*\bSenior\b.*', position_name, flags=re.IGNORECASE) else 0
# df['is_senior'] = df['name'].apply(is_senior_position)

########### LOC

def is_senior_position(position_name):
    """Return 1 when the position name contains the standalone word "senior", else 0.

    The check is case-insensitive and matches the word anywhere in the name,
    including after a line break. Non-string values (e.g. a missing name)
    yield 0 instead of raising.

    Args:
        position_name: Job title to inspect.

    Returns:
        1 for a senior position, 0 otherwise.
    """
    if not isinstance(position_name, str):
        return 0
    # re.search scans the whole string; the previous re.match with a leading
    # '.*' could not cross a newline before the word.
    return 1 if re.search(r'\bsenior\b', position_name, flags=re.IGNORECASE) else 0

# Create a new column 'is_senior': start from 0 everywhere, then flag the
# rows whose name is recognised as a senior position via .loc
df['is_senior'] = 0
senior_positions = df['name'].apply(is_senior_position) == 1
df.loc[senior_positions, 'is_senior'] = 1

################## Display the table after Step 6

print("\nStep 6: Add a new column is_senior:")

# Only the last expression of a cell is rendered automatically, so the
# preview needs an explicit display() call to actually show up.
display(df.head(2))
df.shape

# Cleaning process (duplicates, missing values, etc.)

Show the data before the cleaning process, for a later check

In [None]:
# Group by 'job' column and count the occurrences
job_counts = df.groupby('job').size().reset_index(name='count')

# Show the counts for each job. A bare `job_counts` expression is only
# rendered when it is the last statement of the cell, so display() is needed
# here because a print() follows.
display(job_counts)

print(f"Number of rows in raw df is {df.shape[0]}")

1. cleaning up step - CHECK and DELETE duplicates

In [None]:
# Flag every row that shares all of the listed columns with at least one
# other row (keep=False marks all members of a duplicate group, not only the
# repeats)
condition = df.duplicated(subset=['name', 'company', 'job', 'location_city', 'location_country', 'location_remote', 'is_senior'], keep=False)

# Select the flagged rows
duplicates = df[condition]

# Show the duplicated rows. A bare `duplicates` expression is only rendered
# when it is the last statement of the cell, so display() is needed here
# because a print() follows.
print("Duplicated Rows:")
display(duplicates)

# Print information about the number of duplicate rows
print(f"\nNumber of duplicated rows: {duplicates.shape[0]}\n")

In [None]:
# Columns that together identify an offer; rows equal on all of them count as duplicates
dedup_columns = [
    'name',
    'company',
    'job',
    'location_city',
    'location_country',
    'location_remote',
    'is_senior',
]

# Keep the first row of every duplicate group
df_cleaned = df.drop_duplicates(subset=dedup_columns, keep='first')

# Report how many rows the de-duplication removed
deleted_rows = len(df) - len(df_cleaned)
print(f"\nNumber of rows deleted: {deleted_rows}")

print(f"df_cleaned has {df_cleaned.shape[0]} rows")

First check the data after cleaning duplicates

In [None]:
# Count the de-duplicated offers per 'job' value
cleaned_by_job = df_cleaned.groupby('job')
job_counts = cleaned_by_job.size().reset_index(name='count')

# Last expression of the cell: rendered as the cell output
job_counts

2. cleaning up step - create a new column and filter out offers where the job title (name) and the job name {data - analyst, engineer, scientist} don't match. Logic - if the job title does not include the job name, set the boolean value to FALSE, otherwise TRUE.
Print info about the number of rows where the is_present column value = TRUE

In [None]:
def _job_in_name_flags(frame):
    """Return a boolean Series: True where the row's 'job' text occurs in its 'name' (case-insensitive).

    Rows whose 'job' or 'name' is not a string are flagged False.
    """
    flags = [
        isinstance(job, str) and isinstance(name, str) and job.lower() in name.lower()
        for job, name in zip(frame['job'], frame['name'])
    ]
    return pd.Series(flags, index=frame.index, dtype=bool)


# Flag the rows of the working table (case-insensitive match of job in name)
df['is_present'] = _job_in_name_flags(df)

# df_cleaned was produced by drop_duplicates() before 'is_present' existed, so
# it does not carry the column; compute it on the de-duplicated table as well,
# otherwise the filter below fails with KeyError: 'is_present'.
df_cleaned = df_cleaned.assign(is_present=_job_in_name_flags(df_cleaned))

# Preview the table with the new column (explicit display(): only the last
# expression of a cell is rendered automatically)
display(df.head(2))

# Keep only the de-duplicated rows where the job name occurs in the title
df_final = df_cleaned[df_cleaned['is_present']]

# Show the shape of the filtered table
display(df_final.shape)

print(f'DF FINAL after filtering match with job and name position has {df_final.shape[0]} rows')

Check the final data

In [None]:
# Count the remaining offers per 'job' value in the final table
final_by_job = df_final.groupby('job')
job_counts = final_by_job.size().reset_index(name='count')

# Last expression of the cell: rendered as the cell output
job_counts

In [None]:
# Print the whole final table as plain text, without the row index
print("DataFrame after cleanup:")
final_table_text = df_final.to_string(index=False)
print(final_table_text)

# save the final df

In [None]:
import os
import datetime

# 7. Save the results to the data\processed\ directory
processed_dir = '...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\data\\processed'
# Create the target directory when it is missing; do nothing if it already exists
os.makedirs(processed_dir, exist_ok=True)

# Generate the file name based on the current date
# NOTE(review): the task text asks for the same file name as the interim
# file; the name below uses today's date instead - confirm which is wanted.
current_date = datetime.datetime.now().strftime("%Y_%m_%d")
file_name = f'job_offers_{current_date}.csv'
file_path = os.path.join(processed_dir, file_name)

# Save the DataFrame in the format required by the task description:
# ';' separator, UTF-8 encoding, no index column
df_final.to_csv(file_path, sep=';', encoding='utf-8', index=False)

print(f"\nStep 7: Results saved to {file_path}")

# Move the notebook to the notebooks folder

In [None]:
import shutil
import os

def move_notebook_to_notebooks(notebook_path, notebooks_folder):
    """Move a notebook file into the given notebooks folder.

    The destination folder is created first when it does not exist yet, so
    the move no longer fails on a fresh project layout.

    Args:
        notebook_path: Path of the notebook file to move.
        notebooks_folder: Directory that should receive the file; the file
            keeps its original name.

    Raises:
        OSError: Propagated from ``os.makedirs`` / ``shutil.move``, e.g. when
            ``notebook_path`` does not exist.
    """
    notebook_filename = os.path.basename(notebook_path)

    # Without this, shutil.move raises when the target directory is missing
    os.makedirs(notebooks_folder, exist_ok=True)

    # Move the notebook file to the "notebooks" subfolder
    destination = os.path.join(notebooks_folder, notebook_filename)
    shutil.move(notebook_path, destination)
    print(f"Notebook '{notebook_filename}' moved to the 'notebooks' subfolder.")

# Source notebook and the folder it should end up in
notebook_path = "...\\SESSION 6 WORKSHOP\\WEBscraping\\3.0_data_transfromation_copy.ipynb"
notebooks_folder = "...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\notebooks"

# Running this cell moves the notebook file on disk
move_notebook_to_notebooks(
    notebook_path=notebook_path,
    notebooks_folder=notebooks_folder,
)