In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import sys
import os 
from datasets import load_dataset

# 🌐 Hugging Face Datasets: fetch the raw "lukebarousse/data_jobs" dataset.
# The result is indexed by split name further down (ds['train']).
ds = load_dataset("lukebarousse/data_jobs")

# Convert into a pandas DataFrame
import pandas as pd
# Use the library's own converter: Dataset.to_pandas() builds the DataFrame in
# one step, whereas pd.DataFrame(ds['train']) makes pandas walk the split row
# by row as Python dicts.
df_raw = ds["train"].to_pandas()

# Parent of the current working directory; added to sys.path so that the
# `features` package imported below can be resolved.
project_root = os.path.abspath("..")

# Guarded append: re-running this cell must not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from features.helpers import filter_jobs
from features.wrangling import * 

### 🔄 Data Cleaning: Locations & Job Types

The `wrangle_country` function defined below aims to:

1. **Remove duplicates** from the dataset.  
2. **Extract the correct country** from the `job_location` column, since there were inconsistencies (e.g., `job_location = "New York"` but `job_country = "Sudan"`).  
3. **Normalize the `job_type` column** by mapping its raw values onto a standard set of job categories.


In [2]:
def wrangle_country(df):
    """Run the three cleaning stages on ``df`` and report row counts.

    Stages, in order: drop exact duplicate rows, apply
    ``filter_location_matches_country``, then apply ``normalize_job_type``.
    A progress line is printed after each stage.

    Args:
        df: DataFrame to clean. It is not reassigned in the caller's scope;
            each stage's result is bound to a new local name.

    Returns:
        The DataFrame returned by the final stage.
    """
    rows_at_start = len(df)
    print(f"Initial rows: {rows_at_start}")

    deduplicated = df.drop_duplicates()
    print(f"Duplicates removed: {rows_at_start - len(deduplicated)}")

    location_checked = filter_location_matches_country(deduplicated)
    print(f"Rows after filtering location: {len(location_checked)}")

    normalized = normalize_job_type(location_checked)
    print(f"Final rows after normalizing job type: {len(normalized)}")
    return normalized
# Clean the raw DataFrame; the result is bound to a new name, df_clean.
df_clean = wrangle_country(df_raw)


Initial rows: 785741
Duplicates removed: 101
Rows after filtering location: 754305
Final rows after normalizing job type: 754305


### 🧹 Removing Outliers

We will remove the outliers identified in the `salary_year_avg` column to ensure a cleaner and more accurate analysis of salaries.

In [3]:
# Drop outliers from the cleaned frame and keep the result as df_cleaned.
# NOTE(review): remove_outliers is not defined in this notebook — presumably it
# comes from the `features.wrangling` star import; confirm which column(s) and
# outlier rule it applies.
df_cleaned = remove_outliers(df_clean) 

### 💾 Save Cleaned Data

Once the data is cleaned, we save it in the `data/clean` folder so it can be used in the **Analysis notebook**.


In [9]:
import os
import pandas as pd  # not required by DataFrame.to_csv itself; kept from the original cell

# 1. Build the output path relative to the project root instead of hard-coding
#    a machine-specific absolute path, so the notebook runs on any checkout.
#    NOTE(review): assumes the notebook runs one level below the project root,
#    the same assumption the setup cell makes with os.path.abspath("..") — confirm.
CLEAN_PATH = os.path.join(os.path.abspath(".."), "data", "clean", "clean_data.csv")

# 2. Folder that will hold the file: <project root>/data/clean
output_dir = os.path.dirname(CLEAN_PATH)

# 3. Create the folder if it is missing; exist_ok=True avoids an error when
#    the folder is already there.
os.makedirs(output_dir, exist_ok=True)

# 4. Write the cleaned DataFrame, leaving the index out of the CSV.
df_cleaned.to_csv(CLEAN_PATH, index=False)

print(f"Archivo guardado exitosamente en: {CLEAN_PATH}")

Archivo guardado exitosamente en: /Users/brian/Documents/jobs/data/clean/clean_data.csv
