In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Path of the filtered TSV export to analyse; point this at your own copy if it
# lives somewhere other than /content.
file_path = '/content/estat_lfsa_ewhun2_filtered.tsv'

# Parse the tab-separated file into a DataFrame (separator spelled out for clarity)
df = pd.read_table(file_path, sep='\t')

# Preview the first five rows
df.head()


Unnamed: 0,"freq,nace_r2,wstatus,worktime,age,sex,unit,geo\TIME_PERIOD",2023
0,"A,A,CFAM,FT,Y15-24,F,HR,AT",: u
1,"A,A,CFAM,FT,Y15-24,F,HR,CH",: u
2,"A,A,CFAM,FT,Y15-24,F,HR,EA20",: u
3,"A,A,CFAM,FT,Y15-24,F,HR,EL",: u
4,"A,A,CFAM,FT,Y15-24,F,HR,ES",: u


In [3]:
# Names of the dimensions packed, comma-separated, into the first raw column
new_cols = ['freq', 'nace_r2', 'wstatus', 'worktime', 'age', 'sex', 'unit', 'geo']

# Guard so the cell can be re-run safely: once the dimension columns exist, the
# combined column has already been dropped, and assigning two names to the
# widened frame would raise a length-mismatch error.
if not set(new_cols).issubset(df.columns):
    # Rename positionally: combined key column first, value column second
    # (adjust if your file has a different layout)
    df.columns = ['multi_column', 'hours']

    # Split the first column by commas into new columns
    df[new_cols] = df['multi_column'].str.split(',', expand=True)

    # Drop the original combined column
    df = df.drop(columns='multi_column')

# Now you can work with it
print(df.head())


  hours freq nace_r2 wstatus worktime     age sex unit   geo
0   : u    A       A    CFAM       FT  Y15-24   F   HR    AT
1   : u    A       A    CFAM       FT  Y15-24   F   HR    CH
2   : u    A       A    CFAM       FT  Y15-24   F   HR  EA20
3   : u    A       A    CFAM       FT  Y15-24   F   HR    EL
4   : u    A       A    CFAM       FT  Y15-24   F   HR    ES


In [12]:
# Keep only the leading number of each raw value (anything after it is discarded)
# and coerce it to float; values that do not start with a digit become NaN.
df['hours_clean'] = pd.to_numeric(
    df['hours'].astype(str).str.extract(r'^(\d+\.?\d*)', expand=False),
    errors='coerce',
)

# Report how many rows produced a usable number and how many did not
converted = df['hours_clean'].notna()
print("Successfully cleaned numeric rows:", converted.sum())
print("Still missing (NaN) after cleaning:", (~converted).sum())


Successfully cleaned numeric rows: 48523
Still missing (NaN) after cleaning: 40998


In [20]:
# Keep only the rows whose hours value could be parsed as a number
has_hours = df['hours_clean'].notna()
df_clean = df.loc[has_hours, :]
df_clean.shape

(48523, 10)

In [22]:
# Preview the first five rows that survived the numeric filter
df_clean.head(n=5)

Unnamed: 0,hours,freq,nace_r2,wstatus,worktime,age,sex,unit,geo,hours_clean
10,35.6 u,A,A,CFAM,FT,Y15-24,F,HR,RO,35.6
11,56.5 u,A,A,CFAM,FT,Y15-24,F,HR,RS,56.5
12,42.9,A,A,CFAM,FT,Y15-24,F,HR,TR,42.9
19,51.6 u,A,A,CFAM,FT,Y15-24,M,HR,EL,51.6
21,43.2,A,A,CFAM,FT,Y15-24,M,HR,EU27_2020,43.2


In [24]:
# Features and target: everything except the raw/cleaned hours and the
# 'freq' and 'unit' columns is used as a predictor.
X = df_clean.drop(columns=['hours', 'hours_clean', 'freq', 'unit'])
y = df_clean['hours_clean']

# One-hot encode all categorical columns. Select them as "everything that is not
# numeric" rather than by object dtype, so text columns stored with a dedicated
# string dtype are not silently left out of the encoder.
categorical_cols = X.select_dtypes(exclude=['number']).columns.tolist()

# Train / Validation / Test split: 60% train, 20% val, 20% test
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2


In [27]:
# Preprocessing step shared by the models: one-hot encode the categorical
# features and ignore categories that were not present when fitting.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='drop',  # any column not listed above is discarded
)


In [30]:
def evaluate_model(name, model):
    """Fit ``model`` behind the one-hot preprocessing and report validation metrics.

    Reads ``preprocessor``, ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    from the notebook namespace.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    model : estimator
        Regressor used as the final step of the pipeline.

    Returns
    -------
    Pipeline
        The pipeline after fitting on the training split.

    Side effects
    ------------
    Prints MAE, RMSE and R² computed on the validation split.
    """
    # Build full pipeline. Pipeline fits its steps in place, so each pipeline
    # gets its own copy of the preprocessor; passing the shared instance would
    # make every call refit the transformer held by previously returned pipelines.
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('regressor', model)
    ])

    # Fit on training data
    pipeline.fit(X_train, y_train)

    # Predict on validation data
    y_pred = pipeline.predict(X_val)

    # Metrics
    print(f"--- {name} ---")
    print("MAE:", mean_absolute_error(y_val, y_pred))
    # RMSE as the square root of MSE, so no `squared=` argument is required
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    print("RMSE:", rmse)
    print("R²:", r2_score(y_val, y_pred))
    print()
    return pipeline

In [31]:
# Baseline: ordinary least squares on the one-hot encoded features
lr_model = evaluate_model(name="Linear Regression", model=LinearRegression())

# Tree ensemble, seeded so repeated runs give the same forest
rf_model = evaluate_model(
    name="Random Forest",
    model=RandomForestRegressor(random_state=42),
)


--- Linear Regression ---
MAE: 2.6955129131935194
RMSE: 3.692548977962478
R²: 0.8413820159578265

--- Random Forest ---
MAE: 0.6960143225141683
RMSE: 1.3502012112154411
R²: 0.978792138263151



In [33]:
# Score the random-forest pipeline on the held-out test split
y_test_pred = rf_model.predict(X_test)

print("--- Final Test Performance (Random Forest) ---")
for label, value in (
    ("MAE:", mean_absolute_error(y_test, y_test_pred)),
    # RMSE is derived as the square root of the mean squared error
    ("RMSE:", mean_squared_error(y_test, y_test_pred) ** 0.5),
    ("R²:", r2_score(y_test, y_test_pred)),
):
    print(label, value)

--- Final Test Performance (Random Forest) ---
MAE: 0.6978355486862443
RMSE: 1.3557230146027412
R²: 0.9782854824501379


In [43]:
import gradio as gr
import pandas as pd

# Dropdown choices are taken from the training features (X_train), not from the
# unfiltered frame: the encoder is configured with handle_unknown='ignore', so a
# value it never saw during fitting is encoded as all zeros and the prediction
# would silently ignore that selection.
def _sorted_categories(column):
    """Return the sorted unique non-null values of one training feature column."""
    return sorted(X_train[column].dropna().unique().tolist())

# Unique sorted values for each categorical variable
nace_r2_options = _sorted_categories('nace_r2')
wstatus_options = _sorted_categories('wstatus')
worktime_options = _sorted_categories('worktime')
age_options = _sorted_categories('age')
sex_options = _sorted_categories('sex')
geo_options = _sorted_categories('geo')

def predict_work_hours(nace_r2, wstatus, worktime, age, sex, geo):
    """Return the predicted work hours for one combination of categories.

    Each argument is the selected value for the feature of the same name. The
    values are packed into a one-row DataFrame and passed to
    ``rf_model.predict``; the first prediction is formatted to two decimals.

    Returns
    -------
    str
        Text of the form ``"Predicted Work Hours: <value>"``.
    """
    # One record keyed by feature name, wrapped in a list to get a single-row frame
    selection = {
        'nace_r2': nace_r2,
        'wstatus': wstatus,
        'worktime': worktime,
        'age': age,
        'sex': sex,
        'geo': geo,
    }
    input_data = pd.DataFrame([selection])

    # predict returns one value per row; only the first (and only) row is needed
    predicted_hours = rf_model.predict(input_data)[0]
    return f"Predicted Work Hours: {predicted_hours:.2f}"

# Gradio UI: one dropdown per feature, in the same order as the arguments of
# predict_work_hours, each pre-selected on the first entry of its option list.
interface = gr.Interface(
    fn=predict_work_hours,
    inputs=[
        gr.Dropdown(choices=options, label=caption, value=options[0])
        for caption, options in (
            ("NACE R2", nace_r2_options),
            ("Work Status", wstatus_options),
            ("Work Time", worktime_options),
            ("Age Group", age_options),
            ("Sex", sex_options),
            ("Geo (Country)", geo_options),
        )
    ],
    outputs=gr.Label(label="Prediction"),
    title="Work Hours Predictor",
    description="Select the categories for each variable to see the predicted work hours.",
)

# Start the app server for the interface
interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6bd588911fcb268019.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [44]:
pip freeze > requirements.txt
