In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
%matplotlib inline


In [2]:
# Load the "Proportion of women in managerial positions (%)" CSV from the
# Resources folder. No index column is requested, so the frame keeps the
# default integer row index.
csv_path = Path("Resources/Proportion of women in managerial positions (%).csv")
df = pd.read_csv(csv_path)

# Preview the first rows of the raw data
df.head()

Unnamed: 0,ref_area.label,indicator.label,source.label,time,obs_value,obs_status.label,note_classif.label,note_indicator.label,note_source.label
0,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,LFS - Labour Force Survey,2021,,Unreliable,,,Repository: ILO-STATISTICS - Micro data proces...
1,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,LFS - Labour Force Survey,2020,4.9,Unreliable,,,Repository: ILO-STATISTICS - Micro data proces...
2,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,HIES - Households Living Conditions Survey,2017,4.1,Unreliable,,,Repository: ILO-STATISTICS - Micro data proces...
3,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,HIES - Households Living Conditions Survey,2014,6.4,Unreliable,,,Repository: ILO-STATISTICS - Micro data proces...
4,Angola,SDG indicator 5.5.2 - Proportion of women in m...,LFS - Employment Survey,2021,15.4,,,,Repository: ILO-STATISTICS - Micro data proces...


In [3]:
# Clean the DataFrame: drop the free-text note columns, give the remaining
# columns short names, label missing text entries, and make sure the target
# column is numeric.

# Note columns that are not used by the analysis. errors='ignore' lets this
# cell be re-run after the columns are already gone.
columns_to_drop = ['note_classif.label', 'note_indicator.label', 'note_source.label']

df = (
    df
    .drop(columns=columns_to_drop, errors='ignore')
    .rename(columns={
        'ref_area.label': 'Area',
        'indicator.label': 'Indicator',
        'source.label': 'Source',
        'time': 'Year',
        'obs_value': 'Value',
    })
)

# Fill missing values with 'Unknown' in non-numeric columns only.
# Writing the string into a numeric column (as a blanket fillna would) turns
# it into an object column and only "works" because the to_numeric call below
# coerces the placeholder back to NaN.
text_cols = df.select_dtypes(exclude='number').columns
df = df.fillna({col: 'Unknown' for col in text_cols})

# Guarantee a numeric target; any stray non-numeric entry becomes NaN so that
# it can be removed by the later dropna step.
df['Value'] = pd.to_numeric(df['Value'], errors='coerce')

# Remove exact duplicate rows
df = df.drop_duplicates()
df.head()

Unnamed: 0,Area,Indicator,Source,Year,Value,obs_status.label
0,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,LFS - Labour Force Survey,2021,,Unreliable
1,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,LFS - Labour Force Survey,2020,4.9,Unreliable
2,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,HIES - Households Living Conditions Survey,2017,4.1,Unreliable
3,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,HIES - Households Living Conditions Survey,2014,6.4,Unreliable
4,Angola,SDG indicator 5.5.2 - Proportion of women in m...,LFS - Employment Survey,2021,15.4,Unknown


In [4]:
# Discard every row that still contains at least one NaN
# (rows whose 'Value' could not be parsed as a number end up here).
df.dropna(axis="index", how="any", inplace=True)
df.head()

Unnamed: 0,Area,Indicator,Source,Year,Value,obs_status.label
1,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,LFS - Labour Force Survey,2020,4.9,Unreliable
2,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,HIES - Households Living Conditions Survey,2017,4.1,Unreliable
3,Afghanistan,SDG indicator 5.5.2 - Proportion of women in m...,HIES - Households Living Conditions Survey,2014,6.4,Unreliable
4,Angola,SDG indicator 5.5.2 - Proportion of women in m...,LFS - Employment Survey,2021,15.4,Unknown
5,Angola,SDG indicator 5.5.2 - Proportion of women in m...,LFS - Employment Survey,2019,13.3,Break in series


In [5]:
# Pull the target column out and keep everything else as features.
# NOTE: at this point X still holds the raw text columns; the next cell
# one-hot encodes df and rebinds both X and y.
y = df['Value']
X = df.drop(columns='Value')

In [6]:
# Seed shared by the train/test split and the forest so that the reported
# metrics are reproducible on a fresh kernel run.
RANDOM_STATE = 42

# One-hot encode the text columns using pandas get_dummies.
# Numeric columns, including the 'Value' target, are passed through as-is.
df = pd.get_dummies(df)

# Separate the target variable from the encoded features
X = df.drop(columns='Value')
y = df['Value']

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Initialize and train the Random Forest model.
# Without random_state the bootstrap samples and feature subsets differ on
# every run, so RMSE / R^2 would change each time the cell is executed.
model = RandomForestRegressor(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Predict on the held-out rows
predictions = model.predict(X_test)

# Evaluate the model: RMSE is in the same units as 'Value'
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(f"Model RMSE: {rmse}")
print(f"Model R^2: {r2}")

Model RMSE: 3.5356668551518626
Model R^2: 0.8975240367643619
