In [None]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import seaborn as sns
# Use seaborn's "darkgrid" style as the notebook-wide plotting theme.
sns.set_theme(style="darkgrid")

### Load the dataset

In [None]:
# Read the employee workbook into `df`, then print its column/dtype/non-null summary.
df = pd.read_excel(io='./data/employes_dataset.xlsx')
df.info()

### Drop rows with missing values in the feature columns

In [None]:
# Rows are dropped only for gaps in columns other than the target:
# an empty `Télétravail (%)` is kept on purpose.
target_column = 'Télétravail (%)'
required_columns = [name for name in df.columns if name != target_column]
df = df.dropna(subset=required_columns)
df.info()

### Examine if `Télétravail (%)` is correlated with other variables


Create column `Ancienneté (années)` based on `Date d'embauche` column.


In [None]:
# Reference date for the seniority computation: the moment this cell is run.
today = datetime.today()

# Whole days elapsed since the hiring date, expressed in 365-day years
# and rounded to two decimals.
days_since_hire = (today - df["Date d'embauche"]).dt.days
df["Ancienneté (années)"] = (days_since_hire / 365).round(2)
df.info()

Before examining the correlations, I one-hot encode the `Pays`, `Département` and `Ville` columns.

In [None]:
# One-hot encode the categorical columns (this replaces an earlier
# LabelEncoder experiment). With drop_first=True the first level of each
# column is omitted, giving k-1 dummy columns for k levels.
categorical_columns = ['Pays', 'Département', 'Ville']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [None]:
# Correlate every non-object column with the target and list the coefficients
# from most positive to most negative. Pairs whose coefficient is NaN are dropped.
non_object_columns = [name for name, dtype in df.dtypes.items() if dtype != 'object']
correlation_matrix = (
    df[non_object_columns]
    .corr()["Télétravail (%)"]
    .dropna()
    .sort_values(ascending=False)
)
print(correlation_matrix)

### I don't see any strong correlation between `Télétravail (%)` and the other columns, but I will continue with the `Pays`, `Département` and `Ville` dummy columns, together with `Ancienneté (années)`, as features.

In [None]:
# Keep only the modelling columns: the one-hot dummies generated from Pays,
# Département and Ville, plus the target and the seniority feature.
print(df.shape)

# Match on the "<source column>_" prefix only. The previous pattern ended in
# `\w+$`, which rejects dummy names containing spaces, hyphens or apostrophes
# and therefore silently dropped those features from the model.
is_dummy_column = df.columns.str.match(r"^(Pays|Département|Ville)_")
is_kept_column = df.columns.isin(["Télétravail (%)", "Ancienneté (années)"])
df = df.loc[:, is_dummy_column | is_kept_column]

print(df.shape)

In [None]:
# Count the missing values remaining in each column.
df.isna().sum()

In [None]:
# Separate the rows by whether the target is known. Explicit copies are taken
# so that later column assignments on either frame do not act on a slice of
# `df` (which triggers pandas' chained-assignment warning).
target_is_missing = df["Télétravail (%)"].isna()
df_with_empty_line = df[target_is_missing].copy()      # target to be predicted
df_without_empty_line = df[~target_is_missing].copy()  # labelled rows for training/evaluation

In [None]:
# Hold out 20% of the labelled rows for evaluation; the split is seeded so it
# is the same on every run.
labelled_features = df_without_empty_line.drop(columns=["Télétravail (%)"])
labelled_target = df_without_empty_line["Télétravail (%)"]
x_train, x_test, y_train, y_test = train_test_split(
    labelled_features,
    labelled_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# NOTE(review): feature standardisation is currently disabled. Every statement
# in this cell is commented out, so x_train / x_test are left unscaled.
# scaler = StandardScaler()

# x_train = scaler.fit_transform(x_train)

# x_test = scaler.transform(x_test)

In [None]:
# `model` is the degree-2 polynomial feature expander (a transformer, not a
# predictor). It is fitted on the training features only.
model = PolynomialFeatures(degree=2)
x_poly = model.fit_transform(x_train)

# `model_lin` is the regression trained on the expanded features.
model_lin = LinearRegression().fit(x_poly, y_train)
model_lin

In [None]:
# The regression was fitted on polynomial-expanded features, so the test
# features must go through the same (already fitted) expansion before
# predicting. Passing raw `x_test` fails with a feature-count mismatch.
x_test_poly = model.transform(x_test)
y_pred = model_lin.predict(x_test_poly)

# Coefficient of determination on the held-out rows.
r2 = r2_score(y_test, y_pred)
print(r2)

# Mean squared error on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
print(mse)

### Let's predict `Télétravail (%)` for the employees who have not filled it in yet.

In [None]:
# Feature columns of the rows whose target is missing.
features_to_fill = df_with_empty_line.drop(columns=["Télétravail (%)"])

# Work on an explicit copy so the assignment below does not write into a
# slice of `df`.
df_with_empty_line = df_with_empty_line.copy()

# `model` is the PolynomialFeatures transformer and has no `predict` method;
# the fitted regression is `model_lin`, which expects expanded features.
# Skipped when no row is missing, since predicting on zero samples raises.
if not features_to_fill.empty:
    df_with_empty_line["Télétravail (%)"] = model_lin.predict(
        model.transform(features_to_fill)
    )

df_with_empty_line[["Télétravail (%)"]]