In [None]:
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install scikit-learn
%pip install openpyxl

In [None]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
# Directory that holds the Akkodis workbook. It can be overridden with the
# AEQUITAS_DATA_DIR environment variable so the notebook is not tied to one
# machine; the original location is kept as the default.
PATH = os.environ.get(
    'AEQUITAS_DATA_DIR',
    'C:/Users/aberti/Desktop/ProjectWork_AEQUITAS_AKKODIS/data/'
)
df = (
    pd.read_excel(os.path.join(PATH, 'Dataset_2.0_Akkodis.xlsx'))
      # Normalise the headers: strip leading whitespace and title-case each name,
      # which is the spelling every later cell uses to address the columns.
      .rename(columns=lambda c: c.lstrip().title())
)
df.head()

## Cleaning

In [None]:
# When an Id appears more than once, keep only its last occurrence.
is_earlier_duplicate = df.duplicated(subset='Id', keep='last')
df = df[~is_earlier_duplicate]

In [None]:
# Columns that are always removed, whatever their content.
columns_to_drop = ['Id', 'Last Role', 'Year Of Insertion',
                   'Assumption Headquarters', 'Linked_Search__Key',
                   'Akkodis Headquarters']
# Largest share of missing values a column may have before it is dropped.
THRESHOLD = 0.6

# Share (0..1) of missing values per column, computed in one vectorised pass.
null_ratio = df.isna().mean()
for col, ratio in null_ratio.items():
    print(f'<Column: {col}> NULL count: {ratio*100:.2f}%')

# Add the columns above the threshold, skipping names that are already listed.
columns_to_drop += [
    col for col in null_ratio[null_ratio > THRESHOLD].index
    if col not in columns_to_drop
]

# errors='ignore' makes the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
# NOTE(review): this also hides a misspelt name in the list above.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Show the distinct values of every column that still contains NaN, so a
# sensible default can be chosen for each of them.
columns_with_nan = df.columns[df.isnull().any()].tolist()
for col in columns_with_nan:
    print(f'{col} values: {df[col].unique()} \n')

# Placeholder written into each listed column wherever the value is missing.
fill_default = {
    'Residence': 'Not Specified',
    'Protected Category': 'No',
    'Tag': 'Not Specified',
    'Study Area': 'Not Specified',
    'Sector': 'Not Specified',
    'Event_Type__Val': 'Not Specified',
    'Event_Feedback': 'Not Specified'
}
df = df.fillna(value=fill_default)

In [None]:
# List the columns that survived the cleaning step, then preview the first rows.
print('The remaining columns are:\n')
print(df.columns)
df.head(5)

## Feature Mapping 

###  **Candidate State**

In [None]:
lookup = 'Candidate State'
# Tally every distinct value, then order the table from most to least frequent.
distrib = Counter(df[lookup])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

### **Age Range**

In [None]:
lookup = 'Age Range'
custom_order = ['< 20 years', '20 - 25 years', '26 - 30 years',
                '31 - 35 years', '36 - 40 years', '40 - 45 years', '> 45 years']
# Convert the column to an ordered categorical. The values must stay in row
# order: a Categorical has no index, so sorting it before the assignment would
# be written back positionally and give each row another row's age range.
df[lookup] = pd.Categorical(df[lookup], categories=custom_order, ordered=True)

# Count per age range in the declared category order (sort=False); missing
# values, if any, are kept as their own entry (dropna=False).
distrib_df = (
    df[lookup]
    .value_counts(sort=False, dropna=False)
    .rename_axis(lookup)
    .reset_index(name='Count')
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

In [None]:
lookup = 'Age Range'
# Frequency table of the column, most common value first.
distrib = Counter(df[lookup])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

### **Residence**

In [None]:
residence_list = df['Residence'].unique()

# Substrings that mark a residence value as a country other than Italy.
FOREIGN_MARKERS = ('(STATE)', 'SOUTH AFRICAN REPUBLIC', 'USSR', 'YUGOSLAVIA')


def _is_foreign(residence):
    """Return True when the residence string contains one of FOREIGN_MARKERS."""
    return any(marker in residence for marker in FOREIGN_MARKERS)


# Country names: the text before ' » ' (and before ' ~ ', when present).
state_list = sorted({
    s.split(' » ')[0].split(' ~ ')[0]
    for s in residence_list
    if _is_foreign(s)
})

# Every other value is treated as Italian; its label is the text after the last
# ' ~ '. Both lists use the same predicate so that no value can land in both
# (the original filter forgot to exclude 'SOUTH AFRICAN REPUBLIC' here).
italy_list = sorted({
    s.split(' ~ ')[-1]
    for s in residence_list
    if not _is_foreign(s)
})

def map_residence(value):
    """Reduce a raw residence string to a single label.

    The first entry of ``italy_list`` contained in ``value`` wins; failing
    that, the first entry of ``state_list`` contained in ``value``; when
    neither list matches, 'Not Specified' is returned.
    """
    italian_match = next((region for region in italy_list if region in value), None)
    if italian_match is not None:
        return italian_match
    state_match = next((state for state in state_list if state in value), None)
    return 'Not Specified' if state_match is None else state_match

# Replace every raw residence string with the label map_residence finds in it.
df['Residence'] = df['Residence'].map(map_residence)

In [None]:
# Alternative spellings and the name used for them from here on.
residence_renames = {
    'Türkiye': 'TURKEY',
    'USSR': 'RUSSIAN FEDERATION'
}
df['Residence'] = df['Residence'].replace(residence_renames)
# Apply the same renaming to the list of known countries. Later cells test
# `x in state_list`; without this a renamed value would no longer be found
# there and would be labelled 'ITALY'.
state_list = sorted({residence_renames.get(s, s) for s in state_list})

In [None]:
# Country of residence: the value itself when it is a known country, else 'ITALY'.
is_known_state = df['Residence'].isin(state_list)
df['Residence State'] = df['Residence'].where(is_known_state, 'ITALY')

# Italian region: the value itself when it is in italy_list, else 'Not in ITALY'.
is_italian_region = df['Residence'].isin(italy_list)
df['Residence Italian Region'] = df['Residence'].where(is_italian_region, 'Not in ITALY')

# Rows placed in Italy but without a recognised region have an unknown region.
italy_without_region = (
    df['Residence State'].eq('ITALY')
    & df['Residence Italian Region'].eq('Not in ITALY')
)
df.loc[italy_without_region, 'Residence Italian Region'] = 'Not Specified'

In [None]:
# Number of rows whose country of residence is Italy versus all other rows.
italian_rows = int((df['Residence State'] == 'ITALY').sum())
distrib_it = [italian_rows, df.shape[0] - italian_rows]
labels = ['Italian Residence', 'Non-Italian Residence']
plt.bar(x=labels, height=distrib_it)
plt.title('Italian vs Non-Italian Residence Distribution')
plt.xlabel('Residence Type')

In [None]:
# Frequency of each country of residence, leaving Italy out.
foreign_states = df.loc[df['Residence State'] != 'ITALY', 'Residence State']
res_state_counts = Counter(foreign_states)
res_state_df = (
    pd.DataFrame(res_state_counts.items(), columns=['Residence State', 'Count'])
      .sort_values(by='Count', ascending=False)
)
# Only the 20 most frequent countries are drawn.
res_state_df.head(20).plot.bar(x='Residence State', y='Count', legend=False)
plt.title('Top 20 Residence States (other than Italy)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Keep the residence value when it is listed in italy_list; otherwise mark the row 'Not in ITALY'.
listed_in_italy = df['Residence'].isin(italy_list)
df['Residence Italian Region'] = df['Residence'].where(listed_in_italy, 'Not in ITALY')

In [None]:
# Rows placed in Italy whose region was not recognised get 'Not Specified'.
region_unknown = (
    df['Residence State'].eq('ITALY')
    & df['Residence Italian Region'].eq('Not in ITALY')
)
df.loc[region_unknown, 'Residence Italian Region'] = 'Not Specified'

In [None]:
# Frequency of every value of the Italian-region column, most common first.
it_reg_counts = Counter(df['Residence Italian Region'])
it_reg_df = (
    pd.DataFrame(it_reg_counts.items(), columns=['Residence Italian Region', 'Count'])
      .sort_values(by='Count', ascending=False)
)
# Only the 20 most frequent values are drawn.
it_reg_df.head(20).plot.bar(x='Residence Italian Region', y='Count', legend=False)
plt.title('Top 20 Residence Italian Regions')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Country names (as spelled in 'Residence State') counted as European.
# 'RUSSIAN FEDERATION' is listed next to 'RUSSIA' because that is the name the
# residence clean-up cell writes when it renames 'USSR'.
european_countries = [
    'ALBANIA', 'AUSTRIA', 'BELARUS', 'BELGIUM', 'BULGARIA', 'CROATIA', 'CZECH REPUBLIC',
    'FRANCE', 'GERMANY', 'GREAT BRITAIN-NORTHERN IRELAND', 'GREECE', 'ITALY', 'LATVIA',
    'LITHUANIA', 'LUXEMBOURG', 'MALTA', 'MOLDOVA', 'MONACO', 'MONTENEGRO', 'NETHERLANDS',
    'NORWAY', 'POLAND', 'PORTUGAL', 'ROMANIA', 'RUSSIA', 'RUSSIAN FEDERATION', 'SAN MARINO',
    'SERBIA', 'SLOVAKIA', 'SLOVENIA', 'SPAIN', 'SWEDEN', 'SWITZERLAND', 'UKRAINE'
]
# 'Yes' when the country of residence is in the list above, 'No' otherwise.
df['European Residence'] = (
    df['Residence State']
    .isin(european_countries)
    .map({True: 'Yes', False: 'No'})
)

In [None]:
# Number of rows per value of the 'European Residence' flag.
eu_distrib = Counter(df['European Residence'])
eu_distrib_df = pd.DataFrame(eu_distrib.items(), columns=['European Residence', 'Count'])

# Build the display labels as a new Series. An in-place replace on a column
# taken out of the frame either writes through to the frame or acts on a
# copy, depending on the pandas version.
labels = eu_distrib_df['European Residence'].replace(
    {'Yes': 'European', 'No': 'Non-European'}
)
sizes = eu_distrib_df['Count']

plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title('European Residence Distribution')
plt.show()

The `Residence` column has been split into the derived residence columns, so it can now be removed.

In [None]:
# The raw column has been split into the derived residence columns; remove it.
df = df.drop('Residence', axis='columns')

### **Sex**

In [None]:
lookup = 'Sex'
# Tally every distinct value, then order the table from most to least frequent.
distrib = Counter(df[lookup])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

### **Protected Category**


In [None]:
# Collapse the individual article labels into a plain Yes / No flag.
protected_category_flags = {
    'Article 18': 'Yes',
    'Article 1': 'Yes',
    'No Article': 'No'
}
df['Protected Category'] = df['Protected Category'].replace(protected_category_flags)

In [None]:
lookup = 'Protected Category'
# Frequency table of the column, most common value first.
distrib = Counter(df[lookup])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

### **Tag**

In [None]:
# Cells holding only one of these placeholder strings carry no tag at all.
empty_tag_markers = ['-', '.', 'X']
df['Tag'] = df['Tag'].replace(empty_tag_markers, 'Not Specified')

In [None]:
lookup = 'Tag'
# A cell may list several tags separated by ', ': split them so that every
# tag is counted on its own.
individual_tags = df[lookup].str.split(', ').explode()
distrib = Counter(individual_tags)
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

### **Study Area**

In [None]:
lookup = 'Study Area'
# Tally every distinct value, then order the table from most to least frequent.
distrib = Counter(df[lookup])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

### **Study Title**

In [None]:
lookup = 'Study Title'
# Frequency table of the column, most common value first.
distrib = Counter(df[lookup])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

### **Years Experience**

In [None]:
lookup = 'Years Experience'
custom_order = ['[0]', '[0-1]', '[1-3]', '[3-5]', '[5-7]', '[7-10]', '[+10]']

# Convert the column to an ordered categorical. The values must stay in row
# order: a Categorical has no index, so sorting it before the assignment would
# be written back positionally and give each row another row's experience band.
df[lookup] = pd.Categorical(df[lookup], categories=custom_order, ordered=True)

# Count per band in the declared category order (sort=False); missing values,
# if any, are kept as their own entry (dropna=False).
distrib_df = (
    df[lookup]
    .value_counts(sort=False, dropna=False)
    .rename_axis(lookup)
    .reset_index(name='Count')
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

In [None]:
lookup = 'Years Experience'
# Tally every distinct value, then order the table from most to least frequent.
distrib = Counter(df[lookup])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

### **Sector**

In [None]:
lookup = 'Sector'
# Frequency table of the column, most common value first.
distrib = Counter(df[lookup])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

### **Event_Type__Val**

In [None]:
lookup = 'Event_Type__Val'
# Tally every distinct value, then order the table from most to least frequent.
distrib = Counter(df[lookup])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

### **Event_Feedback**

In [None]:
lookup = 'Event_Feedback'
# Frequency table of the column, most common value first.
distrib = Counter(df[lookup])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
      .sort_values(by='Count', ascending=False)
)
ax = distrib_df.plot.bar(x=lookup, y='Count', legend=False)
ax.set_title(lookup)

## Data Visualization
### **Sex and Candidate State**

In [None]:
# Rows: Sex, columns: Candidate State, cells: number of records (0 when a pair never occurs).
pivot = df.pivot_table(index='Sex', columns='Candidate State', aggfunc='size', fill_value=0)

ax = pivot.plot.bar(figsize=(10, 6))
ax.set_title('Candidate State by Sex')
ax.set_ylabel('Count')
ax.set_xlabel('Sex')
# Anchor the legend outside the axes, to the right of the plot.
ax.legend(title='Candidate State', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Divide every row of the pivot table by its own total so each row sums to 1.
row_totals = pivot.sum(axis=1)
pivot_percentage = pivot.div(row_totals, axis=0)

ax = pivot_percentage.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title('Candidate State by Sex (Normalized)')
ax.set_ylabel('Proportion')
ax.set_xlabel('Sex')
# Anchor the legend outside the axes, to the right of the plot.
ax.legend(title='Candidate State', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

### **Protected Category and Candidate State**

In [None]:
# Rows: Protected Category, columns: Candidate State, cells: number of records.
pivot = df.pivot_table(index='Protected Category', columns='Candidate State', aggfunc='size', fill_value=0)
# Row-wise shares: every Protected Category row sums to 1.
row_totals = pivot.sum(axis=1)
pivot_percentage = pivot.div(row_totals, axis=0)

ax = pivot_percentage.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title('Candidate State by Protected Category (Normalized)')
ax.set_ylabel('Proportion')
ax.set_xlabel('Protected Category')
# Anchor the legend outside the axes, to the right of the plot.
ax.legend(title='Candidate State', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

### **Age Range and Candidate State**

In [None]:
# Stacked histogram: one bar per age range, split by candidate state.
hist_options = dict(
    x='Age Range',
    hue='Candidate State',
    multiple='stack',
    palette='Set2',
    shrink=0.8,
    legend=True,
)
sns.histplot(data=df, **hist_options)
plt.title("Distribution of Age Ranges by Candidate State", fontsize=14)
plt.xlabel("Age Range", fontsize=12)
plt.ylabel("Count", fontsize=12)
# Tilt the tick labels by 45 degrees.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
def _approx_age(label):
    """Return a rough numeric age for an 'Age Range' label.

    'a - b years' gives a, a label containing '<' gives 19 and any other
    label gives 46. Anything that is not a string (e.g. NaN) gives NaN
    instead of raising.
    """
    if not isinstance(label, str):
        return float('nan')
    if '-' in label:
        return float(label.split('-')[0])
    return 19.0 if '<' in label else 46.0


# Map through plain Python objects: mapping a categorical column can give back
# another categorical, whereas the box plot needs real numbers on its y axis.
approx_age = df['Age Range'].astype(object).map(_approx_age).astype(float)

sns.boxplot(
    data=df,
    x='Candidate State',
    y=approx_age,
    palette='Set3',
    hue='Candidate State',
    legend=False
)
plt.title("Candidate State by Age Range (Numerical Approximation)", fontsize=14)
plt.xlabel("Candidate State", fontsize=12)
plt.ylabel("Age Range (Approximate Numerical Value)", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


### **Correlation**

In [None]:
df_encoded = df.copy()

# Ordinal features are encoded by rank so that the integer codes follow their
# natural order.
age_mapping = {
    '< 20 years': 1,
    '20 - 25 years': 2,
    '26 - 30 years': 3,
    '31 - 35 years': 4,
    '36 - 40 years': 5,
    '40 - 45 years': 6,
    '> 45 years': 7
}
# A label encoder would sort these bands as strings ('[+10]' first), which
# loses their order, so they get an explicit rank as well.
experience_mapping = {
    '[0]': 1,
    '[0-1]': 2,
    '[1-3]': 3,
    '[3-5]': 4,
    '[5-7]': 5,
    '[7-10]': 6,
    '[+10]': 7
}
ordinal_mappings = {
    'Age Range': age_mapping,
    'Years Experience': experience_mapping
}

for col, mapping in ordinal_mappings.items():
    if col in df_encoded.columns:
        # astype(object) first: mapping a categorical column can return a
        # categorical again, and the correlation needs numeric data.
        df_encoded[col] = df_encoded[col].astype(object).map(mapping).astype(float)

# Every remaining column gets integer label codes.
# NOTE(review): for nominal columns these codes have no intrinsic order, so
# their correlation coefficients should be read with care.
le = LabelEncoder()

for col in df_encoded.columns:
    if col not in ordinal_mappings:
        df_encoded[col] = le.fit_transform(df_encoded[col])

correlation_matrix = df_encoded.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f")
plt.title('Correlation Matrix')
plt.show()