# IT Educational Institute

In [68]:
# Input: pickled DataFrame that this notebook reads
DF_PATH = "../data/processed/1_preprocessed_df.pkl"

# String constants (not referenced by the cells visible in this notebook)
NA_STRING = "Not Specified"
TRANSPARENT_STRING = "rgba(0,0,0,0)"

# Multi-label columns: the role column(s) and the technology columns to encode
ROLE_COLS = ["DevType"]
TECH_COLS = [
    "LanguageHaveWorkedWith",
    "DatabaseHaveWorkedWith",
    "PlatformHaveWorkedWith",
    "WebframeHaveWorkedWith",
    "MiscTechHaveWorkedWith",
    "ToolsTechHaveWorkedWith",
    "NEWCollabToolsHaveWorkedWith",
]

# Output: directory that receives the exported feature files
EXPORT_FEATURES_DIR = "../data/processed"

In [69]:
# Importing libraries
import os
import logging
import yaml
import numpy as np
import pandas as pd

## Read Data and Preprocess Data

In [70]:
# Reading the preprocessed DataFrame from the pickle file at DF_PATH.
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that come from a trusted source.
df = pd.read_pickle(DF_PATH)

# Displaying first 5 observations (rich display as the cell's last expression)
df.head()

Unnamed: 0,ResponseId,MainBranch,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,YearsCode,...,TimeSearching,TimeAnswering,Onboarding,ProfessionalTech,TrueFalse_1,TrueFalse_2,TrueFalse_3,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,None of these,[],,[],,[],[],[],,...,,,,[],,,,,,
1,2,I am a developer by profession,"[Employed, full-time]",Fully remote,"[Hobby, Contribute to open-source projects]",,[],[],[],,...,,,,[],,,,Too long,Difficult,
2,3,"I am not primarily a developer, but I write co...","[Employed, full-time]","Hybrid (some remote, some in-person)",[Hobby],"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","[Books / Physical media, Friend or family memb...","[Technical documentation, Blogs, Programming G...",[],14.0,...,,,,[],,,,Appropriate in length,Neither easy nor difficult,40205.0
3,4,I am a developer by profession,"[Employed, full-time]",Fully remote,[I don’t code outside of work],"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","[Books / Physical media, School (i.e., Univers...",[],[],20.0,...,,,,[],,,,Appropriate in length,Easy,215232.0
4,5,I am a developer by profession,"[Employed, full-time]","Hybrid (some remote, some in-person)",[Hobby],"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","[Other online resources (e.g., videos, blogs, ...","[Technical documentation, Blogs, Stack Overflo...",[],8.0,...,,,,[],,,,Too long,Easy,


### One-Hot Encoding

In [71]:
# importing multilabelbinarizer from sklearn
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode each multi-label column on its own, keyed by its source column name
encoded_dfs = {}

for column_name in ROLE_COLS + TECH_COLS:
    source_column = df[column_name]

    # A fresh binarizer per column so every column gets its own label vocabulary
    mlb = MultiLabelBinarizer()
    indicator_matrix = mlb.fit_transform(source_column)

    encoded_dfs[column_name] = pd.DataFrame(
        indicator_matrix,
        columns=mlb.classes_,
        index=source_column.index,
    )

# Concatenating the dict along axis=1 gives two-level columns: (source column, label)
ohe_df = pd.concat(encoded_dfs, axis=1)

### Dimensionality Reduction

In [72]:
# Importing StandardScaler from sklearn
from sklearn.preprocessing import StandardScaler

# Skills only: remove every column stored under the 'DevType' key of ohe_df
skills_ohe = ohe_df.drop(columns='DevType').copy()

# Standardize the skill indicator columns (result is a plain array, not a DataFrame)
scaler = StandardScaler()
std_skills = scaler.fit_transform(skills_ohe)

In [73]:
# Importing TSNE from sklearn
from sklearn.manifold import TSNE

# Applying TSNE on the standardized skills.
# The matrix is transposed before fitting, so each row handed to TSNE is one
# skill column and the 2-D embedding contains one point per skill.
# random_state is fixed so the projection is repeatable between runs.
# NOTE(review): `n_iter=10**10` effectively removes the iteration cap and
# `learning_rate=0.01` is far below the range suggested in scikit-learn's
# TSNE documentation -- confirm both values are intentional.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# verify against the installed version.
tsne_projection = TSNE(n_components=2,
                      perplexity=3,
                      learning_rate=0.01,
                      init='pca',
                      method='barnes_hut',
                      n_jobs=2,
                      n_iter=10**10,
                      random_state=0).fit_transform(std_skills.T)

# Creating a DataFrame with TSNE projections, indexed by the skill columns of
# skills_ohe (the name `tsne_projection` is rebound from the raw array)
tsne_projection = pd.DataFrame(tsne_projection, index=skills_ohe.columns)


The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.



In [74]:
# Checking the shape of tsne_projection: (number of skill columns, number of TSNE components)
print(tsne_projection.shape)

(161, 2)


In [75]:
# Previewing the first 5 rows of the TSNE projection
tsne_projection.iloc[:5, :]

Unnamed: 0,Unnamed: 1,0,1
LanguageHaveWorkedWith,APL,28.586397,19.080084
LanguageHaveWorkedWith,Assembly,84.378395,-7.259577
LanguageHaveWorkedWith,Bash/Shell,56.820904,-7.117048
LanguageHaveWorkedWith,C,87.490921,-6.603626
LanguageHaveWorkedWith,C#,-45.551907,105.968529


In [76]:
# Inspecting the column labels of tsne_projection (no column names were passed
# when the DataFrame was built)
tsne_projection.columns

RangeIndex(start=0, stop=2, step=1)

In [77]:
# importing plolty.express as px
import plotly.express as px

# Scatter plot of the 2-D TSNE projection with one labelled point per skill.
# droplevel() removes the outer index level so only the skill name is shown.
figures_dir = '../reports/figures'

fig = px.scatter(x=tsne_projection[0], y=tsne_projection[1], text=tsne_projection.index.droplevel())
fig.update_traces(textposition='top center')
fig.update_layout(width=1000, height=1000, title='TSNE')
fig.show()

# Create the output directory first: write_html fails with FileNotFoundError
# when the target folder does not exist (e.g. on a fresh checkout)
os.makedirs(figures_dir, exist_ok=True)
fig.write_html(os.path.join(figures_dir, 'tsne_scatterplot.html'))

### Cluster

In [78]:
# Importing libraries
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Candidate cluster counts (10 through 24) and bookkeeping for the search
range_n_cluster = [*range(10, 25)]
silhouette_scores = []
best_cluster_model = None

# Fit one Ward-linkage agglomerative model per candidate count and keep the
# model whose silhouette score is the highest seen so far (a tie keeps the
# later candidate)
for candidate_k in range_n_cluster:
    candidate_model = AgglomerativeClustering(n_clusters=candidate_k, linkage='ward')
    assignments = candidate_model.fit_predict(tsne_projection)

    score = silhouette_score(tsne_projection, assignments)
    silhouette_scores.append(score)

    # The current score is already in the list, so this is true exactly when
    # no earlier candidate scored higher
    if score >= np.max(silhouette_scores):
        best_cluster_model = candidate_model

In [79]:
# Line plot of silhouette score per candidate cluster count, with a vertical
# marker at the cluster count of the selected model
figures_dir = '../reports/figures'

fig = px.line(x=range_n_cluster, y=silhouette_scores, labels=dict(x='No. Clusters', y='Silhouette Score'))
fig.add_vline(best_cluster_model.n_clusters)
fig.update_layout(width=800, height=600, title='Best Clustering Model')
fig.show()

# Create the output directory first: write_html fails with FileNotFoundError
# when the target folder does not exist (e.g. on a fresh checkout).
# The file name is kept unchanged so existing references to it keep working.
os.makedirs(figures_dir, exist_ok=True)
fig.write_html(os.path.join(figures_dir, 'cluserting_lineplot.html'))

In [80]:
# Turn the selected model's numeric labels into readable group names,
# e.g. label 3 -> 'skills_group_3'
cluster_labels = [f"skills_group_{label!s}" for label in best_cluster_model.labels_]

In [81]:
# Scatter plot of the TSNE projection, coloured by cluster label and annotated
# with the skill name (outer index level dropped)
figures_dir = '../reports/figures'

fig = px.scatter(x=tsne_projection[0], y=tsne_projection[1],
                 text=tsne_projection.index.droplevel(), color=cluster_labels)
fig.update_traces(textposition='top center')
fig.update_layout(width=1000, height=800, title_text='Clustered Scatter Plot')
fig.show()

# Create the output directory first: write_html fails with FileNotFoundError
# when the target folder does not exist (e.g. on a fresh checkout).
# The file name is kept unchanged so existing references to it keep working.
os.makedirs(figures_dir, exist_ok=True)
fig.write_html(os.path.join(figures_dir, 'cluserting_scatterplot.html'))

### Create New Features

In [82]:
# Group the skill names by their cluster label: one list of skills per cluster
skill_names = tsne_projection.index.droplevel(level=0).to_series()
skills_clusters = skill_names.groupby(cluster_labels).apply(list)

# Print up to the first 10 clusters: name, member skills, then a separator line
for group_name, group_skills in skills_clusters.head(10).items():
    print(group_name, group_skills, '-'*100, sep='\n')

skills_group_0
['APL', 'COBOL', 'Clojure', 'Crystal', 'F#', 'Fortran', 'Haskell', 'Julia', 'LISP', 'OCaml', 'Perl', 'R', 'SAS', 'CouchDB', 'Couchbase', 'IBM DB2', 'Colocation', 'IBM Cloud or Watson', 'OpenStack', 'Tidyverse', 'Flow', 'Emacs', 'RStudio']
----------------------------------------------------------------------------------------------------
skills_group_1
['HTML/CSS', 'JavaScript', 'Solidity', 'TypeScript', 'MongoDB', 'Heroku', 'Angular', 'Angular.js', 'Deno', 'Express', 'Fastify', 'Gatsby', 'Next.js', 'Node.js', 'Nuxt.js', 'React.js', 'Svelte', 'Vue.js', 'Electron', 'React Native', 'Yarn', 'npm', 'Visual Studio Code']
----------------------------------------------------------------------------------------------------
skills_group_10
['Elixir', 'Erlang', 'Phoenix']
----------------------------------------------------------------------------------------------------
skills_group_11
['Capacitor', 'Cordova', 'Ionic']
-------------------------------------------------------------

In [83]:
# Creating new features: for every cluster, the per-row sum of the one-hot
# indicators of the skills that belong to it (i.e. how many of the cluster's
# skills each row has).

# Dropping the outer column level does not depend on the cluster, so it is
# done once here instead of rebuilding the whole frame on every iteration.
flat_skills_ohe = skills_ohe.droplevel(level=0, axis=1)

new_features = []

for cluster, skills in skills_clusters.items():
    cluster_sum = flat_skills_ohe[skills].sum(axis=1)
    cluster_sum.name = cluster  # the cluster name becomes the feature column name
    new_features.append(cluster_sum)

# One column per cluster, aligned on the original row index
fe_clustered_skills = pd.concat(new_features, axis=1)

In [84]:
# Displaying first 5 observations from the clustered DataFrame
# (one column per skills cluster, holding the per-row sum of that cluster's skill indicators)
fe_clustered_skills.head()

Unnamed: 0,skills_group_0,skills_group_1,skills_group_10,skills_group_11,skills_group_12,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,2,0,0,0,0,0,0,0,0,0,0,0
2,0,3,0,0,0,0,2,4,0,1,0,0,0
3,0,3,0,0,0,0,0,8,0,0,0,0,0
4,0,7,0,0,2,1,0,9,3,0,0,0,1


## Illustration Model

### Train/Test Split

In [85]:
# Features: the per-cluster sums side by side with the one-hot skill
# indicators (outer column level dropped so columns are plain skill names)
skill_indicators = skills_ohe.droplevel(level=0, axis=1)
combined_features_df = pd.concat([fe_clustered_skills, skill_indicators], axis=1)

# Targets: the one-hot encoded 'DevType' block of ohe_df
roles_df = ohe_df['DevType'].copy()

In [86]:
# importing train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    combined_features_df, roles_df, test_size=0.3, random_state=0
)

# Report the shape of each split, with a separator line between entries
split_parts = {"X_train": X_train, "X_test": X_test,
               "y_train": y_train, "y_test": y_test}
for position, (part_name, part) in enumerate(split_parts.items()):
    if position > 0:
        print('-'*50)
    print(f"{part_name} shape is", part.shape)

X_train shape is (51287, 174)
--------------------------------------------------
X_test shape is (21981, 174)
--------------------------------------------------
y_train shape is (51287, 29)
--------------------------------------------------
y_test shape is (21981, 29)


### Compute Class & Samples Weight to Deal with Class Imbalance

In [87]:
# Class weight = inverse of how often each role occurs.
# NOTE(review): the frequencies are taken from the full roles_df (train and
# test rows together) -- confirm this is intended.
class_frequencies = roles_df.sum(axis=0)
class_weights = 1 / class_frequencies

# Sample weight = sum (not mean) of the class weights of the roles each
# training row has; a row with no role gets weight 0
sample_weight = (y_train.values * class_weights.values).sum(axis=1)

In [88]:
def f1_from_confusion_matrix(confusion_matrix):
    """Compute the F1 score of the positive class from a 2x2 confusion matrix.

    Parameters
    ----------
    confusion_matrix : 2-D array-like indexable as ``m[i, j]``
        Read as ``[[TN, FP], [FN, TP]]``: entry ``[1, 1]`` is used as true
        positives, ``[0, 1]`` as false positives and ``[1, 0]`` as false
        negatives.

    Returns
    -------
    float
        ``TP / (TP + 0.5 * (FP + FN))``. When TP, FP and FN are all zero the
        score is undefined; ``0.0`` is returned instead of NaN so one empty
        label cannot turn an averaged score into NaN.
    """
    true_positives = confusion_matrix[1, 1]
    false_positives = confusion_matrix[0, 1]
    false_negatives = confusion_matrix[1, 0]

    denominator = true_positives + 0.5 * (false_positives + false_negatives)

    # Guard the 0/0 case (no positives predicted and none present)
    if denominator == 0:
        return 0.0

    return true_positives / denominator

In [89]:
# Two competing feature sets, by column name: the individual skill indicators
# ('original') and the per-cluster aggregates ('clusters')
original_skill_columns = skills_ohe.columns.droplevel(level=0).tolist()
cluster_columns = fe_clustered_skills.columns.tolist()

features_sets = dict(original=original_skill_columns, clusters=cluster_columns)

In [90]:
# Importing libraries
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import multilabel_confusion_matrix

In [91]:
# For each feature set: fit one logistic regression per role (multi-output),
# then compute the per-role F1 score on the train and test splits
results = {}
role_names = roles_df.columns.tolist()

for feature_set_name, feature_set in features_sets.items():
    # Restrict both splits to the columns of this feature set
    train_subset = X_train[feature_set].copy()
    test_subset = X_test[feature_set].copy()

    # Train the classifier with the per-sample weights
    clf = MultiOutputClassifier(LogisticRegression())
    clf.fit(train_subset, y_train, sample_weight=sample_weight)

    # One confusion matrix per role -> one F1 per role (train split)
    train_matrices = multilabel_confusion_matrix(y_train, clf.predict(train_subset))
    f1_train_scores = list(map(f1_from_confusion_matrix, train_matrices))

    # Same for the test split
    test_matrices = multilabel_confusion_matrix(y_test, clf.predict(test_subset))
    f1_test_scores = list(map(f1_from_confusion_matrix, test_matrices))

    # Per-role scores for this feature set, worst test score first
    per_role_scores = pd.DataFrame(
        {'train': f1_train_scores, "test": f1_test_scores},
        index=role_names,
    )
    results[feature_set_name] = per_role_scores.sort_values('test')

    # Summary for this feature set
    print("Feature set: " + feature_set_name)
    print('.. Mean train F1:', np.mean(f1_train_scores))
    print('.. Mean Test F1:', np.mean(f1_test_scores))
    print('-'*100)

Feature set: original
.. Mean train F1: 0.11895910046164151
.. Mean Test F1: 0.11750875850568084
----------------------------------------------------------------------------------------------------
Feature set: clusters
.. Mean train F1: 0.15225492731960355
.. Mean Test F1: 0.15215433034933049
----------------------------------------------------------------------------------------------------


### Export New Features

In [92]:
# Creating an export path variable: the feature file lives inside EXPORT_FEATURES_DIR
features_path = os.path.join(EXPORT_FEATURES_DIR, 'features_skills_clusters.pkl')

# Saving the clustered skills (fe_clustered_skills) in a pickle file at that path
fe_clustered_skills.to_pickle(features_path)

In [93]:
# importing Yaml library
import yaml

# Write the cluster -> skills mapping as a YAML description file inside the
# export directory
description_path = os.path.join(EXPORT_FEATURES_DIR, 'features_skills_clusters_description.yaml')
cluster_description = skills_clusters.to_dict()

with open(description_path, 'w') as description_file:
    yaml.dump(cluster_description, description_file)