In [9]:
!pip install tensorflow_text



In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
import ast
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model

In [11]:
data = pd.read_csv('github_users_dataset.csv', header=None)

initial_rows = data.shape[0]

data.dropna(inplace=True)

# Set the column names from the first row
data.columns = data.iloc[0]

# Drop the first row
data = data[1:]

# Drop all rows that are the same as the column names
data = data[~data.eq(data.columns).all(1)]

# Function to check if all strings in a given input are ASCII
def all_strings_are_ascii(input):
    if isinstance(input, list):
        return all(str(s).isascii() for s in input)
    else:
        return str(input).isascii()

# Apply the function to each element of the DataFrame
data_ascii = data.applymap(all_strings_are_ascii)

# Filter out the rows where all elements are ASCII
data = data[data_ascii.all(axis=1)]

data = data.query('projects != "[]" and languages != "[]"')

final_rows = data.shape[0]

print(f'Rows removed: {initial_rows - final_rows}, {100 * (initial_rows - final_rows) / initial_rows:.2f}% of the original dataset')

Rows removed: 1235, 63.37% of the original dataset


In [12]:
data.head(10)

Unnamed: 0,role,experience_level,languages,tech_keywords,projects
5,Senior Software Engineer @microsoft,Beginner,"['Shell', 'PowerShell', 'C#', 'Python', 'JavaS...","['github', 'node', 'video', 'action', 'woke', ...",['A collection of tasks to enable execution My...
6,Principal AI Scientist at Genentech. Formerly ...,Beginner,"['Python', 'Shell', 'R', 'Jupyter Notebook', '...","['ecoli_promoter_mpra', 'training', 'women', '...","['DL based processing of atac-seq data', 'Comm..."
8,CEO and Full-Stack Developer at SULLE WAREHOUSE,Beginner,"['C', 'C++', 'CMake', 'Shell', 'Assembly']","['b', 'pixels', 'linux', 'parser', 'expression...","['The README for my personal GitHub account', ..."
9,"Machine learning, quantum computing, and every...",Beginner,"['Python', 'Jupyter Notebook', 'Shell', 'C++',...","['Cybernetics', 'elden_bot', 'quantum', 'Quant...",['Final Project for CSCI 2500 Computer Organiz...
11,"Co-founder, @levelshealth.",Beginner,"['JavaScript', 'Scala', 'CSS', 'HTML', 'PHP']","['sublime', 'play', 'app', 'workflow', 'exampl...","['Alfred App Workflow for caniuse.com', 'Stati..."
17,Lead Engineer at Nike.\n\nMostly cloud service...,Intermediate,"['JavaScript', 'TypeScript', 'PHP', 'CSS', 'HT...","['mee.js', 'Advanced', 'class.js', 'node', 'cu...",['SecretParser Plugin for the game http://www....
21,Physical oceanographer with a love of open sou...,Intermediate,"['Jupyter Notebook', 'Python', 'HTML', 'Shell'...","['MLINT', 'Notebooks', 'MITgcm66h', 'publicati...",['Repo for the April 10-12 workshop to be held...
22,"Tinker, build, make, share.",Intermediate,"['C++', 'C', 'JavaScript', 'HTML', 'CSS']","['ESP32Tests', 'eyeball_simulator', 'BlinkRC7'...",['Abandoned: attempt to make 15-tile sliding p...
26,:-),Intermediate,"['Python', 'Shell', 'C', 'Java', 'C++']","['MSRnet', 'codeBase', 'discord', 'CO224_Compu...",['ABS: Scanning Neural Networks for Back-doors...
28,"Signal processing, radar, data science",Intermediate,"['Shell', 'Batchfile', 'Python', 'Makefile', 'C']","['setup', 'asciimatics', 'adwaita', 'icon', 'c...",['A conda-smithy repository for adwaita-icon-t...


OHE for experience and language

In [13]:
# unique experience levels
experience = data['experience_level'].unique()

# map unique experience levels to numbers
# categorical data --> numerical data for one-hot encoding
experience_level_mapping = {level: idx for idx, level in enumerate(experience)}

# w gpt2 nie ma potrzeby one-hot encoding
data['experience_level_num'] = data['experience_level'].map(experience_level_mapping)

# one-hot encoding !!!!!!!!!!!!
experience_level_encoded = to_categorical(data['experience_level_num'])
experience_level_encoded


array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [14]:
data['languages'] = data['languages'].apply(ast.literal_eval)
languages = set([lang for sublist in data['languages'].tolist() for lang in sublist])

mlb = MultiLabelBinarizer(classes=sorted(languages))
languages_encoded = mlb.fit_transform(data['languages'])
languages_encoded[:1]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

BERT for role and project

In [15]:
bert_preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
bert_model_url = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2'

bert_preprocess_model = hub.KerasLayer(bert_preprocess_url)
bert_model = hub.KerasLayer(bert_model_url)

role_texts = data['role'].tolist()  # Convert 'role' column to a list

role_preprocessed = bert_preprocess_model(role_texts)

role_results = bert_model(role_preprocessed)
role_results.keys()

dict_keys(['pooled_output', 'encoder_outputs', 'sequence_output', 'default'])

In [16]:
roles_embedded = role_results['pooled_output']

roles_embedded.shape

TensorShape([714, 128])

In [17]:
projects = [j for i in data['projects'].tolist() for j in ast.literal_eval(i)]

projects_preprocessed = bert_preprocess_model(projects)
projects_results = bert_model(projects_preprocessed)


In [18]:
map_user_to_projects = {}
for user_num, projects_list in enumerate(data['projects']):
  map_user_to_projects[user_num] = ast.literal_eval(projects_list)

map_user_to_projects[0]


['A collection of tasks to enable execution MySQL commands or scripts by VSTS/TFS Windows Agents.',
 'Microsoft Azure Workshop for Developers',
 'PowerShell module for interacting with Bluetooth Low Energy (BLE) devices implementing the Generic Attribute Profile (GATT/GAP)',
 'Pre-commit hooks collection that utilizes ChatGPT and OpenAI platform to validate changes made to the codebase.',
 'This GitHub Action allows you to run Gitleaks in your GitHub workflow.',
 'A MkDocs plugin that lets you exclude/include docs files using globs, regexes, gitignore-style file and Markdown/FrontMatter tags metadata.',
 'MkDocs Video plugin',
 'OSC message decoder/encoder with fault tolerant',
 'Detect non-inclusive language in your source code.']

In [19]:
projects_results.keys()

dict_keys(['pooled_output', 'encoder_outputs', 'sequence_output', 'default'])

In [20]:
projects_embedded = projects_results['pooled_output']

In [21]:
for i in [experience_level_encoded,
          languages_encoded,
          roles_embedded,
          projects_embedded]: print(i.shape)

(714, 3)
(714, 109)
(714, 128)
(13513, 128)


In [81]:
user_profiles = tf.concat([roles_embedded, languages_encoded], axis=1)
user_profiles.shape

TensorShape([714, 237])

In [82]:
user_profiles[0]

<tf.Tensor: shape=(237,), dtype=float32, numpy=
array([-0.9998977 ,  0.02674753, -0.95229065,  0.07566944, -0.98473793,
        0.8515617 , -0.9891793 , -0.06120098,  0.07552765, -0.00853382,
       -0.33273837, -0.07438556, -0.13768643,  0.99698657,  0.20532432,
        0.00142703,  0.9212176 ,  0.066608  , -0.86612004, -0.28353733,
        0.5697272 ,  0.06278628, -0.8876335 , -0.82182   , -0.9968265 ,
       -0.06983151, -0.9948706 ,  0.974122  ,  0.81316996,  0.1028368 ,
        0.09937278, -0.0886519 , -0.86351496, -0.9690397 ,  0.7592347 ,
        0.996806  , -0.92572796, -0.00526048,  0.51379555, -0.9914838 ,
        0.9527194 ,  0.9840963 , -0.99205375,  0.84864724, -0.88819516,
       -0.19925441, -0.86289763,  0.9918394 , -0.5801045 ,  0.9990769 ,
       -0.5767053 , -0.9606845 , -0.01358342,  0.89274883,  0.8692733 ,
        0.59175795, -0.12701385,  0.19582532,  0.9900831 ,  0.7635511 ,
        0.08241388, -0.01632024, -0.21319991,  0.7888896 , -0.6095279 ,
       -0.998157

In [83]:
for i in range(20):
  print(f"User {i}: {map_user_to_projects[i]}")

User 0: ['A collection of tasks to enable execution MySQL commands or scripts by VSTS/TFS Windows Agents.', 'Microsoft Azure Workshop for Developers', 'PowerShell module for interacting with Bluetooth Low Energy (BLE) devices implementing the Generic Attribute Profile (GATT/GAP)', 'Pre-commit hooks collection that utilizes ChatGPT and OpenAI platform to validate changes made to the codebase.', 'This GitHub Action allows you to run Gitleaks in your GitHub workflow.', 'A MkDocs plugin that lets you exclude/include docs files using globs, regexes, gitignore-style file and Markdown/FrontMatter tags metadata.', 'MkDocs Video plugin', 'OSC message decoder/encoder with fault tolerant', 'Detect non-inclusive language in your source code.']
User 1: ['DL based processing of atac-seq data', 'Community-curated list of software packages and data resources for single-cell, including RNA-seq, ATAC-seq, etc.', 'Hackathon team: gene expression analysis for Covid-19', 'Examples of single-cell genomic an

In [84]:
data.head(15)

Unnamed: 0,role,experience_level,languages,tech_keywords,projects,experience_level_num
5,Senior Software Engineer @microsoft,Beginner,"[Shell, PowerShell, C#, Python, JavaScript]","['github', 'node', 'video', 'action', 'woke', ...",['A collection of tasks to enable execution My...,0
6,Principal AI Scientist at Genentech. Formerly ...,Beginner,"[Python, Shell, R, Jupyter Notebook, Dockerfile]","['ecoli_promoter_mpra', 'training', 'women', '...","['DL based processing of atac-seq data', 'Comm...",0
8,CEO and Full-Stack Developer at SULLE WAREHOUSE,Beginner,"[C, C++, CMake, Shell, Assembly]","['b', 'pixels', 'linux', 'parser', 'expression...","['The README for my personal GitHub account', ...",0
9,"Machine learning, quantum computing, and every...",Beginner,"[Python, Jupyter Notebook, Shell, C++, Starlark]","['Cybernetics', 'elden_bot', 'quantum', 'Quant...",['Final Project for CSCI 2500 Computer Organiz...,0
11,"Co-founder, @levelshealth.",Beginner,"[JavaScript, Scala, CSS, HTML, PHP]","['sublime', 'play', 'app', 'workflow', 'exampl...","['Alfred App Workflow for caniuse.com', 'Stati...",0
17,Lead Engineer at Nike.\n\nMostly cloud service...,Intermediate,"[JavaScript, TypeScript, PHP, CSS, HTML]","['mee.js', 'Advanced', 'class.js', 'node', 'cu...",['SecretParser Plugin for the game http://www....,1
21,Physical oceanographer with a love of open sou...,Intermediate,"[Jupyter Notebook, Python, HTML, Shell, TeX]","['MLINT', 'Notebooks', 'MITgcm66h', 'publicati...",['Repo for the April 10-12 workshop to be held...,1
22,"Tinker, build, make, share.",Intermediate,"[C++, C, JavaScript, HTML, CSS]","['ESP32Tests', 'eyeball_simulator', 'BlinkRC7'...",['Abandoned: attempt to make 15-tile sliding p...,1
26,:-),Intermediate,"[Python, Shell, C, Java, C++]","['MSRnet', 'codeBase', 'discord', 'CO224_Compu...",['ABS: Scanning Neural Networks for Back-doors...,1
28,"Signal processing, radar, data science",Intermediate,"[Shell, Batchfile, Python, Makefile, C]","['setup', 'asciimatics', 'adwaita', 'icon', 'c...",['A conda-smithy repository for adwaita-icon-t...,1


In [85]:
dataset = []

In [86]:
len(dataset)

0

In [87]:
file = open("negative_match.txt", "r")
num_matches = len(file.read().splitlines())
file.close()

start_project_range = 0
for user_num, user_profile in enumerate(user_profiles):
  for project_embedding in projects_embedded[start_project_range:start_project_range+len(map_user_to_projects[user_num])]:
    dataset.append((user_profile, project_embedding, 1))
  start_project_range += len(map_user_to_projects[user_num])
  if user_num == num_matches: break


In [88]:
len(dataset)

352

better manual method

In [89]:
# manual small dataset testing

def find_project_start_index_for_given_user(user_num):
  i = 0
  index = 0
  while i != user_num:
    index += len(map_user_to_projects[i])
    i += 1
  return index

def add_negative_example_given_non_matching_user_profiles_manually(user1: int, non_matching_user2: int):
  start_index = find_project_start_index_for_given_user(non_matching_user2)
  for i in range(len(map_user_to_projects[non_matching_user2])):
    dataset.append((user_profiles[user1], projects_embedded[start_index + i], 0))

In [90]:
file = open("negative_match.txt", "r")

lines = file.read().splitlines()
neg_matches = []

for line in lines:
  first, second = line.split(",")
  neg_matches.append((int(first), int(second)))

for first, second in neg_matches:
  add_negative_example_given_non_matching_user_profiles_manually(first, second)

file.close()


In [91]:
len(dataset)

682

In [97]:
dataset[0]

(<tf.Tensor: shape=(237,), dtype=float32, numpy=
 array([-9.99984205e-01, -4.04317444e-03, -9.87990856e-01,  8.13171208e-01,
        -9.96402264e-01,  6.88598335e-01, -9.43180501e-01, -2.81483740e-01,
         6.16724277e-03,  4.19953689e-02, -1.74680874e-01, -2.04919558e-02,
         1.07948922e-01,  9.99973834e-01, -4.31610256e-01,  1.93856716e-01,
         7.00690567e-01,  2.12279394e-01, -8.26264262e-01, -3.68542850e-01,
         8.41614902e-01, -9.58342552e-02, -3.03694725e-01, -5.40483892e-01,
        -9.86359239e-01, -1.68557525e-01, -9.99590158e-01,  3.12834799e-01,
         8.72975826e-01, -4.25130303e-04,  1.99659616e-02, -1.15537301e-01,
        -8.97573650e-01, -8.61432076e-01,  2.97605872e-01,  9.96627510e-01,
        -9.93286252e-01,  2.67962553e-02,  9.28532541e-01, -9.94452775e-01,
         9.15472984e-01,  9.36904252e-01, -9.62989688e-01,  6.92660511e-01,
        -9.98208463e-01, -1.15481384e-01, -8.80152285e-01,  9.90897298e-01,
         5.36059797e-01,  9.96086359e-0

In [98]:
import random

# Shuffle the dataset
random.shuffle(dataset)

In [99]:
# Split the dataset into features and labels
features = [(user_profile, project_embedding) for user_profile, project_embedding, _ in dataset]
labels = [label for _, _, label in dataset]

In [100]:
# Convert to numpy arrays or tensors as required for training
features = np.array(features)
labels = np.array(labels)

  features = np.array(features)


In [101]:
len(labels)

682

In [176]:
# Hyperparameters (you can adjust these based on your needs)
embedding_size = 128  # Size of the final embeddings
dropout_rate = 0.2    # Dropout rate for regularization

# User Profile Branch
user_input = Input(shape=(237,))  # Adjust the shape based on your concatenated user profile tensor
user_branch = Dense(64, activation='relu')(user_input)
user_branch = tf.keras.layers.Dropout(dropout_rate)(user_branch)
user_branch = Dense(embedding_size, activation='relu')(user_branch)

# Project Description Branch
project_input = Input(shape=(128,))  # Adjust the shape based on your BERT embeddings
project_branch = Dense(32, activation='relu')(project_input)
project_branch = tf.keras.layers.Dropout(dropout_rate)(project_branch)
project_branch = Dense(embedding_size, activation='relu')(project_branch)

In [177]:
# Distance Layer
def euclidean_distance(vectors):
    x, y = vectors
    sum_square = tf.reduce_sum(tf.square(x - y), axis=1, keepdims=True)
    return tf.sqrt(tf.maximum(sum_square, tf.keras.backend.epsilon()))

distance = Lambda(euclidean_distance)([user_branch, project_branch])

# Siamese Network Model
siamese_network = Model(inputs=[user_input, project_input], outputs=distance)

# Contrastive Loss Function
def contrastive_loss(y_true, y_pred):
    margin = 1
    square_pred = tf.square(y_pred)
    margin_square = tf.square(tf.maximum(margin - y_pred, 0))
    return tf.reduce_mean(y_true * square_pred + (1 - y_true) * margin_square)

# Define a custom accuracy metric
def accuracy(y_true, y_pred):
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    return tf.keras.metrics.binary_accuracy(y_true, tf.cast(y_pred < 0.5, dtype=tf.float32))

# Compile the model with the custom accuracy metric
siamese_network.compile(optimizer='adam', loss=contrastive_loss, metrics=[accuracy])

# Model Summary
siamese_network.summary()

Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_19 (InputLayer)       [(None, 237)]                0         []                            
                                                                                                  
 input_20 (InputLayer)       [(None, 128)]                0         []                            
                                                                                                  
 dense_40 (Dense)            (None, 64)                   15232     ['input_19[0][0]']            
                                                                                                  
 dense_42 (Dense)            (None, 32)                   4128      ['input_20[0][0]']            
                                                                                           

In [166]:
# prepare data for training

user_profiles, project_embeddings = zip(*features)
user_profiles = np.array(user_profiles)
project_embeddings = np.array(project_embeddings)
labels = np.array(labels)

In [167]:
# Splitting data into training and validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    list(zip(user_profiles, project_embeddings)), labels, test_size=0.2, random_state=42
)

In [168]:
# Preparing data for the model
user_profiles_train, project_embeddings_train = zip(*X_train)
user_profiles_val, project_embeddings_val = zip(*X_val)

In [169]:
# Convert labels to float32
y_train = y_train.astype('float32')
y_val = y_val.astype('float32')

In [178]:
# Training the model
history = siamese_network.fit(
    [np.array(user_profiles_train), np.array(project_embeddings_train)],
    np.array(y_train),
    validation_data=([np.array(user_profiles_val), np.array(project_embeddings_val)], np.array(y_val)),
    epochs=20,  # You can adjust the number of epochs
    batch_size=16  # And the batch size
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [172]:
def create_user_vector(role: str, languages: list[str]):
  languages_vector = mlb.fit_transform([languages])

  pred_role_preprocessed = bert_preprocess_model([role])
  pred_role_results = bert_model(pred_role_preprocessed)
  pred_roles_embedded = pred_role_results['pooled_output']

  return tf.concat([pred_roles_embedded, languages_vector], axis=1)



In [181]:
user_profile_example = create_user_vector("Full stack web developer at microsoft", ["TypeScript", "Python", "CSS", "HTML", "Go"])
project_embedding_example = bert_model(bert_preprocess_model(["shell scripting"]))['pooled_output']

# Make a prediction
similarity_score = siamese_network.predict([user_profile_example, project_embedding_example])

# Output the similarity score
print("Similarity Score:", similarity_score[0][0])

Similarity Score: 0.44380695
