In [None]:
!pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_text
Successfully installed tensorflow_text-2.15.0


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import ast
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model

In [None]:
# Read the raw CSV with header=None, so the header line comes in as an ordinary row.
data = pd.read_csv('github_users_dataset.csv', header=None)

# Row count before any cleaning; the header line is still part of this count.
initial_rows = len(data)

# Discard every row that contains at least one missing value.
data = data.dropna()

# Use the first remaining row as the column names, then drop it from the body.
data.columns = data.iloc[0]
data = data.iloc[1:]

# Remove any other rows whose values all equal the column names.
repeats_header = data.eq(data.columns).all(axis=1)
data = data[~repeats_header]

def all_strings_are_ascii(input):
    """Tell whether `input` consists only of ASCII text.

    A list passes when the string form of each of its items is ASCII (an
    empty list passes); any other value passes when its own string form is
    ASCII.
    """
    candidates = input if isinstance(input, list) else [input]
    for candidate in candidates:
        if not str(candidate).isascii():
            return False
    return True

# Run the ASCII check on every individual cell, giving a boolean frame.
data_ascii = data.applymap(all_strings_are_ascii)

# Keep only the rows in which every cell passed the ASCII check.
ascii_only_rows = data_ascii.all(axis=1)
data = data[ascii_only_rows]

# Drop rows whose projects or languages value is the empty-list literal "[]".
data = data.query('projects != "[]" and languages != "[]"')

final_rows = len(data)

print(f'Rows removed: {initial_rows - final_rows}, {100 * (initial_rows - final_rows) / initial_rows:.2f}% of the original dataset')

Rows removed: 954, 63.90% of the original dataset


In [None]:
data.head(10)

Unnamed: 0,role,experience_level,languages,tech_keywords,projects
5,Senior Software Engineer @microsoft,Beginner,"['Shell', 'PowerShell', 'C#', 'Python', 'JavaS...","['github', 'node', 'video', 'action', 'woke', ...",['A collection of tasks to enable execution My...
6,Principal AI Scientist at Genentech. Formerly ...,Beginner,"['Python', 'Shell', 'R', 'Jupyter Notebook', '...","['ecoli_promoter_mpra', 'training', 'women', '...","['DL based processing of atac-seq data', 'Comm..."
8,CEO and Full-Stack Developer at SULLE WAREHOUSE,Beginner,"['C', 'C++', 'CMake', 'Shell', 'Assembly']","['b', 'pixels', 'linux', 'parser', 'expression...","['The README for my personal GitHub account', ..."
9,"Machine learning, quantum computing, and every...",Beginner,"['Python', 'Jupyter Notebook', 'Shell', 'C++',...","['Cybernetics', 'elden_bot', 'quantum', 'Quant...",['Final Project for CSCI 2500 Computer Organiz...
11,"Co-founder, @levelshealth.",Beginner,"['JavaScript', 'Scala', 'CSS', 'HTML', 'PHP']","['sublime', 'play', 'app', 'workflow', 'exampl...","['Alfred App Workflow for caniuse.com', 'Stati..."
17,Lead Engineer at Nike.\n\nMostly cloud service...,Intermediate,"['JavaScript', 'TypeScript', 'PHP', 'CSS', 'HT...","['mee.js', 'Advanced', 'class.js', 'node', 'cu...",['SecretParser Plugin for the game http://www....
21,Physical oceanographer with a love of open sou...,Intermediate,"['Jupyter Notebook', 'Python', 'HTML', 'Shell'...","['MLINT', 'Notebooks', 'MITgcm66h', 'publicati...",['Repo for the April 10-12 workshop to be held...
22,"Tinker, build, make, share.",Intermediate,"['C++', 'C', 'JavaScript', 'HTML', 'CSS']","['ESP32Tests', 'eyeball_simulator', 'BlinkRC7'...",['Abandoned: attempt to make 15-tile sliding p...
26,:-),Intermediate,"['Python', 'Shell', 'C', 'Java', 'C++']","['MSRnet', 'codeBase', 'discord', 'CO224_Compu...",['ABS: Scanning Neural Networks for Back-doors...
28,"Signal processing, radar, data science",Intermediate,"['Shell', 'Batchfile', 'Python', 'Makefile', 'C']","['setup', 'asciimatics', 'adwaita', 'icon', 'c...",['A conda-smithy repository for adwaita-icon-t...


One-hot encoding (OHE) for experience level and languages

In [None]:
# Distinct experience levels, in order of first appearance in the data
experience = data['experience_level'].unique()

# Give every level an integer id (categorical data --> numerical data),
# which is what the one-hot encoding below expects
experience_level_mapping = dict(zip(experience, range(len(experience))))

# (translated note) with GPT-2 there is no need for one-hot encoding
data['experience_level_num'] = data['experience_level'].map(experience_level_mapping)

# One-hot encode the integer ids
experience_level_encoded = to_categorical(data['experience_level_num'])
experience_level_encoded


array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [None]:
# Parse the stringified lists into real Python lists. Values that are already
# lists are left untouched, so re-running this cell does not fail inside
# ast.literal_eval on data that was parsed by a previous run.
data['languages'] = data['languages'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Every distinct language mentioned by any user.
languages = {lang for user_langs in data['languages'] for lang in user_langs}

# Multi-hot encode each user's languages; the classes are fixed to the sorted
# language names, which pins the column order.
mlb = MultiLabelBinarizer(classes=sorted(languages))
languages_encoded = mlb.fit_transform(data['languages'])
languages_encoded[:1]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]])

BERT embeddings for role and project descriptions

In [None]:
# TF Hub handles: the text preprocessor and a small uncased English BERT encoder
bert_preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
bert_model_url = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2'

# Wrap both hub modules as Keras layers
bert_preprocess_model = hub.KerasLayer(bert_preprocess_url)
bert_model = hub.KerasLayer(bert_model_url)

# Every row's free-text role description, as a plain Python list
role_texts = list(data['role'])

# Preprocess the role texts, then run them through the encoder
role_preprocessed = bert_preprocess_model(role_texts)
role_results = bert_model(role_preprocessed)

role_results.keys()

dict_keys(['pooled_output', 'default', 'encoder_outputs', 'sequence_output'])

In [None]:
# Use the encoder's 'pooled_output' entry as the role embedding
roles_embedded = role_results['pooled_output']

roles_embedded.shape

TensorShape([539, 128])

In [None]:
# Flatten all rows' project descriptions into one list, row by row, so each
# user's projects form a contiguous run.
projects = []
for raw_project_list in data['projects'].tolist():
  projects.extend(ast.literal_eval(raw_project_list))

# Embed every project description with the same preprocessor and encoder
# that were used for the roles.
projects_preprocessed = bert_preprocess_model(projects)
projects_results = bert_model(projects_preprocessed)


In [None]:
# Row position of each user -> that user's parsed list of project descriptions
map_user_to_projects = {
    user_num: ast.literal_eval(projects_list)
    for user_num, projects_list in enumerate(data['projects'])
}

map_user_to_projects[0]


['A collection of tasks to enable execution MySQL commands or scripts by VSTS/TFS Windows Agents.',
 'Microsoft Azure Workshop for Developers',
 'PowerShell module for interacting with Bluetooth Low Energy (BLE) devices implementing the Generic Attribute Profile (GATT/GAP)',
 'Pre-commit hooks collection that utilizes ChatGPT and OpenAI platform to validate changes made to the codebase.',
 'This GitHub Action allows you to run Gitleaks in your GitHub workflow.',
 'A MkDocs plugin that lets you exclude/include docs files using globs, regexes, gitignore-style file and Markdown/FrontMatter tags metadata.',
 'MkDocs Video plugin',
 'OSC message decoder/encoder with fault tolerant',
 'Detect non-inclusive language in your source code.']

In [None]:
projects_results.keys()

dict_keys(['pooled_output', 'default', 'encoder_outputs', 'sequence_output'])

In [None]:
# Use the encoder's 'pooled_output' entry as the project embedding
projects_embedded = projects_results['pooled_output']

In [None]:
# Print the shape of each feature block
feature_blocks = (
    experience_level_encoded,
    languages_encoded,
    roles_embedded,
    projects_embedded,
)
for feature_block in feature_blocks:
  print(feature_block.shape)

(539, 3)
(539, 95)
(539, 128)
(10437, 128)


In [None]:
# Build one feature row per user by joining, along axis 1, the role embedding,
# the one-hot experience level and the multi-hot language vector (in that order)
user_profiles = tf.concat([roles_embedded, experience_level_encoded, languages_encoded], axis=1)
user_profiles.shape

TensorShape([539, 226])

In [None]:
user_profiles[0]

<tf.Tensor: shape=(226,), dtype=float32, numpy=
array([-0.9998977 ,  0.02674753, -0.95229065,  0.07566944, -0.98473793,
        0.8515617 , -0.9891793 , -0.06120098,  0.07552765, -0.00853382,
       -0.33273837, -0.07438556, -0.13768643,  0.99698657,  0.20532432,
        0.00142703,  0.9212176 ,  0.066608  , -0.86612004, -0.28353733,
        0.5697272 ,  0.06278628, -0.8876335 , -0.82182   , -0.9968265 ,
       -0.06983151, -0.9948706 ,  0.974122  ,  0.81316996,  0.1028368 ,
        0.09937278, -0.0886519 , -0.86351496, -0.9690397 ,  0.7592347 ,
        0.996806  , -0.92572796, -0.00526048,  0.51379555, -0.9914838 ,
        0.9527194 ,  0.9840963 , -0.99205375,  0.84864724, -0.88819516,
       -0.19925441, -0.86289763,  0.9918394 , -0.5801045 ,  0.9990769 ,
       -0.5767053 , -0.9606845 , -0.01358342,  0.89274883,  0.8692733 ,
        0.59175795, -0.12701385,  0.19582532,  0.9900831 ,  0.7635511 ,
        0.08241388, -0.01632024, -0.21319991,  0.7888896 , -0.6095279 ,
       -0.998157

In [None]:
# Print the parsed project lists of users 0-19
for i in range(20):
  print(f"User {i}: {map_user_to_projects[i]}")

User 0: ['A collection of tasks to enable execution MySQL commands or scripts by VSTS/TFS Windows Agents.', 'Microsoft Azure Workshop for Developers', 'PowerShell module for interacting with Bluetooth Low Energy (BLE) devices implementing the Generic Attribute Profile (GATT/GAP)', 'Pre-commit hooks collection that utilizes ChatGPT and OpenAI platform to validate changes made to the codebase.', 'This GitHub Action allows you to run Gitleaks in your GitHub workflow.', 'A MkDocs plugin that lets you exclude/include docs files using globs, regexes, gitignore-style file and Markdown/FrontMatter tags metadata.', 'MkDocs Video plugin', 'OSC message decoder/encoder with fault tolerant', 'Detect non-inclusive language in your source code.']
User 1: ['DL based processing of atac-seq data', 'Community-curated list of software packages and data resources for single-cell, including RNA-seq, ATAC-seq, etc.', 'Hackathon team: gene expression analysis for Covid-19', 'Examples of single-cell genomic an

In [None]:
data.head(15)

Unnamed: 0,role,experience_level,languages,tech_keywords,projects,experience_level_num
5,Senior Software Engineer @microsoft,Beginner,"[Shell, PowerShell, C#, Python, JavaScript]","['github', 'node', 'video', 'action', 'woke', ...",['A collection of tasks to enable execution My...,0
6,Principal AI Scientist at Genentech. Formerly ...,Beginner,"[Python, Shell, R, Jupyter Notebook, Dockerfile]","['ecoli_promoter_mpra', 'training', 'women', '...","['DL based processing of atac-seq data', 'Comm...",0
8,CEO and Full-Stack Developer at SULLE WAREHOUSE,Beginner,"[C, C++, CMake, Shell, Assembly]","['b', 'pixels', 'linux', 'parser', 'expression...","['The README for my personal GitHub account', ...",0
9,"Machine learning, quantum computing, and every...",Beginner,"[Python, Jupyter Notebook, Shell, C++, Starlark]","['Cybernetics', 'elden_bot', 'quantum', 'Quant...",['Final Project for CSCI 2500 Computer Organiz...,0
11,"Co-founder, @levelshealth.",Beginner,"[JavaScript, Scala, CSS, HTML, PHP]","['sublime', 'play', 'app', 'workflow', 'exampl...","['Alfred App Workflow for caniuse.com', 'Stati...",0
17,Lead Engineer at Nike.\n\nMostly cloud service...,Intermediate,"[JavaScript, TypeScript, PHP, CSS, HTML]","['mee.js', 'Advanced', 'class.js', 'node', 'cu...",['SecretParser Plugin for the game http://www....,1
21,Physical oceanographer with a love of open sou...,Intermediate,"[Jupyter Notebook, Python, HTML, Shell, TeX]","['MLINT', 'Notebooks', 'MITgcm66h', 'publicati...",['Repo for the April 10-12 workshop to be held...,1
22,"Tinker, build, make, share.",Intermediate,"[C++, C, JavaScript, HTML, CSS]","['ESP32Tests', 'eyeball_simulator', 'BlinkRC7'...",['Abandoned: attempt to make 15-tile sliding p...,1
26,:-),Intermediate,"[Python, Shell, C, Java, C++]","['MSRnet', 'codeBase', 'discord', 'CO224_Compu...",['ABS: Scanning Neural Networks for Back-doors...,1
28,"Signal processing, radar, data science",Intermediate,"[Shell, Batchfile, Python, Makefile, C]","['setup', 'asciimatics', 'adwaita', 'icon', 'c...",['A conda-smithy repository for adwaita-icon-t...,1


In [None]:
# Training examples, collected as (user_profile, project_embedding, label) triples
dataset = []

In [None]:
len(map_user_to_projects[1])

10

In [None]:
# Positive pairs (label 1): each user's profile paired with that user's own
# project embeddings. A running offset marks where the current user's slice
# of projects_embedded begins.
start_project_range = 0
for user_num, user_profile in enumerate(user_profiles):
  n_projects = len(map_user_to_projects[user_num])
  end_project_range = start_project_range + n_projects
  for project_embedding in projects_embedded[start_project_range:end_project_range]:
    dataset.append((user_profile, project_embedding, 1))
  start_project_range = end_project_range
  # Stop after users 0-4
  if user_num == 4:
    break


Better manual method: adding negative examples by hand

In [None]:
# manual small dataset testing

def find_project_start_index_for_given_user(user_num):
  """Return the number of projects that belong to users 0 .. user_num - 1.

  The result is used as the offset of user `user_num`'s first project when
  indexing the flat `projects_embedded` tensor.

  Args:
    user_num: non-negative integer user position (a key of
      `map_user_to_projects`).

  Returns:
    The summed length of the project lists of all preceding users
    (0 for user 0).

  Raises:
    KeyError: if `user_num` is negative, or if one of the preceding users is
      missing from `map_user_to_projects`.
  """
  if user_num < 0:
    # The former `while i != user_num` loop could only end in a KeyError for
    # negative input, after walking the whole mapping; fail right away.
    raise KeyError(user_num)
  return sum(len(map_user_to_projects[i]) for i in range(user_num))

def add_negative_example_given_non_matching_user_profiles_manually(user1: int, non_matching_user2: int):
  """Append label-0 examples pairing user1's profile with another user's projects.

  For every project of `non_matching_user2`, one triple
  (user_profiles[user1], that project's embedding, 0) is appended to the
  global `dataset`.
  """
  first_project = find_project_start_index_for_given_user(non_matching_user2)
  project_count = len(map_user_to_projects[non_matching_user2])
  dataset.extend(
      (user_profiles[user1], projects_embedded[first_project + offset], 0)
      for offset in range(project_count)
  )

In [None]:
# (user, non-matching user) pairs: the first user's profile is combined with
# the second user's projects as negative examples
manual_negative_pairs = [(0, 1), (1, 2), (3, 5), (4, 14)]
for profile_user, projects_user in manual_negative_pairs:
  add_negative_example_given_non_matching_user_profiles_manually(profile_user, projects_user)


In [None]:
len(dataset)

137

In [None]:
dataset[0]

(<tf.Tensor: shape=(226,), dtype=float32, numpy=
 array([-0.9998977 ,  0.02674753, -0.95229065,  0.07566944, -0.98473793,
         0.8515617 , -0.9891793 , -0.06120098,  0.07552765, -0.00853382,
        -0.33273837, -0.07438556, -0.13768643,  0.99698657,  0.20532432,
         0.00142703,  0.9212176 ,  0.066608  , -0.86612004, -0.28353733,
         0.5697272 ,  0.06278628, -0.8876335 , -0.82182   , -0.9968265 ,
        -0.06983151, -0.9948706 ,  0.974122  ,  0.81316996,  0.1028368 ,
         0.09937278, -0.0886519 , -0.86351496, -0.9690397 ,  0.7592347 ,
         0.996806  , -0.92572796, -0.00526048,  0.51379555, -0.9914838 ,
         0.9527194 ,  0.9840963 , -0.99205375,  0.84864724, -0.88819516,
        -0.19925441, -0.86289763,  0.9918394 , -0.5801045 ,  0.9990769 ,
        -0.5767053 , -0.9606845 , -0.01358342,  0.89274883,  0.8692733 ,
         0.59175795, -0.12701385,  0.19582532,  0.9900831 ,  0.7635511 ,
         0.08241388, -0.01632024, -0.21319991,  0.7888896 , -0.6095279 ,
  

In [None]:
import random

# Shuffle the dataset in place with a fixed seed, so the example order (and
# therefore the later train/validation split) is the same on every run.
# A dedicated Random instance leaves the global random state untouched;
# 42 matches the random_state passed to train_test_split.
random.Random(42).shuffle(dataset)

In [None]:
# Separate each (user_profile, project_embedding, label) triple into the
# input pair and its label
features = []
labels = []
for profile_vec, project_vec, pair_label in dataset:
  features.append((profile_vec, project_vec))
  labels.append(pair_label)

In [None]:
# Keep `features` as a plain list of (user_profile, project_embedding) pairs.
# The two members of a pair have different lengths (226 vs 128), so
# np.array(features) would build a ragged array: that emits a
# VisibleDeprecationWarning on older NumPy and raises ValueError from
# NumPy 1.24 on. The per-input arrays are built later from zip(*features),
# which works on a list just as well.
features = list(features)
labels = np.array(labels)

  features = np.array(features)


In [None]:
len(labels)

137

In [None]:
# Hyperparameters (you can adjust these based on your needs)
embedding_size = 128  # Size of the final embeddings
dense_units = 64      # Number of units in the dense layers
dropout_rate = 0.2    # Dropout rate for regularization

# Input widths are read from the feature tensors instead of being hard-coded
# (previously 226 and 128), so the model keeps matching the data when e.g.
# the number of language columns changes.
user_profile_dim = user_profiles.shape[-1]
project_embedding_dim = projects_embedded.shape[-1]


def _embedding_branch(branch_input):
    """Apply Dense(dense_units) -> Dropout -> Dense(embedding_size) to one input."""
    hidden = Dense(dense_units, activation='relu')(branch_input)
    hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)
    return Dense(embedding_size, activation='relu')(hidden)


# User Profile Branch
user_input = Input(shape=(user_profile_dim,))
user_branch = _embedding_branch(user_input)

# Project Description Branch
project_input = Input(shape=(project_embedding_dim,))
project_branch = _embedding_branch(project_input)

In [None]:
# Distance Layer
def euclidean_distance(vectors):
    '''Row-wise Euclidean distance between the two tensors in `vectors`.

    The squared differences are summed over axis 1 (that axis is kept) and
    clamped from below at the Keras epsilon before the square root is taken.
    '''
    left, right = vectors
    squared_distance = tf.reduce_sum(tf.square(left - right), axis=1, keepdims=True)
    clamped = tf.maximum(squared_distance, tf.keras.backend.epsilon())
    return tf.sqrt(clamped)

# Lambda layer that applies euclidean_distance to the two branch outputs
distance = Lambda(euclidean_distance)([user_branch, project_branch])

# Siamese Network Model: takes [user profile, project embedding] and outputs their distance
siamese_network = Model(inputs=[user_input, project_input], outputs=distance)

# Contrastive Loss Function
def contrastive_loss(y_true, y_pred):
    '''Contrastive loss on predicted distances, using a margin of 1.

    Where y_true is 1 the squared distance is penalised; where y_true is 0
    the squared shortfall of the distance below the margin is penalised
    (zero once the distance reaches the margin). Returns the mean over all
    elements.
    '''
    margin = 1
    matching_term = y_true * tf.square(y_pred)
    non_matching_term = (1 - y_true) * tf.square(tf.maximum(margin - y_pred, 0))
    return tf.reduce_mean(matching_term + non_matching_term)

# Define a custom accuracy metric
def accuracy(y_true, y_pred):
    '''Classification accuracy with a fixed threshold on distances.

    A predicted distance below 0.5 counts as label 1, anything else as
    label 0; the result is the binary accuracy of those labels against y_true.
    '''
    predicted_label = tf.cast(y_pred < 0.5, dtype=tf.float32)
    return tf.keras.metrics.binary_accuracy(y_true, predicted_label)

# Compile the model: Adam optimizer, contrastive loss on the predicted
# distance, and the custom thresholded accuracy metric
siamese_network.compile(optimizer='adam', loss=contrastive_loss, metrics=[accuracy])

# Print the layer-by-layer model summary
siamese_network.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 226)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 dense (Dense)               (None, 64)                   14528     ['input_1[0][0]']             
                                                                                                  
 dense_2 (Dense)             (None, 64)                   8256      ['input_2[0][0]']             
                                                                                              

In [None]:
# Prepare data for training: turn the (user_profile, project_embedding)
# pairs into one array per model input.
# NOTE(review): this rebinds `user_profiles`, the name earlier cells use for
# the per-user profile tensor; re-running those cells after this one will
# see the per-example array instead.
user_profiles, project_embeddings = (np.array(column) for column in zip(*features))
labels = np.array(labels)

In [None]:
# Splitting data into training and validation sets (20% held out, fixed seed)
from sklearn.model_selection import train_test_split

# Pair the two inputs up so that both are split with the same row selection
paired_inputs = list(zip(user_profiles, project_embeddings))
X_train, X_val, y_train, y_val = train_test_split(
    paired_inputs,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preparing data for the model: unzip the (user_profile, project_embedding)
# pairs of each split back into two parallel tuples, one per model input
user_profiles_train, project_embeddings_train = zip(*X_train)
user_profiles_val, project_embeddings_val = zip(*X_val)

In [None]:
# Convert labels to float32, the dtype used in the loss arithmetic
# (y_true * ..., 1 - y_true)
y_train = y_train.astype('float32')
y_val = y_val.astype('float32')

In [None]:
# Training the model
# Each input list is ordered like the model's inputs: [user profile, project embedding]
train_inputs = [np.array(user_profiles_train), np.array(project_embeddings_train)]
val_inputs = [np.array(user_profiles_val), np.array(project_embeddings_val)]

history = siamese_network.fit(
    train_inputs,
    np.array(y_train),
    validation_data=(val_inputs, np.array(y_val)),
    epochs=100,  # You can adjust the number of epochs
    batch_size=32  # And the batch size
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
def create_user_vector(role: str, experience_level: str, languages: list[str]):
  """Build a single user-profile row laid out like the training profiles.

  The row is the concatenation (along axis 1) of the BERT pooled embedding of
  `role`, the one-hot experience level and the multi-hot language vector.

  Args:
    role: free-text role description.
    experience_level: a key of `experience_level_mapping`; any other value
      falls back to the last class, as the former `else` branch did.
    languages: language names, encoded with the already fitted `mlb`.

  Returns:
    A float32 tensor with one row.
  """
  # Use the mapping that produced the training encoding instead of a
  # hard-coded Beginner/Intermediate/other order, so the two always agree.
  num_levels = len(experience_level_mapping)
  level_index = experience_level_mapping.get(experience_level, num_levels - 1)
  # to_categorical on a one-element list already yields shape (1, num_levels)
  experience_vector = tf.constant(
      to_categorical([level_index], num_classes=num_levels), dtype=tf.float32
  )

  # transform, not fit_transform: the binarizer was fitted on the training
  # data and must not be re-fitted on a single user's languages.
  languages_vector = tf.cast(mlb.transform([languages]), tf.float32)

  pred_role_preprocessed = bert_preprocess_model([role])
  pred_role_results = bert_model(pred_role_preprocessed)
  pred_roles_embedded = pred_role_results['pooled_output']

  return tf.concat([pred_roles_embedded, experience_vector, languages_vector], axis=1)



In [None]:
# Example user profile
example_role = "Principal AI Scientist at Genentech. Formerly "
example_languages = ["Python", "Shell", "R", "Jupyter Notebook", "Dockerfile"]
user_profile_example = create_user_vector(example_role, "Beginner", example_languages)

# Example project, embedded with the same preprocessor and encoder
example_project = ['Community-curated list of software packages and data resources for single-cell, including RNA-seq, ATAC-seq, etc.']
example_project_preprocessed = bert_preprocess_model(example_project)
project_embedding_example = bert_model(example_project_preprocessed)['pooled_output']

# Make a prediction
# NOTE: the network outputs the Euclidean distance between the two branch
# embeddings, so despite the "Similarity Score" label a smaller value means
# a closer match.
similarity_score = siamese_network.predict([user_profile_example, project_embedding_example])

# Output the score of the single example pair
print("Similarity Score:", similarity_score[0][0])

Similarity Score: 0.76330703
