In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
import ast
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertModel
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load the GitHub users table and discard every row that has a missing value.
data = pd.read_csv('github_users_dataset.csv').dropna()
# Remove rows whose 'role' field is literally the string 'role'
# (presumably header lines repeated inside the CSV -- confirm against the file).
data = data.loc[data['role'] != 'role']

In [9]:
# Preview the first ten rows of the cleaned table.
data.head(n=10)

Unnamed: 0,role,experience_level,languages,tech_keywords,projects
1,full-stack SWE • leader & highly effective com...,Beginner,"['JavaScript', 'HTML', 'Vue', 'CSS', 'TypeScri...","['vue', 'random', 'chart', 'grind', 'pinia', '...","[""JavaScript & TypeScript solutions to the 'Gr..."
4,Senior Software Engineer @microsoft,Beginner,"['Shell', 'PowerShell', 'C#', 'Python', 'JavaS...","['github', 'node', 'video', 'action', 'woke', ...",['A collection of tasks to enable execution My...
5,Principal AI Scientist at Genentech. Formerly ...,Beginner,"['Python', 'Shell', 'R', 'Jupyter Notebook', '...","['ecoli_promoter_mpra', 'training', 'women', '...","['DL based processing of atac-seq data', 'Comm..."
7,CEO and Full-Stack Developer at SULLE WAREHOUSE,Beginner,"['C', 'C++', 'CMake', 'Shell', 'Assembly']","['b', 'pixels', 'linux', 'parser', 'expression...","['The README for my personal GitHub account', ..."
8,"Machine learning, quantum computing, and every...",Beginner,"['Python', 'Jupyter Notebook', 'Shell', 'C++',...","['Cybernetics', 'elden_bot', 'quantum', 'Quant...",['Final Project for CSCI 2500 Computer Organiz...
10,"Co-founder, @levelshealth.",Beginner,"['JavaScript', 'Scala', 'CSS', 'HTML', 'PHP']","['sublime', 'play', 'app', 'workflow', 'exampl...","['Alfred App Workflow for caniuse.com', 'Stati..."
12,UX Engineering Consulting. Cocktails.,Intermediate,"['JavaScript', 'HTML', 'CSS', 'Python', 'Shell']","['sentry', 'faker.js', 'route', 'cms', 'tldext...",['Broccoli plugin to add fingerprint checksums...
13,"\n5th year CS Ph.D. @ UC, Irvine\n\n",Beginner,"['Python', 'Shell', 'C++', 'Cuda', 'Jupyter No...","['HRBP', 'lstm_pm_pytorch', 'stingy', 'PPT', '...",['Brain tumor segmentation for Brats15 dataset...
14,@MajorLeagueBaseball Kubernetes SME & Cloud Pl...,Intermediate,"['Shell', 'Dockerfile', 'Go', 'Makefile', 'Jav...","['controller', 'docker', 'chaperone', 'externa...","['🏷️ GitHub Action to add labels', '🎣 GitHub A..."
15,Juggler with a penchant for software development,Intermediate,"['Shell', 'Python', 'JavaScript', 'HTML', 'CSS']","['engine', 'bootstrap_2to3', 'fetch', 'antenna...","[' PKGBUILDs for Arch Linux', 'RESTful HTTP cl..."


In [10]:
# Distinct experience labels found in the column.
experience = data['experience_level'].unique()

# Assign every label an integer id equal to its position in `experience`,
# turning the categorical column into numbers that can be one-hot encoded.
experience_level_mapping = dict(zip(experience, range(len(experience))))

# With GPT-2 there is no need for one-hot encoding; the integer id column is kept for that case.
data['experience_level_num'] = data['experience_level'].map(experience_level_mapping)

# One-hot encode the integer ids.
experience_level_encoded = to_categorical(data['experience_level_num'])
experience_level_encoded


array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [11]:
def _parse_language_list(value):
    """Return the languages of one row as a Python list.

    The CSV stores each list as its string repr (e.g. "['Python', 'Shell']"),
    which is parsed with ``ast.literal_eval``. Values that are not strings are
    returned unchanged, so re-running this cell after the column has already
    been parsed does not raise ``ValueError``.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# Parse the stringified lists (safe to execute more than once).
data['languages'] = data['languages'].apply(_parse_language_list)

# Vocabulary: every language that appears in at least one row.
languages = {lang for row_langs in data['languages'] for lang in row_langs}

# Multi-hot encode each row; columns follow the alphabetically sorted vocabulary.
mlb = MultiLabelBinarizer(classes=sorted(languages))
languages_encoded = mlb.fit_transform(data['languages'])
languages_encoded[:1]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0]])

In [12]:
# Load the pretrained BERT tokenizer and encoder from the same checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

def get_bert_embeddings(text):
    """Embed text with BERT by mean-pooling the final hidden states.

    Args:
        text: A string (or a list of strings) to embed. Inputs longer than
            512 tokens are truncated.

    Returns:
        A ``torch.Tensor`` of shape ``(batch, hidden_size)`` -- one pooled
        vector per input; a single string yields a batch of one.

    Notes:
        Sequences are padded only up to the longest item in the call rather
        than always to 512 tokens, and padding positions are excluded from the
        average through the attention mask. Padding every input to 512 and
        averaging over all positions makes each call far slower and lets the
        pad-token states dominate the pooled vector.
    """
    # Tokenize; `padding=True` pads to the longest sequence in this call only.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

        hidden = outputs.last_hidden_state
        # (batch, seq, 1) mask: 1 for real tokens, 0 for padding.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # Sum real-token vectors and divide by the real-token count;
        # the clamp guards against division by zero for an all-padding row.
        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    return embeddings

def _role_to_array(role_text):
    """Embed one role description and return the result as a NumPy array."""
    return get_bert_embeddings(role_text).numpy()


# Embed every entry of the 'role' column; afterwards data['role_embeddings']
# holds one NumPy array of BERT features per row.
data['role_embeddings'] = data['role'].apply(_role_to_array)


tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 21.7kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.49MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 7.56MB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 16.1kB/s]
model.safetensors: 100%|██████████| 440M/440M [00:08<00:00, 52.7MB/s] 


KeyboardInterrupt: 