In [33]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [34]:
# Load the data
data = pd.read_csv('github_users_dataset.csv')

# Drop rows with missing values
data.dropna(inplace=True)

# drop rows where the first column is "role"
data = data[data['role'] != 'role']

# Preprocess 'experience_level' using Label Encoding
label_encoder = LabelEncoder()
data['experience_level'] = label_encoder.fit_transform(data['experience_level'])

# Tokenize 'role', 'languages', 'tech_keywords', and 'projects'
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(data['role'])
tokenizer.fit_on_texts(data['languages'].apply(lambda x: ', '.join(x)))
tokenizer.fit_on_texts(data['tech_keywords'].apply(lambda x: ', '.join(x)))
tokenizer.fit_on_texts(data['projects'].apply(lambda x: ', '.join(x)))

# Function to convert texts to padded sequences
def texts_to_padded_sequences(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=100)

# Convert columns to padded sequences
data['role'] = data['role'].apply(lambda x: texts_to_padded_sequences([x])[0])
data['languages'] = data['languages'].apply(lambda x: texts_to_padded_sequences([', '.join(x)])[0])
data['tech_keywords'] = data['tech_keywords'].apply(lambda x: texts_to_padded_sequences([', '.join(x)])[0])
data['projects'] = data['projects'].apply(lambda x: texts_to_padded_sequences([', '.join(x)])[0])


In [35]:
X = data[['role', 'experience_level', 'languages', 'tech_keywords']]
y = data['projects']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [36]:
# Define inputs
role_input = Input(shape=(100,), name="role_input")
exp_input = Input(shape=(1,), name="experience_level_input")
lang_input = Input(shape=(100,), name="languages_input")
tech_input = Input(shape=(100,), name="tech_keywords_input")

# Embeddings for text inputs
embedding = Embedding(input_dim=10000, output_dim=64)
role_embedding = embedding(role_input)
lang_embedding = embedding(lang_input)
tech_embedding = embedding(tech_input)

# LSTM layers for text inputs
role_lstm = LSTM(32)(role_embedding)
lang_lstm = LSTM(32)(lang_embedding)
tech_lstm = LSTM(32)(tech_embedding)

# Concatenate all inputs
concat = Concatenate()([role_lstm, exp_input, lang_lstm, tech_lstm])

# Dense layers
dense1 = Dense(64, activation='relu')(concat)
dropout = Dropout(0.5)(dense1)
output = Dense(100, activation='softmax')(dropout)

# Build and compile the model
model = Model(inputs=[role_input, exp_input, lang_input, tech_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 role_input (InputLayer)     [(None, 100)]                0         []                            
                                                                                                  
 languages_input (InputLaye  [(None, 100)]                0         []                            
 r)                                                                                               
                                                                                                  
 tech_keywords_input (Input  [(None, 100)]                0         []                            
 Layer)                                                                                           
                                                                                            

In [37]:
# Prepare the data for training
X_train_dict = {
    'role_input': np.array(X_train['role'].tolist()),
    'experience_level_input': np.array(X_train['experience_level']),
    'languages_input': np.array(X_train['languages'].tolist()),
    'tech_keywords_input': np.array(X_train['tech_keywords'].tolist())
}
y_train_array = np.array(y_train.tolist())

In [38]:
# Train the model
model.fit(X_train_dict, y_train_array, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x144f4df10>

In [39]:
def preprocess_user_data(role, experience_level, languages, tech_keywords):
    role_seq = texts_to_padded_sequences([role])[0]
    exp_level_seq = label_encoder.transform([experience_level])[0]
    lang_seq = texts_to_padded_sequences([', '.join(languages)])[0]
    tech_seq = texts_to_padded_sequences([', '.join(tech_keywords)])[0]
    return {
        'role_input': np.array([role_seq]),
        'experience_level_input': np.array([exp_level_seq]),
        'languages_input': np.array([lang_seq]),
        'tech_keywords_input': np.array([tech_seq])
    }

# Example user data
example_user = preprocess_user_data(
    role="@MajorLeagueBaseball Kubernetes SME & Cloud Platform Engineer; @cncf Ambassador Emeritus",
    experience_level="Intermediate",
    languages=['Shell', 'Dockerfile', 'Go', 'Makefile', 'JavaScript'],
    tech_keywords=['controller', 'docker', 'chaperone', 'external', 'automerge', 'action', 'pull', 'tools', 'merged', 'akuity']
)

# Predict
predicted_projects = model.predict(example_user)
# Convert predictions to actual project descriptions (you need a reverse mapping from sequences to text)




In [40]:
# Assuming 'predicted_projects' is the output of your model
top_n = 5  # Number of top tokens to consider for each project description
top_project_tokens = np.argsort(predicted_projects, axis=-1)[:, -top_n:]


In [41]:
# Create a reverse mapping from token to word
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function to convert tokens to words
def tokens_to_words(tokens):
    words = [reverse_word_map.get(token, '') for token in tokens]
    return ' '.join(words).strip()

# Convert top tokens to words
project_descriptions = [tokens_to_words(tokens) for tokens in top_project_tokens]
project_descriptions


['⃣ m 的 computer principal']

In [42]:
def clean_description(description):
    # Example function to clean up descriptions - customize as needed
    words = description.split()
    # Remove duplicates, keep the order
    cleaned_words = sorted(set(words), key=lambda x: words.index(x))
    return ' '.join(cleaned_words)

cleaned_project_descriptions = [clean_description(desc) for desc in project_descriptions]
cleaned_project_descriptions

['⃣ m 的 computer principal']