# Codeforces Problem Recommender

### Import The Required Libraries

In [101]:
import math
import torch
from torch import nn, Tensor
from torchtext.vocab import vocab
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import pandas as pd
import numpy as np
import requests

### Enter Your Codeforces Username Here

In [102]:
target_user_handle = 'shlokagrawal'

### Retrieving User Information from Codeforces API

In [103]:
url = "https://codeforces.com/api/user.info?handles=" + target_user_handle

# Make a GET request to the API endpoint
response = requests.get(url)
if response.status_code == 200:
    data = response.json()

    if 'status' in data and data['status'] == 'OK':
        user_info = data['result'][0]  
        print(f"Handle: {user_info['handle']}")
        rating = user_info['rating']
        print(f"Rating: {rating}")
    else:
        print("API request was not successful.")
else:
    print(f"Request failed with status code: {response.status_code}")

Handle: shlokagrawal
Rating: 1911


### Importing The Required CSV Files

In [104]:
if rating < 1200:
    path = "data/newbie_to_pupil/"
elif rating >= 1200 and rating < 1400:
    path = "data/pupil_to_specialist/"
elif rating >= 1400 and rating < 1600:
    path = "data/specialist_to_expert/"
elif rating >= 1600 and rating < 1900:
    path = "data/expert_to_cm/"
elif rating >= 1900 and rating < 2100:
    path = "data/cm_to_m/"
elif rating >= 2100 and rating < 2300:
    path = "data/m_to_im/"
elif rating >= 2300 and rating < 2400:
    path = "data/im_to_gm/"
elif rating >= 2400 and rating < 2600:
    path = "data/gm_to_igm/"
elif rating >= 2600 and rating < 3000:
    path = "data/igm_to_lgm/"
else:
    print("Congrats on being a Legendary Grand Master!")
    path = "data/igm_to_lgm/"

model_path = "models" + path[4:-1] + ".pth"

interactions = pd.read_csv(path + "user_problem.csv")
user_tags = pd.read_csv(path + "user_tags.csv")
user_ratings = pd.read_csv(path + "user_ratings.csv")

### Pre-Processing

In [105]:
interactions.drop(columns=['problem_rating', 'problem_tags'], inplace=True)
user_ratings.drop(columns=['undefined'], inplace=True)
user_tags.rename(columns={'0user_handle': 'user_id'}, inplace=True)
user_ratings.rename(columns={'0user_handle': 'user_id'}, inplace=True)
interactions.rename(columns={'user_handle': 'user_id'}, inplace=True)

In [106]:
null_percentage = user_tags.isna().mean() * 100
# Get names of columns with more than 50% null values
columns_to_drop = null_percentage[null_percentage > 50].index

# Drop columns with more than 50% null values
user_tags = user_tags.drop(columns=columns_to_drop)

null_percentage = user_ratings.isna().mean() * 100

In [107]:
interactions_copy = interactions.copy()
# Get names of columns with more than 50% null values
columns_to_drop = null_percentage[null_percentage > 60].index

# Drop columns with more than 50% null values
user_ratings = user_ratings.drop(columns=columns_to_drop)

In [108]:
problems_per_user = interactions.groupby('user_id').problem_id.count()
users_per_problem = interactions.groupby('problem_id').user_id.count()

print(f"Total No. of users: {len(interactions.user_id.unique())}")
print(f"Total No. of problems: {len(interactions.problem_id.unique())}")
print("\n")

print(f"Max no. of problems per user: {problems_per_user.max()}")
print(f"Min no. of problems per user: {problems_per_user.min()}")
print(f"Median no. of problems per user: {problems_per_user.median()}")
print("\n")

print(f"Max no. of users per problem: {users_per_problem.max()}")
print(f"Min no. of users per problem: {users_per_problem.min()}")
print(f"Median no. of users per problem: {users_per_problem.median()}")

Total No. of users: 185
Total No. of problems: 7607


Max no. of problems per user: 738
Min no. of problems per user: 62
Median no. of problems per user: 111.0


Max no. of users per problem: 24
Min no. of users per problem: 1
Median no. of users per problem: 3.0


In [109]:
user_tags.fillna(0, inplace=True)
user_ratings.fillna(0, inplace=True)

In [110]:
user_tags

Unnamed: 0,*special,user_id,binary search,bitmasks,brute force,combinatorics,constructive algorithms,data structures,dfs and similar,divide and conquer,...,interactive,math,matrices,number theory,probabilities,shortest paths,sortings,strings,trees,two pointers
0,0.0,maspy,32.0,11.0,56,6.0,45,34,29.0,4.0,...,0.0,91,1.0,15.0,0.0,12.0,55,24.0,18.0,21.0
1,0.0,wsyear,25.0,15.0,47,18.0,61,46,27.0,4.0,...,6.0,79,1.0,17.0,3.0,3.0,24,11.0,24.0,15.0
2,0.0,LXH-cat,51.0,23.0,84,51.0,118,94,53.0,14.0,...,27.0,161,7.0,43.0,18.0,13.0,54,24.0,46.0,29.0
3,1.0,skittles1412,3.0,1.0,5,0.0,8,1,1.0,0.0,...,0.0,14,0.0,0.0,0.0,0.0,4,8.0,1.0,1.0
4,0.0,PurpleCrayon,30.0,28.0,77,18.0,101,68,54.0,9.0,...,6.0,181,5.0,36.0,4.0,9.0,60,41.0,38.0,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0.0,Osmabnlden,28.0,15.0,51,25.0,49,29,20.0,2.0,...,0.0,107,1.0,30.0,2.0,2.0,30,26.0,12.0,18.0
181,4.0,Carmel_Ab1,100.0,49.0,226,40.0,181,104,61.0,8.0,...,19.0,361,2.0,99.0,6.0,15.0,158,106.0,28.0,60.0
182,26.0,valavshonok,42.0,23.0,106,29.0,86,40,32.0,4.0,...,4.0,232,1.0,39.0,8.0,10.0,72,69.0,10.0,30.0
183,0.0,Erinyes,33.0,20.0,54,8.0,77,26,24.0,9.0,...,9.0,65,2.0,18.0,2.0,9.0,42,11.0,14.0,21.0


In [111]:
user_ratings

Unnamed: 0,user_id,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300,2400,800,900
0,maspy,13.0,13.0,23.0,23,14.0,24.0,17.0,26.0,19.0,26.0,17.0,24.0,18.0,3.0,3.0,43,16.0
1,wsyear,15.0,6.0,17.0,8,8.0,8.0,11.0,15.0,12.0,6.0,12.0,10.0,9.0,9.0,7.0,33,5.0
2,LXH-cat,16.0,13.0,17.0,12,14.0,15.0,33.0,33.0,27.0,18.0,28.0,24.0,21.0,19.0,12.0,47,4.0
3,skittles1412,3.0,1.0,3.0,1,2.0,2.0,4.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,16,1.0
4,PurpleCrayon,31.0,22.0,27.0,31,32.0,38.0,29.0,32.0,23.0,25.0,18.0,16.0,9.0,13.0,16.0,77,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,Osmabnlden,11.0,13.0,10.0,12,14.0,11.0,13.0,18.0,18.0,21.0,10.0,8.0,3.0,1.0,1.0,46,11.0
181,Carmel_Ab1,81.0,77.0,80.0,104,65.0,110.0,57.0,57.0,37.0,32.0,17.0,14.0,2.0,2.0,2.0,242,74.0
182,valavshonok,37.0,43.0,19.0,64,76.0,22.0,22.0,7.0,8.0,1.0,1.0,5.0,0.0,1.0,0.0,282,22.0
183,Erinyes,5.0,7.0,11.0,10,8.0,4.0,8.0,6.0,5.0,15.0,38.0,23.0,6.0,4.0,1.0,40,10.0


In [112]:
interactions["problem_id"] = interactions["problem_id"].apply(lambda x: f"problem_{x}")
interactions["user_id"] = interactions["user_id"].apply(lambda x: f"user_{x}")
user_tags["user_id"] = user_tags["user_id"].apply(lambda x: f"user_{x}")
user_ratings["user_id"] = user_ratings["user_id"].apply(lambda x: f"user_{x}")

### Creating Vocabulary

In [113]:
np.random.seed(42)
# Generating a list of unique problem ids
problem_ids = interactions.problem_id.unique()

# Counter is used to feed problems to movive_vocab
problem_counter = Counter(problem_ids)

# Genarting vocabulary
problem_vocab = vocab(problem_counter, specials=['<unk>'])

# For indexing input ids
problem_vocab_stoi = problem_vocab.get_stoi()

# problem to title mapping dictionary
# problem_title_dict = dict(zip(problems.problem_id, problems.title))

# Similarly generating a vocabulary for user ids
user_ids = interactions.user_id.unique()
user_counter = Counter(user_ids)
user_vocab = vocab(user_counter, specials=['<unk>'])
user_vocab_stoi = user_vocab.get_stoi()

### GRU Model Definition

In [114]:
class GRUModel(nn.Module):
    def __init__(self, ntoken: int, nuser: int, d_model: int, d_hid: int, nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'GRU'

        # Embedding layers
        self.problem_embedding = nn.Embedding(ntoken, d_model)
        self.user_embedding = nn.Embedding(nuser, d_model)
        self.feature_embedding = nn.Embedding(1000, d_model)

        # GRU layers
        self.GRU = nn.LSTM(d_model, d_hid, nlayers, batch_first=True, dropout=dropout)

        self.d_model = d_model

        # Linear layer to map the GRU output to problem vocabulary
        self.linear = nn.Linear(48*d_hid, ntoken)
#         self.linear = nn.Sequential(
#             nn.Linear(31*d_hid,31*d_hid//2),
#             nn.Linear(31*d_hid//2, 31*d_hid//4),
#             nn.Dropout(0.2),
#             nn.Linear(31*d_hid//4, 31*d_hid//8),
#             nn.Linear(31*d_hid//8, ntoken)
#         )

        self.init_weights()

    def init_weights(self) -> None:
        # Initializing the weights of the embedding and linear layers
        initrange = 0.1
        self.problem_embedding.weight.data.uniform_(-initrange, initrange)
        self.user_embedding.weight.data.uniform_(-initrange, initrange)
        self.feature_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, user: Tensor, other_features_batch) -> Tensor:
        # Embedding problem ids and user id
        
#         print("user shape",user.shape)
        problem_embed = self.problem_embedding(src)* math.sqrt(self.d_model)
        user_embed = self.user_embedding(user)* math.sqrt(self.d_model)
        
        # print("problem_embed shape:", problem_embed.shape)
#         print("user_embed shape:", user_embed.shape)

        # Pass the combined embeddings through GRU layers
        GRU_output, _ = self.GRU(problem_embed)
        
#         print("GRU_output", GRU_output.shape)
        
        other_features_embed = []
        for i, (feature_name, feature_tensor) in enumerate(other_features_batch.items()):
#             print(f"Processing other feature '{feature_name}'")
#             print("Feature tensor shape:", feature_tensor.shape)
            other_feature_embedding = self.feature_embedding(feature_tensor)* math.sqrt(self.d_model)
            other_features_embed.append(other_feature_embedding)
#             print("Other feature embedding shape:", other_feature_embedding.shape)
        
        
        other_features_embed = torch.cat(other_features_embed, dim=-1)
        other_features_embed = other_features_embed.expand(-1, GRU_output.size(1), -1)

        user_embed = user_embed.expand(-1, GRU_output.size(1), -1)
#         print("user_embed after expansion", user_embed.shape)

        output = torch.cat((GRU_output, user_embed, other_features_embed), dim=-1)

        # print("hello", output.shape)

        # Apply linear layer to obtain the output logits
        output = self.linear(output)

        return output


### Defining Hyperparameters and Loading the Model

In [115]:
ntokens = len(problem_vocab)  # size of vocabulary
nusers = len(user_vocab)
d_model = 128  # embedding dimension (maybe 512?)
d_hid = 128  # dimension of the GRU hidden states
nlayers = 2  # number of GRU layers
dropout = 0.2  # dropout probability

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GRUModel(ntokens, nusers, d_model, d_hid, nlayers, dropout).to(device)

# Load the model's state dictionary
model.load_state_dict(torch.load("models/cm_to_m_new.pth"))

<All keys matched successfully>

### Code for Generating Recommendations

In [116]:
def generate_recommendation(data, problem_set, k=10):
    model.eval()
    user_id = data[0]
    problem_sequence = data[1]
    input_sequence = problem_sequence[:-1]
    # Tokenize and numerically encode the user id and problem sequence
    user_tensor = torch.tensor(user_vocab_stoi[user_id])
    problem_tensor = torch.tensor([[problem_vocab_stoi[problem_id]] for problem_id in input_sequence])
    # Shape: [1, 1]
    user_tensor = user_tensor.unsqueeze(0).to(device)
    user_tensor = user_tensor.view(user_tensor.shape[0], 1)

    # Shape: [1, seq_length]
    problem_tensor = problem_tensor.unsqueeze(0).to(device)[0]
    problem_tensor = problem_tensor.view(1, problem_tensor.shape[0])
    
    other_features_batch={}
    
    for i, feature in enumerate(data[2:]):
            feature_name = f'feature_{i+1}'
            other_features_batch[feature_name] = torch.tensor(feature).int()
            other_features_batch[feature_name] = other_features_batch[feature_name].unsqueeze(0).to(device)
            other_features_batch[feature_name] = other_features_batch[feature_name].view(other_features_batch[feature_name].shape[0], 1)
    
#     print("user_tensor", user_tensor.shape)
#     print("problem_tensor", problem_tensor.shape)

    # Pass the tensors through the model
    with torch.no_grad():
        predictions = model(problem_tensor, user_tensor, other_features_batch)

    # The output is a probability distribution over the next problem.
    # Topk to get most probable problems
    values, indices = predictions.topk(k + len(input_sequence), dim=-1)

    # Eliminate already watched problems
    indices = [indice for indice in indices[-1, :][0] if indice not in problem_tensor]
    predicted_problems_tmp = [problem_vocab.get_itos()[problem] for problem in indices]
    count = 0
    predicted_problems = []
    i = 0
    while count < k and i < len(predicted_problems_tmp):
        if predicted_problems_tmp[i] not in problem_set:
            predicted_problems.append(predicted_problems_tmp[i])
            count += 1
        i += 1
    #print (len(predicted_problems_tmp))
    return predicted_problems

### Getting User Features from Codeforces API

In [117]:
# Define the URL of the API endpoint you want to call
url = "https://codeforces.com/api/user.status?handle=" + target_user_handle

# Make a GET request to the API endpoint
response = requests.get(url)
data = response.json()

target_tags = {}
target_ratings = {}
problem_set = set()

for submission in data['result']:
    if 'verdict' in submission and submission['verdict'] == 'OK':
        for tag in submission['problem']['tags']:
            target_tags[tag] = target_tags.get(tag, 0) + 1
        if 'rating' in submission['problem']:
            target_ratings[submission['problem']['rating']] = target_ratings.get(submission['problem']['rating'], 0) + 1
        if 'contestId' in submission['problem'] :
            problem_set.add('problem_' + str(submission['problem']['contestId']) + ':' + submission['problem']['index'])

user_ratings.drop(columns=['user_id'], inplace=True)
user_tags.drop(columns=['user_id'], inplace=True)

all_tags = user_tags.columns.tolist()
all_ratings = user_ratings.columns.tolist()
features = []

for tag in all_tags:
    if tag != 'user_id' and tag in target_tags:
        features.append(target_tags[tag])
    else:
        features.append(0)

for rating in all_ratings:
    if rating != 'user_id' and int(rating) in target_ratings:
        features.append(target_ratings[int(rating)])
    else:
        features.append(0)

sequence_length = 20
problems = set(interactions_copy.problem_id.unique())
count = 0
stack = []
for submission in data['result']:
    if 'verdict' in submission and submission['verdict'] == "OK" and count < sequence_length:
        key = str(submission['problem']['contestId']) + ':' + str(submission['problem']['index'])
        encoding = 'problem_' + key
        if key in problems and encoding not in stack:
            stack.append(encoding)
            count += 1
    if count >= sequence_length:
        break
stack = stack[::-1]
print(stack)

['problem_1928:E', 'problem_1932:G', 'problem_1923:E', 'problem_1938:H', 'problem_1938:C', 'problem_1938:G', 'problem_1938:J', 'problem_1938:E', 'problem_1938:F', 'problem_1948:E', 'problem_1943:C', 'problem_1943:B', 'problem_1743:F', 'problem_1550:D', 'problem_1949:I', 'problem_1949:C', 'problem_1949:B', 'problem_102056:I', 'problem_1942:D', 'problem_1942:E']


In [118]:
len(features)

46

In [119]:
input = ['<unk>', stack] + features
input = np.array(input, dtype=object)
input

array(['<unk>',
       list(['problem_1928:E', 'problem_1932:G', 'problem_1923:E', 'problem_1938:H', 'problem_1938:C', 'problem_1938:G', 'problem_1938:J', 'problem_1938:E', 'problem_1938:F', 'problem_1948:E', 'problem_1943:C', 'problem_1943:B', 'problem_1743:F', 'problem_1550:D', 'problem_1949:I', 'problem_1949:C', 'problem_1949:B', 'problem_102056:I', 'problem_1942:D', 'problem_1942:E']),
       0, 118, 57, 173, 70, 163, 205, 95, 20, 199, 41, 4, 24, 14, 4, 98,
       315, 22, 227, 16, 310, 12, 95, 15, 24, 133, 59, 86, 70, 29, 26, 37,
       34, 45, 44, 53, 60, 52, 65, 54, 68, 53, 55, 26, 73, 18],
      dtype=object)

In [120]:
print (problem_set)

{'problem_104030:J', 'problem_1832:D2', 'problem_104114:A', 'problem_1729:D', 'problem_1148:B', 'problem_478:B', 'problem_215:A', 'problem_35:C', 'problem_61:A', 'problem_371:B', 'problem_1680:C', 'problem_1775:E', 'problem_1839:D', 'problem_198:A', 'problem_1679:B', 'problem_1670:E', 'problem_1872:F', 'problem_266:A', 'problem_1540:B', 'problem_1370:E', 'problem_1628:D1', 'problem_354:A', 'problem_1847:F', 'problem_1886:E', 'problem_1776:A', 'problem_152:C', 'problem_1665:C', 'problem_231:B', 'problem_1788:D', 'problem_1916:A', 'problem_1650:G', 'problem_1810:B', 'problem_103049:A', 'problem_1903:E', 'problem_104013:A', 'problem_460:B', 'problem_1592:E', 'problem_103388:N', 'problem_1721:C', 'problem_1684:E', 'problem_304:A', 'problem_1867:D', 'problem_1909:D', 'problem_463:C', 'problem_1634:A', 'problem_103640:J', 'problem_1842:E', 'problem_459:C', 'problem_104270:M', 'problem_1552:F', 'problem_1807:F', 'problem_1764:B', 'problem_1917:E', 'problem_216:C', 'problem_1719:B', 'problem_1

### Generating Recommendations

In [121]:
recos = '\n-'.join(generate_recommendation(input, problem_set))

print(f"Recomendations:\n-{recos}")

Recomendations:
-problem_104687:J
-problem_1812:F
-problem_999:F
-problem_1392:B
-problem_842:C
-problem_1243:B2
-problem_1169:A
-problem_717:C
-problem_1427:C
-problem_1428:E
