In [69]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset
# CSVs read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
# Load Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,Input,BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping,ModelCheckpoint

# pd.options.display.max_rows
# pd.set_option('display.max_colwidth', -1)

In [71]:
# Load the train/test interaction logs and the challenge metadata table.
# DATA_DIR is the single place to edit when the data lives somewhere else.
DATA_DIR = '/content/drive/My Drive/ml/recommendation'
train = pd.read_csv(DATA_DIR + '/train/train.csv')
test = pd.read_csv(DATA_DIR + '/test/test.csv')
challenges = pd.read_csv(DATA_DIR + '/train/challenge_data.csv')

In [72]:
# Size of the training table as (rows, columns)
train.shape

(903916, 4)

In [73]:
# Number of distinct users in the training data
train['user_id'].nunique()

69532

In [74]:
# Number of distinct challenges in the training data
train['challenge'].nunique()

5348

In [75]:
# Preview the first rows of the training interactions
train.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [76]:
def apk(actual, predicted, k=3):
    """Average precision at k for one query.

    Args:
        actual: iterable of relevant items.
        predicted: iterable of predicted items, best first; only the
            first ``k`` are scored.
        k: cut-off rank.

    Returns:
        Sum of precision values at each rank where a new relevant item is
        found, divided by ``min(len(actual), k)``; ``0.0`` when ``actual``
        is empty.
    """
    relevant = list(actual)
    ranked = list(predicted)[:k]

    # Nothing can be a hit when there are no relevant items.
    if not relevant:
        return 0.0

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(ranked, start=1):
        if item not in relevant:
            continue
        # A repeated prediction is only credited the first time it appears.
        if item in ranked[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    return precision_sum / min(len(relevant), k)

def mapk(actual, predicted, k=3):
    """Mean average precision at k.

    ``actual`` and ``predicted`` are parallel iterables (paired with
    ``zip``); each pair is scored with ``apk`` and the scores are averaged
    with ``np.mean``.
    """
    per_query_scores = []
    for truth, ranking in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranking, k))
    return np.mean(per_query_scores)

In [77]:
# Preview the challenge metadata table
challenges.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [78]:
# Build the target table: every challenge a user attempted after position 10
# becomes its own (user_id, label) row.
is_target = train['challenge_sequence'] > 10
label = train.loc[is_target, ['user_id', 'challenge']].rename(columns={'challenge': 'label'})
label.head()

Unnamed: 0,user_id,label
10,4576,CI24958
11,4576,CI23667
12,4576,CI23691
23,4580,CI24915
24,4580,CI25727


In [79]:
# Collapse each user's challenges at positions <= 10 into one space-separated
# string so the history can be tokenized like a sentence.
history = train.loc[train['challenge_sequence'] <= 10]
df = (
    history
    .groupby('user_id')['challenge']
    .agg(lambda ids: ' '.join(ids))
    .reset_index()
)


In [80]:
# Attach the targets: the join on user_id repeats a user's history string
# once for every label row of that user.
df = pd.merge(df, label, on='user_id')
df.head()

Unnamed: 0,user_id,challenge,label
0,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI24958
1,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI23667
2,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI23691
3,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...,CI24915
4,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...,CI25727


In [81]:
# Shuffle the rows, then hold out 0.1% of them as the validation set that
# drives early stopping.
shuffled = df.sample(frac=1, random_state=42)
df_train, df_validation = train_test_split(shuffled, test_size=0.001, random_state=42)

In [82]:
# Encode challenges
# The encoder is fitted on the challenge_ID column of the metadata table, so
# class indices follow that catalogue. transform() raises ValueError for a
# label that is missing from it.
encoder = LabelEncoder()
encoder.fit(challenges['challenge_ID'])
# assign() returns new frames; writing a column directly into the objects
# returned by train_test_split can raise pandas' SettingWithCopyWarning.
df_train = df_train.assign(id_encoded=encoder.transform(df_train['label']))
df_validation = df_validation.assign(id_encoded=encoder.transform(df_validation['label']))

In [83]:
# Tokenize text
# The vocabulary is learned from the training histories only; the same
# fitted tokenizer is reused for the validation and test histories.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['challenge'])
df_train.head()

Unnamed: 0,user_id,challenge,label,id_encoded
156816,86823,CI23922 CI24230 CI24917 CI23933 CI23769 CI2495...,CI26076,2598
167,4675,CI23663 CI23933 CI24915 CI24958 CI24944 CI2605...,CI24527,1049
7579,8561,CI27785 CI27786 CI27788 CI27789 CI27790 CI2779...,CI27797,4319
203862,111361,CI26159 CI26210 CI26248 CI26216 CI26217 CI2621...,CI26212,2734
190895,104618,CI26164 CI26904 CI26926 CI26930 CI26931 CI2692...,CI26856,3378


In [84]:
# Constants
# Tokenizer word indices start at 1 (0 is what pad_sequences fills with), so
# the largest index equals len(word_index). Embedding's input_dim must be
# "largest index + 1"; without the + 1 the highest-indexed token has no row
# in the embedding table.
NB_WORDS = len(tokenizer.word_index) + 1
# Length every input sequence is padded / truncated to.
MAX_SEQUENCE_LENGTH = 10
# Width of the softmax output: one unit per row of the challenge table.
N_CATEGORIES = challenges.shape[0]

In [85]:
# Turn each history string into a list of vocabulary indices
sequences_train, sequences_validation = (
    tokenizer.texts_to_sequences(frame['challenge'])
    for frame in (df_train, df_validation)
)

In [86]:
# Bring every sequence to MAX_SEQUENCE_LENGTH, padding and truncating at the end
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
x_train = pad_sequences(sequences_train, **pad_options)
x_validation = pad_sequences(sequences_validation, **pad_options)
x_train[0]

array([179,  58,   2,   6,  13,   5,  59, 310, 377, 781], dtype=int32)

In [87]:
# Integer class targets taken from the id_encoded column
y_train = df_train.id_encoded.values
y_validation = df_validation.id_encoded.values

In [88]:
# NN architecture
def get_model(path='', lr=0.001):
    """Build and compile the next-challenge classifier.

    Layer stack: Embedding(512) -> BatchNormalization -> bidirectional
    LSTM(512) -> Dropout(0.4) -> softmax over N_CATEGORIES.

    Args:
        path: weights file to load before compiling; the default ''
            skips loading.
        lr: learning rate handed to the Adam optimizer.

    Returns:
        The compiled keras Model (sparse categorical cross-entropy loss,
        accuracy metric).
    """
    tokens = Input(shape=(MAX_SEQUENCE_LENGTH, ))
    embedded = Embedding(NB_WORDS, 512)(tokens)
    normalized = BatchNormalization()(embedded)
    encoded = Bidirectional(LSTM(512, dropout=0.1, recurrent_dropout=0.1))(normalized)
    regularized = Dropout(0.4)(encoded)
    probabilities = Dense(N_CATEGORIES, activation="softmax")(regularized)

    model = Model(inputs=tokens, outputs=probabilities)
    if path != '':
        model.load_weights(path)

    optimizer = Adam(lr=lr)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [89]:
# Initialize the model
# No path is passed, so get_model skips load_weights and the network keeps
# its freshly initialized weights.
model = get_model()

In [90]:
# Model callbacks: stop after 4 epochs without a val_loss improvement, and
# write weights to disk only when val_loss improves.
path = 'best_model_weights'
weights_file = '{}.hdf5'.format(path)
es_callback = EarlyStopping(monitor="val_loss", patience=4)
mc_callback = ModelCheckpoint(
    weights_file,
    monitor='val_loss',
    verbose=2,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
    period=1,
)
callbacks = [es_callback, mc_callback]

In [91]:
# Fit the model; the EarlyStopping entry in `callbacks` may end training
# before the 100-epoch cap is reached.
fit_options = dict(
    epochs=100,
    batch_size=1024,
    validation_data=(x_validation, y_validation),
    callbacks=callbacks,
)
model.fit(x_train, y_train, **fit_options)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 208387 samples, validate on 209 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 5.52185, saving model to best_model_weights.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 5.52185 to 4.49079, saving model to best_model_weights.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 4.49079 to 4.11539, saving model to best_model_weights.hdf5
Epoch 4/100

Epoch 00004: val_loss did not improve from 4.11539
Epoch 5/100

Epoch 00005: val_loss did not improve from 4.11539
Epoch 6/100

Epoch 00006: val_loss did not improve from 4.11539
Epoch 7/100

Epoch 00007: val_loss did not improve from 4.11539


<keras.callbacks.callbacks.History at 0x7f0239edc940>

In [92]:
# Load best weights
# Rebuilds the network and loads the '<path>.hdf5' checkpoint file, replacing
# the in-memory model left over from the last training epoch.
model = get_model('{}.hdf5'.format(path))

In [93]:
# Test preprocessing
def padding(text):
    """Convert texts to index sequences with the fitted tokenizer, then
    pad / truncate them at the end to MAX_SEQUENCE_LENGTH."""
    token_ids = tokenizer.texts_to_sequences(text)
    return pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
# Same history construction as for training: one space-separated string per
# test user built from challenges at positions <= 10.
first_ten = test.loc[test['challenge_sequence'] <= 10]
test_text = (
    first_ten
    .groupby('user_id')['challenge']
    .agg(lambda ids: ' '.join(ids))
    .reset_index()
)
x_test = padding(test_text['challenge'])
x_test[0]

array([  1,   6,   2,  14,   3,   4,   5,   8, 100,   7], dtype=int32)

In [94]:
# Predict class probabilities for every row of x_test (the top 3 are picked
# from these scores afterwards)
pred = model.predict(x_test, batch_size=2048)
pred[0, :22]

array([2.6344855e-08, 9.2485898e-05, 2.8877943e-07, 3.7092707e-07,
       7.8495354e-08, 3.2324689e-08, 4.8623592e-06, 1.6414221e-07,
       2.2800767e-08, 1.4083026e-08, 2.1385722e-08, 1.4037237e-06,
       8.6258801e-08, 6.9572308e-09, 4.3195691e-08, 1.5165262e-05,
       1.3501943e-07, 1.4278578e-08, 2.1072972e-06, 1.7688754e-08,
       8.6547861e-06, 5.0593240e-08], dtype=float32)

In [95]:
# Keep the indices of the 3 highest-scoring classes per row, best first.
# NOTE: `pred` is rebound from scores to indices here, so re-running only
# this cell would argsort the indices instead of the probabilities.
top3_ascending = np.argsort(pred, axis=1)[:, -3:]
pred = top3_ascending[:, ::-1]
pred[:12]

array([[1475,  370,  497],
       [1052, 1648,  358],
       [3472, 3471, 3473],
       [1480,  213, 1049],
       [1480, 1664,  185],
       [ 370, 1480, 1439],
       [ 370,  660,  213],
       [2689, 2685, 2687],
       [1052, 1475, 1466],
       [2685, 2687, 2691],
       [2689, 2685, 2690],
       [1439, 1480,  213]])

In [96]:
# Write Predictions
# One frame per predicted rank: rank i (0-based) is written as sequence
# position i + 11, giving keys "<user_id>_11" .. "<user_id>_13".
df_list = []
for i in range(3):
    # assign() returns a new frame; adding columns directly to the
    # test_text[['user_id']] slice triggers pandas' SettingWithCopyWarning.
    test_11 = test_text[['user_id']].assign(
        user_sequence=test_text['user_id'].astype(str) + '_' + str(i + 11),
        # Map predicted class indices back to challenge IDs.
        challenge=encoder.inverse_transform(pred[:, i]),
    )
    df_list.append(test_11[['user_sequence', 'challenge']])
pd.concat(df_list).to_csv('RNN_submission.csv', index=False)


In [97]:
# Read the submission back through the same relative path that to_csv used,
# so this sanity check does not depend on the working directory being
# /content.
df5 = pd.read_csv('RNN_submission.csv')
df5

Unnamed: 0,user_sequence,challenge
0,4577_11,CI24953
1,4578_11,CI24530
2,4579_11,CI26950
3,4583_11,CI24958
4,4584_11,CI24958
...,...,...
119191,113829_13,CI24362
119192,113830_13,CI24866
119193,113831_13,CI24931
119194,113834_13,CI25139
