# The Approach

One way to approach this problem is to treat it as a sequence prediction problem for NLP, where you need to predict the next word given the previous words.

Now there may be many ways to predict using this hypothesis, for example by calculating conditional probabilities.

Here I have used a simple method, a co-occurrence matrix, which will essentially be used to predict the next challenge.

I will be using both train and test data to calculate the co-occurrence matrix.

In [1]:
# import the required libraries

from collections import Counter
from scipy import sparse
import numpy as np
import pandas as pd

In [2]:
# reading train and test file (paths are relative to the notebook's working directory)

train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [3]:
# preview the first rows of the training data (long format: one row per user and step)
train.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


### Creating dataset in the required form for Co-occurrence matrix

In [4]:
# convert the train from the long format (one row per user and step) to the
# wide format (one row per user, one column per challenge_sequence step)

# aggfunc="first" picks the challenge stored in each (user_id, challenge_sequence)
# cell. An identity lambda is not a reducer: it only works while every group
# holds exactly one row and fails as soon as a pair is duplicated.
wide_train = train.pivot_table(
    index="user_id",
    columns="challenge_sequence",
    values="challenge",
    aggfunc="first",
).reset_index()

In [5]:
# user_id is not needed for the co-occurrence matrix, so remove that column.
# The drop happens in place: running this cell a second time raises KeyError
# because the column is already gone.

wide_train.drop(columns="user_id", inplace=True)

In [6]:
# convert each user's row into one space-separated string of challenge ids

# Missing steps (NaN) are dropped before joining; otherwise str(NaN) would put
# the literal token "nan" into the corpus and into the vocabulary built from it.
rows = [
    " ".join(str(challenge) for challenge in row.dropna())
    for _, row in wide_train.iterrows()
]

In [7]:
# converting test from long format to wide format
# (one row per user, one column per challenge_sequence step)

# aggfunc="first" picks the challenge stored in each (user_id, challenge_sequence)
# cell. An identity lambda is not a reducer: it only works while every group
# holds exactly one row and fails as soon as a pair is duplicated.
wide_test = test.pivot_table(
    index="user_id",
    columns="challenge_sequence",
    values="challenge",
    aggfunc="first",
).reset_index()

In [8]:
# saving test user_id for future use (it is attached to the predictions later);
# this has to run before user_id is dropped from wide_test

test_ids = wide_test['user_id']

In [9]:
# remove the user_id column from the wide test frame so only challenge columns remain.
# The drop happens in place: running this cell a second time raises KeyError
# because the column is already gone.

wide_test.drop(columns="user_id", inplace=True)

In [10]:
# appending the test strings to the train strings (vertically)
# NOTE: this extends `rows` in place, so re-running the cell appends the test
# rows a second time.

# Missing steps (NaN) are dropped before joining; otherwise str(NaN) would put
# the literal token "nan" into the corpus and into the vocabulary built from it.
rows.extend(
    " ".join(str(challenge) for challenge in row.dropna())
    for _, row in wide_test.iterrows()
)

In [11]:
# creating a corpus: one line per user, challenge ids separated by spaces

# The `with` block closes the file even if a write fails, instead of relying
# on a separate cell to call close().
with open("corpus.txt", "w") as thefile:
    thefile.writelines("%s\n" % element for element in rows)

### Creating co-occurrence matrix from the corpus

In [14]:
# reading the corpus and counting how often each challenge occurs:
# vocab is a Counter with key = challenge_name and value = frequency

vocab = Counter()

# The `with` block closes the corpus file once every line has been counted.
with open("corpus.txt", "r") as corpus:
    for line in corpus:
        # split() with no arguments also discards the trailing newline
        tokens = line.split()
        vocab.update(tokens)

In [17]:
# turn the frequency counter into challenge_id -> (integer index, frequency);
# the index is the challenge's position in the counter's iteration order.
# NOTE: `vocab` is rebound here, so this cell must run exactly once after the
# counting cell; a second run would store the (index, frequency) tuple as the
# "frequency".

challenge_counts = vocab
vocab = {
    challenge: (challenge_index, challenge_counts[challenge])
    for challenge_index, challenge in enumerate(challenge_counts)
}

In [18]:
# creating the lookup that is later inverted to decode the predictions made.
# NOTE: despite its name, id2word maps challenge_id -> integer position (not
# integer -> challenge_id). On insertion-ordered dicts (Python 3.7+) the position
# equals the index stored in vocab, because both come from enumerating the same
# key order.

id2word = {challenge: position for position, challenge in enumerate(vocab)}

In [19]:
# number of distinct challenges; used as the side length of the square co-occurrence matrix
vocab_size = len(vocab)

In [20]:
# allocate an empty square (vocab_size x vocab_size) sparse matrix of float64
# in LIL format; it is filled entry by entry further down

matrix_shape = (vocab_size, vocab_size)
cooccurrences = sparse.lil_matrix(matrix_shape, dtype=np.float64)

In [21]:
# context window size: how many challenges immediately before the center
# challenge are counted as its context

window_size = 3

In [22]:
# This is where the actual magic is.

# Tuneable parameters : window_size, and the distance weighting (1 / distance)

# NOTE: counts are accumulated with +=, so re-running this cell without
# re-creating `cooccurrences` adds every count a second time.

# The `with` block closes the corpus file once the matrix is built.
with open("corpus.txt", "r") as corpus:
    for line in corpus:
        # split() with no arguments also discards the trailing newline
        tokens = line.split()
        token_ids = [vocab[word][0] for word in tokens]

        for center_i, center_id in enumerate(token_ids):
            # Collect all word IDs in left window of center word
            context_ids = token_ids[max(0, center_i - window_size):center_i]
            contexts_len = len(context_ids)

            for left_i, left_id in enumerate(context_ids):
                # Distance from center word: 1 for the immediate predecessor,
                # up to window_size for the farthest context word
                distance = contexts_len - left_i

                # Weight by inverse of distance between words
                increment = 1.0 / float(distance)

                # Build co-occurrence matrix symmetrically (pretend we
                # are calculating right contexts as well)
                cooccurrences[center_id, left_id] += increment
                cooccurrences[left_id, center_id] += increment

In [24]:
# Optional threshold used when the dense matrix is filled below: when set to a
# number it is intended to exclude challenges whose frequency is below this
# value; None disables the filter entirely.

min_count = None

In [25]:
# filling the values in a dense matrix form

co_matrix = np.zeros([len(id2word), len(id2word)])

# Frequency of every challenge keyed by its integer index. vocab maps
# challenge -> (index, frequency), so the frequency is element [1]; the
# min_count filter has to compare against that, not against the index.
# Looking the challenge up through id2word[i] is avoided because id2word is
# keyed by challenge string, so an integer key would raise KeyError.
freq_by_id = {idx: freq for idx, freq in vocab.values()}

# cooccurrences.rows[i] lists the non-zero column indices of row i and
# cooccurrences.data[i] the matching values (LIL storage).
for i, (row, data) in enumerate(zip(cooccurrences.rows, cooccurrences.data)):
    if min_count is not None and freq_by_id[i] < min_count:
        continue

    for j, value in zip(row, data):
        if min_count is not None and freq_by_id[j] < min_count:
            continue

        co_matrix[i, j] = value

In [26]:
# have a peek at the co-occurrence matrix (NumPy abbreviates the printout of large arrays)

co_matrix

array([[  0.        , 902.5       , 754.16666667, ...,   0.        ,
          0.        ,   0.        ],
       [902.5       ,   0.        , 962.66666667, ...,   0.        ,
          0.        ,   0.        ],
       [754.16666667, 962.66666667,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ]])

In [27]:
# saving the mapping to disk as a pickled dictionary
# NOTE: as built in this notebook, id2word is keyed by challenge id and holds
# the integer position as value.

import pickle

pickle_path = "./vocab_mapping.pkl"

# The `with` block closes the file even if pickling fails.
with open(pickle_path, "wb") as pickle_mapping:
    pickle.dump(id2word, pickle_mapping)

In [28]:
# wrapping the co-occurrence matrix in a DataFrame; rows and columns get the
# default integer labels here and are renamed to challenge ids in the next cells

co_occurence_dataframe = pd.DataFrame(co_matrix)

In [29]:
# invert id2word (challenge id -> integer position) into
# integer position -> challenge id, used to label the matrix axes
res = {position: challenge for challenge, position in id2word.items()}

In [30]:
# relabel the integer column labels with the labels given by `res`
co_occurence_dataframe = co_occurence_dataframe.rename(res, axis="columns")

In [31]:
# relabel the integer row labels with the labels given by `res`
co_occurence_dataframe = co_occurence_dataframe.rename(res, axis="index")

In [32]:
# writing the labelled matrix to CSV; index = False means the row labels are
# not written, so only the header row carries the challenge ids
co_occurence_dataframe.to_csv("co_matrix_with_window_size_3.csv", index = False)

In [33]:
# preview the wide test frame: one row per user, one column per challenge step
wide_test.head()

challenge_sequence,1,2,3,4,5,6,7,8,9,10
0,CI23855,CI23933,CI24917,CI24915,CI23714,CI23663,CI24958,CI25135,CI25727,CI24530
1,CI23663,CI23855,CI24917,CI23933,CI23975,CI23714,CI25135,CI24915,CI24958,CI23781
2,CI26939,CI26940,CI26941,CI26942,CI26943,CI26944,CI26945,CI26947,CI26948,CI26954
3,CI23663,CI23855,CI23975,CI23714,CI23848,CI23933,CI25135,CI23781,CI24530,CI23667
4,CI23855,CI23975,CI25135,CI23848,CI23714,CI24917,CI23929,CI25733,CI25126,CI23913


In [34]:
# (number of test users, number of challenge steps per user)
wide_test.shape

(39732, 10)

In [35]:
# making predictions with the co-occurrence matrix based on the last attempted/predicted
# challenge: the 10th challenge predicts the 11th, the 11th predicts the 12th, and so on
final_predictions = []

# Iterate over every test user's 10th challenge (column label 10) rather than
# a hard-coded row count, so the cell keeps working if the test set changes size.
for last_challenge in wide_test[10]:
    predictions = []
    stimulus = last_challenge
    for _ in range(3):
        # the challenge with the highest co-occurrence weight for the current
        # stimulus becomes both the prediction and the next stimulus
        stimulus = co_occurence_dataframe[stimulus].idxmax()
        predictions.append(stimulus)

    final_predictions.append(predictions)
    

In [36]:
# making predictions with the co-occurrence matrix based on the 10th challenge only:
# the three challenges with the largest co-occurrence weight are taken as steps 11-13
final_predictions_new = []

# Iterate over every test user's 10th challenge (column label 10) rather than
# a hard-coded row count, so the cell keeps working if the test set changes size.
for stimulus in wide_test[10]:
    top_three = co_occurence_dataframe[stimulus].nlargest(3)
    final_predictions_new.append(list(top_three.index))

In [37]:
# one row per test user (same order as wide_test); columns 0, 1, 2 hold the three predictions
largest_3 = pd.DataFrame(final_predictions_new)

In [38]:
# attach the saved test user ids to the predictions (aligned on the row index)
largest_3 = largest_3.assign(user_id=test_ids)

In [39]:
# reshape to long format: one row per (user, predicted position)
largest_3_long = largest_3.melt(
    id_vars="user_id",
    var_name="sequence",
    value_name="challenge",
)

In [40]:
# translate the prediction column positions 0/1/2 into challenge steps 11/12/13.
# NOTE: the column is overwritten, so a second run maps the already translated
# labels to NaN.
step_labels = {0: '11', 1: '12', 2: "13"}
largest_3_long['sequence'] = largest_3_long['sequence'].map(step_labels)

In [41]:
# build the submission key "<user_id>_<step>", e.g. "4576_11"
user_part = largest_3_long['user_id'].astype(str)
step_part = largest_3_long['sequence'].astype(str)
largest_3_long['user_sequence'] = user_part + "_" + step_part

In [42]:
# write the submission file with the two required columns and no index column
submission = largest_3_long[['user_sequence', 'challenge']]
submission.to_csv("rnn-co-occurence.csv", index=False)

### Try Experimenting