In [1]:
# import the required libraries

from collections import Counter
from scipy import sparse
import numpy as np
import pandas as pd
import pickle

# Load the long-format interaction logs for train and test.
# NOTE(review): both paths are absolute and point under /content/drive, so the
# notebook only runs where that folder is available (presumably a mounted
# Google Drive in Colab — confirm).
train_csv_path = '/content/drive/My Drive/ml/recommendation/train/train.csv'
test_csv_path = '/content/drive/My Drive/ml/recommendation/test/test.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# Quick look at the first rows of the training data.
train.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [2]:
# Reshape train from long format (one row per user/step) to wide format:
# one row per user_id and one column per challenge_sequence position, with the
# challenge code as the cell value.
#
# "first" is used as the aggregator instead of an identity lambda. An identity
# function is not an aggregation: it only works while every
# (user_id, challenge_sequence) pair occurs exactly once, and would otherwise
# put whole arrays into cells. With unique pairs the result is the same.
wide_train = (
    train
    .pivot_table(
        index="user_id",
        columns="challenge_sequence",
        values="challenge",
        aggfunc="first",
    )
    .reset_index()
)
wide_train.head(20)

challenge_sequence,user_id,1,2,3,4,5,6,7,8,9,10,11,12,13
0,4576,CI23714,CI23855,CI24917,CI23663,CI23933,CI25135,CI23975,CI25126,CI24915,CI24957,CI24958,CI23667,CI23691
1,4580,CI23663,CI23855,CI23933,CI23975,CI24530,CI23714,CI23648,CI23781,CI23667,CI25135,CI24915,CI25727,CI26051
2,4581,CI26155,CI26156,CI26157,CI26158,CI26159,CI26160,CI26161,CI26162,CI26164,CI26165,CI26163,CI26166,CI26167
3,4582,CI23855,CI24915,CI24917,CI23933,CI23663,CI24958,CI23975,CI23714,CI24953,CI24944,CI25135,CI26051,CI24957
4,4585,CI23855,CI23975,CI24917,CI25135,CI23848,CI23714,CI23663,CI23933,CI24958,CI24915,CI24530,CI24187,CI25126
5,4587,CI23933,CI25727,CI26051,CI25125,CI25124,CI25633,CI23663,CI26050,CI23667,CI24915,CI24031,CI23855,CI28240
6,4590,CI23848,CI23855,CI23975,CI25135,CI23929,CI23714,CI23913,CI23663,CI25298,CI24917,CI23691,CI25733,CI25142
7,4591,CI23855,CI23933,CI24530,CI23663,CI23714,CI24534,CI24915,CI24917,CI25135,CI24527,CI24958,CI24261,CI23648
8,4592,CI23855,CI24917,CI25135,CI23848,CI23714,CI23975,CI23663,CI25142,CI23913,CI25126,CI23769,CI24958,CI24187
9,4593,CI26155,CI26157,CI26158,CI26159,CI26160,CI26161,CI26162,CI26164,CI26165,CI26163,CI26166,CI26167,CI26168


In [3]:
# Drop the user_id column so only the challenge columns remain.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer fails because user_id is already gone.
wide_train = wide_train.drop(columns=["user_id"], errors="ignore")

# Turn each user's row into one space-separated "sentence" of challenge codes.
# Missing positions are skipped so they are not written out as a literal
# "nan" token, which would otherwise become a vocabulary entry.
rows = [
    " ".join(sequence.dropna().map(str))
    for _, sequence in wide_train.iterrows()
]

# Reshape test to wide format: one row per user_id, one column per
# challenge_sequence position.
# "first" replaces the identity lambda as aggregator: it gives the same result
# when each (user_id, challenge_sequence) pair is unique and stays well-defined
# if a pair is ever duplicated.
wide_test = (
    test
    .pivot_table(
        index="user_id",
        columns="challenge_sequence",
        values="challenge",
        aggfunc="first",
    )
    .reset_index()
)

wide_test.shape

(39732, 11)

In [4]:
# Keep the user ids (in row order) before removing the column from the frame.
test_ids = wide_test['user_id']
wide_test = wide_test.drop(columns=["user_id"])

# Append the test users' sequences to the corpus rows built from train.
# Missing positions are skipped so they do not turn into a literal "nan" token.
# NOTE: this cell is not re-runnable on its own — a second run would fail on
# the missing user_id column; re-run the cell that builds wide_test first.
rows.extend(
    " ".join(sequence.dropna().map(str))
    for _, sequence in wide_test.iterrows()
)

len(rows)

109264

In [5]:
# Write the corpus to disk: one user sequence per line.
# `with` guarantees the handle is flushed and closed even if a write fails.
with open("corpus.txt", "w") as thefile:
    for element in rows:
        thefile.write("%s\n" % element)

# Count how often each challenge code occurs in the corpus.
# The read handle is closed on leaving the block (it was previously left open).
vocab = Counter()
with open("corpus.txt", "r") as corpus:
    for line in corpus:
        tokens = line.strip().split()
        vocab.update(tokens)

# Re-key the counts as: challenge code -> (integer id, frequency).
# Ids are assigned in the Counter's iteration order.
vocab = {word: (i, freq) for i, (word, freq) in enumerate(vocab.items())}
vocab

{'CI23714': (0, 12458),
 'CI23855': (1, 15184),
 'CI24917': (2, 12372),
 'CI23663': (3, 12077),
 'CI23933': (4, 11060),
 'CI25135': (5, 10482),
 'CI23975': (6, 7075),
 'CI25126': (7, 8550),
 'CI24915': (8, 7845),
 'CI24957': (9, 4873),
 'CI24958': (10, 11333),
 'CI23667': (11, 4723),
 'CI23691': (12, 9532),
 'CI24530': (13, 10166),
 'CI23648': (14, 6816),
 'CI23781': (15, 2984),
 'CI25727': (16, 2726),
 'CI26051': (17, 5030),
 'CI26155': (18, 1470),
 'CI26156': (19, 1218),
 'CI26157': (20, 1350),
 'CI26158': (21, 1283),
 'CI26159': (22, 1289),
 'CI26160': (23, 1133),
 'CI26161': (24, 1091),
 'CI26162': (25, 1459),
 'CI26164': (26, 1744),
 'CI26165': (27, 1537),
 'CI26163': (28, 949),
 'CI26166': (29, 1329),
 'CI26167': (30, 1673),
 'CI24953': (31, 5385),
 'CI24944': (32, 2525),
 'CI23848': (33, 9090),
 'CI24187': (34, 6697),
 'CI25125': (35, 4526),
 'CI25124': (36, 6018),
 'CI25633': (37, 620),
 'CI26050': (38, 3021),
 'CI24031': (39, 3708),
 'CI28240': (40, 232),
 'CI23929': (41, 2033

In [6]:
# NOTE(review): despite its name, id2word maps challenge code -> integer
# position (the original comprehension unpacked enumerate() in swapped order).
# The orientation is deliberately kept unchanged because this object is pickled
# and consumed elsewhere in the notebook; renaming/inverting it must be done
# together with those consumers.
id2word = {word: position for position, word in enumerate(vocab)}

vocab_size = len(vocab)
print(vocab_size)

# Square sparse matrix of distance-weighted co-occurrence counts, indexed by
# the integer ids stored in vocab.
cooccurrences = sparse.lil_matrix((vocab_size, vocab_size), dtype=np.float64)

# Number of preceding tokens considered as context for each token.
window_size = 10

# `with` closes the corpus file once the pass is finished (it was left open).
with open("corpus.txt", "r") as corpus:
    for line in corpus:
        tokens = line.strip().split()
        token_ids = [vocab[word][0] for word in tokens]

        for center_i, center_id in enumerate(token_ids):
            # Up to window_size tokens immediately to the left of the center.
            context_ids = token_ids[max(0, center_i - window_size) : center_i]
            contexts_len = len(context_ids)

            for left_i, left_id in enumerate(context_ids):
                # Distance in tokens between the context token and the center;
                # closer tokens contribute more (weight 1/distance).
                distance = contexts_len - left_i
                increment = 1.0 / float(distance)

                # Build co-occurrence matrix symmetrically
                cooccurrences[center_id, left_id] += increment
                cooccurrences[left_id, center_id] += increment

# Optional frequency threshold used when densifying the matrix; None disables
# filtering (a value such as 20 was tried previously).
min_count = None
print(min_count)

5502
None


In [7]:
# Densify the sparse co-occurrence matrix, optionally zeroing out rare
# challenges.
#
# The previous min_count filter could not work: it looked ids up in id2word
# (which is keyed by challenge code, so an integer key raises KeyError) and
# compared the vocab *id* (element 0) rather than the *frequency* (element 1)
# against min_count. It only went unnoticed because min_count is None.

# Frequency of every challenge, addressed by its integer id.
freq_by_id = np.zeros(cooccurrences.shape[0], dtype=np.int64)
for word_id, word_freq in vocab.values():
    freq_by_id[word_id] = word_freq

co_matrix = cooccurrences.toarray()

if min_count is not None:
    # Rows and columns of challenges seen fewer than min_count times stay zero.
    rare = freq_by_id < min_count
    co_matrix[rare, :] = 0.0
    co_matrix[:, rare] = 0.0

co_matrix

array([[   0.        , 1218.48412698,  952.68690476, ...,    0.        ,
           0.        ,    0.        ],
       [1218.48412698,    0.        , 1221.0265873 , ...,    0.        ,
           0.        ,    0.        ],
       [ 952.68690476, 1221.0265873 ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       ...,
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ]])

In [8]:
# Save the id2word mapping to a pickle file.
# `with` closes the file even if pickling raises.
pickle_path = "./vocab_mapping.pkl"
with open(pickle_path, "wb") as pickle_mapping:
    pickle.dump(id2word, pickle_mapping)

# Wrap the dense co-occurrence matrix in a DataFrame (integer row/column
# labels at this point).
co_occurence_dataframe = pd.DataFrame(co_matrix)
co_occurence_dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,5462,5463,5464,5465,5466,5467,5468,5469,5470,5471,5472,5473,5474,5475,5476,5477,5478,5479,5480,5481,5482,5483,5484,5485,5486,5487,5488,5489,5490,5491,5492,5493,5494,5495,5496,5497,5498,5499,5500,5501
0,0.0,1218.484127,952.686905,1065.586905,985.600397,800.447222,690.27619,986.594841,522.625,260.590476,744.533333,394.339683,787.655556,877.035317,624.595238,273.655952,230.680952,487.644444,15.527381,6.20119,6.792857,3.242857,2.97619,5.756746,2.291667,1.0,0.0,9.265079,0.25,1.727381,0.792857,401.25754,124.84881,703.580556,514.392063,258.85754,400.038492,23.013095,313.365873,231.512698,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1218.484127,0.0,1221.026587,1252.965476,1092.545238,968.419048,926.743651,641.663095,721.213492,335.470635,803.639286,350.634127,658.582937,847.08254,472.274206,241.72619,273.223016,443.536111,14.669444,2.658333,3.861111,2.504762,1.267857,6.7,1.144444,1.486111,1.2,8.73373,1.0,0.342857,0.658333,566.789683,267.097222,810.550397,530.228175,328.387302,417.200794,50.606349,296.967063,251.142857,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,952.686905,1221.026587,0.0,847.772222,969.061508,647.253571,509.603175,471.850397,797.802381,520.581746,967.759921,235.099206,455.787698,461.848016,358.738492,154.481349,195.853175,349.623016,70.642857,34.486508,40.936905,40.346429,24.45754,42.378175,22.921825,18.973016,39.631349,47.991667,11.981349,24.329762,22.281746,429.269444,206.519444,431.660317,357.785317,264.900397,316.435317,47.088492,243.286508,158.762698,...,0.361111,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.5,0.0,0.0,0.0,0.0
3,1065.586905,1252.965476,847.772222,0.0,976.407143,746.009921,705.35119,529.753571,646.126984,274.274603,740.08373,444.856746,762.122619,861.539286,546.593651,330.334524,290.338889,490.774603,6.131746,4.936111,3.227778,1.95,0.555556,2.416667,2.361111,0.25,0.5,5.730159,0.0,0.0,1.72619,434.575794,209.408333,774.489683,489.319048,308.69881,395.500794,59.05754,309.876587,312.225397,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,985.600397,1092.545238,969.061508,976.407143,0.0,538.253175,487.259127,520.790476,588.089683,348.027778,741.419444,414.986508,562.581349,824.380159,622.425,259.582937,439.70754,532.790476,8.336905,3.707937,3.016667,3.47619,1.575,1.217857,1.833333,1.333333,0.458333,3.358333,0.0,0.5,0.7,296.609524,176.876587,512.162698,305.437302,303.196429,418.156746,96.243254,299.257143,286.347222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:

# Integer id -> challenge code, taken directly from vocab
# (challenge code -> (id, frequency)). Deriving it from vocab avoids relying on
# the orientation of the id2word mapping.
res = {word_id: word for word, (word_id, _) in vocab.items()}

# Label both axes of the matrix with challenge codes.
co_occurence_dataframe = co_occurence_dataframe.rename(index=res, columns=res)

# NOTE(review): the filename says "window_size_1" although the matrix is built
# with window_size = 10; the name is left unchanged in case other code reads it.
# index=False means the row labels are not written; only the header carries
# the challenge codes.
co_occurence_dataframe.to_csv("co_matrix_with_window_size_1.csv", index = False)
co_occurence_dataframe.head()

Unnamed: 0,CI23714,CI23855,CI24917,CI23663,CI23933,CI25135,CI23975,CI25126,CI24915,CI24957,CI24958,CI23667,CI23691,CI24530,CI23648,CI23781,CI25727,CI26051,CI26155,CI26156,CI26157,CI26158,CI26159,CI26160,CI26161,CI26162,CI26164,CI26165,CI26163,CI26166,CI26167,CI24953,CI24944,CI23848,CI24187,CI25125,CI25124,CI25633,CI26050,CI24031,...,CI27333,CI27291,CI27828,CI28773,CI24366,CI28886,CI29038,CI27815,CI27295,CI26316,CI27297,CI28994,CI26789,CI24327,CI24706,CI24472,CI27324,CI26003,CI26383,CI23794,CI27327,CI27359,CI27430,CI27329,CI27332,CI27360,CI27694,CI27084,CI28430,CI27325,CI27326,CI29005,CI25760,CI28335,CI25962,CI25968,CI27314,CI27334,CI25342,CI28218
CI23714,0.0,1218.484127,952.686905,1065.586905,985.600397,800.447222,690.27619,986.594841,522.625,260.590476,744.533333,394.339683,787.655556,877.035317,624.595238,273.655952,230.680952,487.644444,15.527381,6.20119,6.792857,3.242857,2.97619,5.756746,2.291667,1.0,0.0,9.265079,0.25,1.727381,0.792857,401.25754,124.84881,703.580556,514.392063,258.85754,400.038492,23.013095,313.365873,231.512698,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI23855,1218.484127,0.0,1221.026587,1252.965476,1092.545238,968.419048,926.743651,641.663095,721.213492,335.470635,803.639286,350.634127,658.582937,847.08254,472.274206,241.72619,273.223016,443.536111,14.669444,2.658333,3.861111,2.504762,1.267857,6.7,1.144444,1.486111,1.2,8.73373,1.0,0.342857,0.658333,566.789683,267.097222,810.550397,530.228175,328.387302,417.200794,50.606349,296.967063,251.142857,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI24917,952.686905,1221.026587,0.0,847.772222,969.061508,647.253571,509.603175,471.850397,797.802381,520.581746,967.759921,235.099206,455.787698,461.848016,358.738492,154.481349,195.853175,349.623016,70.642857,34.486508,40.936905,40.346429,24.45754,42.378175,22.921825,18.973016,39.631349,47.991667,11.981349,24.329762,22.281746,429.269444,206.519444,431.660317,357.785317,264.900397,316.435317,47.088492,243.286508,158.762698,...,0.361111,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.5,0.0,0.0,0.0,0.0
CI23663,1065.586905,1252.965476,847.772222,0.0,976.407143,746.009921,705.35119,529.753571,646.126984,274.274603,740.08373,444.856746,762.122619,861.539286,546.593651,330.334524,290.338889,490.774603,6.131746,4.936111,3.227778,1.95,0.555556,2.416667,2.361111,0.25,0.5,5.730159,0.0,0.0,1.72619,434.575794,209.408333,774.489683,489.319048,308.69881,395.500794,59.05754,309.876587,312.225397,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CI23933,985.600397,1092.545238,969.061508,976.407143,0.0,538.253175,487.259127,520.790476,588.089683,348.027778,741.419444,414.986508,562.581349,824.380159,622.425,259.582937,439.70754,532.790476,8.336905,3.707937,3.016667,3.47619,1.575,1.217857,1.833333,1.333333,0.458333,3.358333,0.0,0.5,0.7,296.609524,176.876587,512.162698,305.437302,303.196429,418.156746,96.243254,299.257143,286.347222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Preview the wide test frame; columns are the challenge_sequence positions.
wide_test.head()

challenge_sequence,1,2,3,4,5,6,7,8,9,10
0,CI23855,CI23933,CI24917,CI24915,CI23714,CI23663,CI24958,CI25135,CI25727,CI24530
1,CI23663,CI23855,CI24917,CI23933,CI23975,CI23714,CI25135,CI24915,CI24958,CI23781
2,CI26939,CI26940,CI26941,CI26942,CI26943,CI26944,CI26945,CI26947,CI26948,CI26954
3,CI23663,CI23855,CI23975,CI23714,CI23848,CI23933,CI25135,CI23781,CI24530,CI23667
4,CI23855,CI23975,CI25135,CI23848,CI23714,CI24917,CI23929,CI25733,CI25126,CI23913


In [11]:
# (number of test users, number of challenge positions per user)
wide_test.shape

(39732, 10)

In [12]:

# Greedy chain prediction: starting from each user's 10th challenge, hop three
# times to the challenge with the highest co-occurrence score and record the
# three challenges visited.
#
# Changes from the previous version: the row count is taken from wide_test
# instead of a hard-coded 39732, the list is no longer mutated while being
# iterated, and the per-column argmax is computed once for the whole matrix
# instead of once per user and hop.
# NOTE(review): final_predictions is not used by the cells visible here.
CHAIN_LENGTH = 3

# For every challenge (column), the row label holding its largest score.
strongest_neighbour = co_occurence_dataframe.idxmax()

final_predictions = []
for stimulus in wide_test[10]:
    chain = []
    for _ in range(CHAIN_LENGTH):
        stimulus = strongest_neighbour[stimulus]
        chain.append(stimulus)
    final_predictions.append(chain)

# Predict from the co-occurrence matrix using the 10th challenge only: for each
# user, take the three challenges with the highest co-occurrence score.
#
# The top-3 list is computed once per distinct 10th challenge and reused,
# rather than recomputed for every user; the number of users comes from
# wide_test itself instead of a hard-coded 39732.
# NOTE(review): nothing here excludes challenges the user has already solved
# (positions 1-10), so a prediction can repeat one of them — confirm whether
# that is acceptable for the task.
last_challenges = wide_test[10]

top3_by_challenge = {
    challenge: list(co_occurence_dataframe[challenge].nlargest(3).index)
    for challenge in last_challenges.unique()
}

# Fresh list per user so rows do not share the same list object.
final_predictions_new = [list(top3_by_challenge[challenge]) for challenge in last_challenges]

# One row per test user: columns 0, 1, 2 hold the predictions, plus user_id.
largest_3 = pd.DataFrame(final_predictions_new)
largest_3['user_id'] = test_ids
largest_3.head()

Unnamed: 0,0,1,2,user_id
0,CI23691,CI23714,CI23663,4577
1,CI23663,CI23714,CI23933,4578
2,CI26953,CI26955,CI26951,4579
3,CI23648,CI23663,CI23933,4583
4,CI23714,CI23855,CI25142,4584


In [13]:
# Unpivot the predictions: one row per (user_id, prediction slot), with the
# slot number in "sequence" and the predicted code in "challenge".
largest_3_long = largest_3.melt(
    id_vars="user_id",
    var_name="sequence",
    value_name="challenge",
)
largest_3_long.head()

Unnamed: 0,user_id,sequence,challenge
0,4577,0,CI23691
1,4578,0,CI23663
2,4579,0,CI26953
3,4583,0,CI23648
4,4584,0,CI23714


In [14]:

# Translate the prediction slot (0, 1, 2) into the challenge_sequence position
# it stands for (11, 12, 13).
# Values that are not a known slot are passed through unchanged, so re-running
# this cell leaves the already-translated labels intact instead of turning
# them into NaN (which would produce "<user_id>_nan" keys).
sequence_labels = {0: '11', 1: '12', 2: "13"}
largest_3_long['sequence'] = largest_3_long['sequence'].map(
    lambda position: sequence_labels.get(position, position)
)

# Submission key in the form "<user_id>_<sequence>".
largest_3_long['user_sequence'] = largest_3_long['user_id'].map(str)+"_"+largest_3_long['sequence'].map(str)

In [15]:
# Write the submission file: user_sequence and predicted challenge, without the index column.
largest_3_long[['user_sequence','challenge']].to_csv("submission.csv", index = False)

In [16]:
# Read the submission back as a sanity check.
# The path is the same relative path the file is written to, so the check does
# not depend on the working directory being /content.
df5 = pd.read_csv("submission.csv")
df5

Unnamed: 0,user_sequence,challenge
0,4577_11,CI23691
1,4578_11,CI23663
2,4579_11,CI26953
3,4583_11,CI23648
4,4584_11,CI23714
...,...,...
119191,113829_13,CI24421
119192,113830_13,CI23691
119193,113831_13,CI24968
119194,113834_13,CI23855
