## Библиотеки 

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

## Функции

In [2]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

In [3]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
def my_apk(actual, predicted, k=10):

    if not actual:
        return 0.0
    
    actual_copy = actual.copy()
    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual_copy:
            num_hits += 1.0
            score += num_hits / (i+1.0)
            actual_copy.remove(p)

    return score / len(actual)

def my_mapk(actual, predicted, k=10):
    return np.mean([my_apk(a, p, k) for a, p in zip(actual, predicted)])

## Получение данных

In [4]:
df_train = pd.read_csv('df_train.csv', sep=';')
df_test = pd.read_csv('df_test.csv', sep=';')

In [5]:
df_train['Data'] = df_train.Data.apply(lambda s: list(map(int, s.split(','))))
df_train['Target'] = df_train.Target.apply(lambda s: list(map(int, s.split(','))))
df_test['Data'] = df_test.Data.apply(lambda s: list(map(int, s.split(','))))

## Baseline 1: топ10 MCC-кодов из train-части

MCC-код и соответствующее количество вхождений в train-часть

In [6]:
top10_codes = df_train['Data'].explode().value_counts().head(10)
top10_codes

Data
6011    700677
6010    490602
4814    473396
5411    472408
4829    307388
5499    164719
5541     68224
5912     65071
5331     61833
5812     52029
Name: count, dtype: int64

In [7]:
print(mapk(df_train['Target'], [top10_codes.index]*len(df_train)))
print(my_mapk(df_train['Target'], [top10_codes.index]*len(df_train)))

0.2742435829727881
0.2742435829727881


In [8]:
print(mapk(df_train['Target'], df_train['Target']))
print(my_mapk(df_train['Target'], df_train['Target']))

0.3754930723415012
1.0


In [9]:
print(mapk(df_train['Target'], df_train['Target'].apply(lambda row: list(set(row)))))
print(my_mapk(df_train['Target'], df_train['Target'].apply(lambda row: list(set(row)))))

0.4511446040096687
0.4511446040096687


## Baseline 2: cамые популярные транзакции пользователя.

**Если таких менее 10, то замешиваются топ10 популярных из всей выборки.**

In [10]:
def get_top_codes(transactions, top_n=10, drop_from=5):
    transactions_stats = sorted(
        Counter(transactions).items(), 
        key=lambda x: x[1], 
        reverse=True
    )[:top_n]
    

    top_codes = [mcc_code for (mcc_code, count) in transactions_stats if count >= drop_from]
    top_codes += list(top10_codes.index)

    return top_codes[:10]

In [11]:
df_train['pred_baseline_2'] = df_train['Data'].apply(get_top_codes)

In [12]:
print(mapk(df_train['Target'], df_train['pred_baseline_2']))
print(my_mapk(df_train['Target'], df_train['pred_baseline_2']))

0.3236094127683776
0.3893116295288199


In [13]:
print(mapk(df_train['Target'], df_train['pred_baseline_2']))
print(my_mapk(df_train['Target'], df_train['pred_baseline_2']))

0.3236094127683776
0.3893116295288199


предсказания из второго бейзлайна

In [14]:
# df_test['Predicted'] = df_test['Data'].apply(get_top_codes)

# submission_baseline_2 = df_test[['Id', 'Predicted']]
# submission_baseline_2['Predicted'] = submission_baseline_2['Predicted'].astype(str).str.replace(',', '')
# submission_baseline_2.to_csv('submission_baseline_2.csv', index=False)

In [15]:
# df_test['Predicted'] = (df_train['Data'] + df_train['Target'] + df_test['Data']).apply(get_top_codes)

# submission_baseline_2 = df_test[['Id', 'Predicted']]
# submission_baseline_2['Predicted'] = submission_baseline_2['Predicted'].astype(str).str.replace(',', '')
# submission_baseline_2.to_csv('submission_baseline_all_data.csv', index=False)

## Model 1: коды по вероятностям

In [16]:
train_proba = df_train['Data'].explode().value_counts(normalize=True)
test_proba = df_test['Data'].explode().value_counts(normalize=True)
train_test_proba = (df_train['Data'] + df_test['Data']).explode().value_counts(normalize=True)
train_target_proba = (df_train['Data'] + df_train['Target']).explode().value_counts(normalize=True)
train_test_target_proba = (df_train['Data'] + df_train['Target'] + df_test['Data']).explode().value_counts(normalize=True)

In [17]:
def tops_by_proba(seq, base_proba=train_proba, top_n=10, importance=1):
    counts = np.unique(seq, return_counts=True)
    proba = pd.Series(counts[1], index=counts[0]) / counts[1].sum() * importance
    proba = base_proba.add(proba, fill_value=0).sort_values(ascending=False)
    return list(proba.index)[:top_n]

In [18]:
df_train['Predictions'] = df_train['Data'].apply(tops_by_proba, base_proba=train_proba, top_n=184, importance=7)
print(mapk(df_train['Target'], df_train['Predictions']))

0.3331617781479149


In [19]:
def tops_by_weighted_proba(seq, base_proba=train_proba, top_n=10, importance=1):
    seq_len = len(seq)
    index_sum = seq_len * (seq_len+1) / 2
    proba = pd.Series(index=np.unique(seq))
    
    for i in proba.index:
        proba[i] = sum(np.where(np.array(seq) == i)[0] + 1) / index_sum * importance

    proba = base_proba.add(proba, fill_value=0).sort_values(ascending=False)
    return list(proba.index)[:top_n]

In [20]:
df_train['Predictions'] = df_train['Data'].apply(tops_by_weighted_proba, base_proba=train_proba, top_n=10, importance=9)
print(mapk(df_train['Target'], df_train['Predictions']))

0.33713661107838555


In [21]:
def tops_by_normalized_proba(seq):
    seq_len = len(seq)
    index_sum = seq_len * (seq_len+1) / 2
    probas = pd.Series(index=np.unique(seq))

    for i in probas.index:
        probas[i] = sum(np.where(np.array(seq) == i)[0] + 1) / index_sum

    seq_tops = list(probas.sort_values(ascending=False).index)
    other_tops = [x for x in top10_codes.index if x not in seq_tops]
    
    return seq_tops + other_tops

In [22]:
df_train['Predictions'] = df_train['Data'].apply(tops_by_normalized_proba)
print(mapk(df_train['Target'], df_train['Predictions']))

0.3358076776376222


In [23]:
def tops_by_normalized_proba_sliced(seq):
    seq_len = len(seq)
    index_sum = seq_len * (seq_len+1) / 2
    probas = pd.Series(index=np.unique(seq))

    for i in probas.index:
        probas[i] = sum(np.where(np.array(seq) == i)[0] + 1) / index_sum
    
    sorted_codes = probas.sort_values(ascending=False).index
    
    seq_tops = [x for x in sorted_codes if probas[x] >= 0.1]
    other_tops = [x for x in top10_codes.index if x not in sorted_codes]
    seq_tail = [x for x in sorted_codes if probas[x] < 0.1]

    return seq_tops + other_tops + seq_tail

In [24]:
df_train['Predictions'] = df_train['Data'].apply(tops_by_normalized_proba_sliced)
print(mapk(df_train['Target'], df_train['Predictions']))

0.31054917971738677


In [25]:
def seq_by_weighted_proba(seq):
    seq_len = len(seq)
    index_sum = seq_len * (seq_len+1) / 2
    proba = pd.Series(index=np.unique(seq))

    for i in proba.index:
        proba[i] = sum(np.where(np.array(seq) == i)[0] + 1) / index_sum

    output = []
    counts = round(proba.sort_values(ascending=False) * 10)

    for i in counts.index:
        output += [i] * int(counts[i])
        if counts[i] == 0:
            output += [i]
    
    return output

In [26]:
df_train['Predictions'] = df_train['Data'].apply(seq_by_weighted_proba)
print(mapk(df_train['Target'], df_train['Predictions']))

0.20454976990559245


In [27]:
def seq_by_weighted_proba_choice(seq, size=10):
    seq_len = len(seq)
    index_sum = seq_len * (seq_len+1) / 2
    unique_codes = np.unique(seq)
    probabilities = np.zeros_like(unique_codes, dtype='float')

    for i, code in enumerate(unique_codes):
        probabilities[i] = sum(np.where(np.array(seq) == code)[0] + 1) / index_sum

    return list(np.random.choice(unique_codes, size=size, p=probabilities))

In [28]:
df_train['Predictions'] = df_train['Data'].apply(seq_by_weighted_proba_choice)
print(mapk(df_train['Target'], df_train['Predictions']))

0.1928600994405061


In [29]:
def seq_by_proba_circled(seq):
    proba = pd.Series(seq).value_counts(normalize=True)
    counts = round(proba.sort_values(ascending=False) * 10)
    output = []

    while counts.sum() != 0:
        for i in counts.index:
            if counts[i] != 0:
                output.append(i)
                counts[i] -= 1

    for i in counts.index:
        if i not in output:
            output.append(i)

    return output

In [30]:
df_train['Predictions'] = df_train['Data'].apply(seq_by_proba_circled)
print(mapk(df_train['Target'], df_train['Predictions']))

0.29443364501590014


In [31]:
def seq_by_weighted_proba_circled(seq):
    seq_len = len(seq)
    index_sum = seq_len * (seq_len+1) / 2
    proba = pd.Series(index=np.unique(seq))

    for i in proba.index:
        proba[i] = sum(np.where(np.array(seq) == i)[0] + 1) / index_sum

    output = []
    counts = round(proba.sort_values(ascending=False) * 10)

    while counts.sum() != 0:
        for i in counts.index:
            if counts[i] != 0:
                output.append(i)
                counts[i] -= 1

    for i in counts.index:
        if i not in output:
            output.append(i)

    return output

In [32]:
df_train['Predictions'] = df_train['Data'].apply(seq_by_weighted_proba_circled)
print(mapk(df_train['Target'], df_train['Predictions']))

0.30046224262490434


In [33]:
def seq_by_weighted_proba_mixed(seq):
    seq_len = len(seq)
    index_sum = seq_len * (seq_len+1) / 2
    probas = pd.Series(index=np.unique(seq))

    for code in probas.index:
        probas[code] = sum(np.where(np.array(seq) == code)[0] + 1) / index_sum

    counts = round(probas.sort_values(ascending=False) * 10)
    outsider_codes = list(counts[counts == 0].index) + [x for x in top10_codes.index if x not in probas.index]
    output = [None]

    while counts.sum() != 0:
        for code in counts.index:
            if counts[code] != 0:
                if code != output[-1] or not outsider_codes:
                    output.append(code)
                    counts[code] -= 1
                else:
                    output.append(outsider_codes.pop(0))

    output += outsider_codes

    return output[1:]

In [34]:
df_train['Predictions'] = df_train['Data'].apply(seq_by_weighted_proba_mixed)
print(mapk(df_train['Target'], df_train['Predictions']))

0.3042841682860167


In [35]:
def seq_by_weighted_proba_no_repeats(seq):
    seq_len = len(seq)
    pos_sum = seq_len * (seq_len+1) / 2
    probas = pd.Series(index=np.unique(seq))

    for code in probas.index:
        positions = np.where(np.array(seq) == code)[0] + 1
        probas[code] = sum(positions) / pos_sum

    counts = round(probas.sort_values(ascending=False) * 10)
    outsider_codes = list(counts[counts == 0].index)
    output = [None]

    while counts.sum() != 0:
        for code in counts.index:
            if counts[code] != 0:
                if code != output[-1] or not outsider_codes:
                    output.append(code)
                    counts[code] -= 1
                else:
                    output.append(outsider_codes.pop(0))

    output += outsider_codes
    output += [counts.index[0]] * (11 - len(output))

    return output[1:]

In [36]:
df_train['Predictions'] = df_train['Data'].apply(seq_by_weighted_proba_no_repeats)
print(mapk(df_train['Target'], df_train['Predictions']))

0.3041419814525175


In [37]:
def seq_by_probas(seq):
    seq_len = len(seq)
    pos_sum = seq_len * (seq_len+1) / 2
    probas = {}

    for code in np.unique(seq):
        positions = np.where(np.array(seq) == code)[0] + 1
        probas[code] = sum(positions) / pos_sum

    output = []

    while any(val > 0 for val in probas.values()):
        next_code = max(probas, key=probas.get)
        output.append(next_code)
        probas[next_code] -= 0.1

    return output

In [38]:
df_train['Predictions'] = df_train['Data'].apply(seq_by_probas)
print(mapk(df_train['Target'], df_train['Predictions']))

0.2395271215742565


In [39]:
def seq_by_probas_no_repeats(seq):
    seq_len = len(seq)
    pos_sum = seq_len * (seq_len+1) / 2
    probas = {}

    for code in np.unique(seq):
        positions = np.where(np.array(seq) == code)[0] + 1
        probas[code] = sum(positions) / pos_sum

    output = [None]

    while any(val > 0 for val in probas.values()):
        next_code = sorted(probas, key=probas.get)[-1]
        if next_code == output[-1]:
            try:
                next_code = sorted(probas, key=probas.get)[-2]
            except:
                pass
        output.append(next_code)
        probas[next_code] -= 0.1

    return output[1:]

In [40]:
df_train['Predictions'] = df_train['Data'].apply(seq_by_probas_no_repeats)
print(mapk(df_train['Target'], df_train['Predictions']))

0.2660750735196207


In [41]:
total_proba = (df_train['Data'] + df_train['Target'] + df_test['Data']).explode().value_counts(ascending=True)
codes_dict = {code: i for i, code in enumerate(total_proba.index)}

In [42]:
def seq_by_probas_no_repeats_fixed(seq):
    seq_len = len(seq)
    pos_sum = seq_len * (seq_len+1) / 2
    probas = {}

    for code in np.unique(seq):
        positions = np.where(np.array(seq) == code)[0] + 1
        probas[code] = sum(positions) / pos_sum

    output = [None]

    while any(val > 0 for val in probas.values()):
        max_ = max(probas.values())
        next_code = [key for key, val in probas.items() if val == max_]
        next_code = max(next_code, key=codes_dict.get)
        
        if next_code == output[-1]:
            try:
                probas_copy = probas.copy()
                del probas_copy[next_code]
                max_ = max(probas_copy.values())
                next_code = [key for key, val in probas_copy.items() if val == max_]
                next_code = max(next_code, key=codes_dict.get)
            except:
                pass
        
        output.append(next_code)
        probas[next_code] -= 0.1

    return output[1:]

In [43]:
df_train['Predictions'] = df_train['Data'].apply(seq_by_probas_no_repeats_fixed)
print(mapk(df_train['Target'], df_train['Predictions']))

0.266087328670508


In [44]:
total_top_10 = (df_train['Data'] + df_train['Target'] + df_test['Data']).explode().value_counts(ascending=False).index[:10]
total_top_10

Index([6011, 6010, 4814, 5411, 4829, 5499, 5541, 5912, 5331, 5812], dtype='object')

In [45]:
def tops_by_proba_final(seq, cutoff=250, drop_from=3):
    seq = seq[-cutoff:]
    seq_len = len(seq)
    pos_sum = seq_len * (seq_len+1) / 2
    probas = {}

    for code in np.unique(seq):
        positions = np.where(np.array(seq) == code)[0] + 1
        if len(positions) >= drop_from:
            probas[code] = sum(positions) / pos_sum

    output = sorted(probas, key=probas.get, reverse=True)

    if len(output) < 10:
        output += [x for x in total_top_10 if x not in output]

    return output[:10]

In [46]:
df_train['Predictions'] = df_train['Data'].apply(tops_by_proba_final)
print(mapk(df_train['Target'], df_train['Predictions']))

0.3354597261436448


In [47]:
df_test['Predicted'] = df_test['Data'].apply(tops_by_proba_final)
min([len(x) for x in df_test['Predicted']]), max([len(x) for x in df_test['Predicted']])

(10, 10)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(tops_by_proba, base_proba=test_proba, top_n=184)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_proba_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(tops_by_proba, base_proba=test_proba, top_n=10)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_proba_2.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(tops_by_proba, base_proba=test_proba, top_n=184, importance=7)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_proba_weighted_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(tops_by_proba, base_proba=train_test_target_proba, top_n=184, importance=7)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_proba_weighted_2.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(tops_by_weighted_proba, base_proba=test_proba, top_n=20, importance=9)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_proba_weighted_3.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_weighted_proba)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_weighted_proba_2.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_weighted_proba_choice, size=10)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_weighted_proba_choice_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_weighted_proba_choice, size=100)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_weighted_proba_choice_2.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(tops_by_normalized_proba)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('tops_by_normalized_proba_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(tops_by_normalized_proba_sliced)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_tops_by_normalized_proba_sliced_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_weighted_proba)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_weighted_proba_4.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_weighted_proba_circled)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_weighted_proba_circled_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_proba_circled)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_proba_circled_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_weighted_proba_no_repeats)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_weighted_proba_no_repeats_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_weighted_proba_mixed)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_weighted_proba_mixed_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_weighted_proba_mixed)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_weighted_proba_mixed_2.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_combined_proba)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_combined_proba_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_probas)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_probas_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_probas_no_repeats)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_probas_no_repeats_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_combined_probas_no_repeats)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_combined_probas_no_repeats_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_probas_no_repeats_fixed)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_probas_no_repeats_fixed_1.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_weighted_proba_no_repeats)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_weighted_proba_no_repeats_2.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_weighted_proba_no_repeats)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_weighted_proba_no_repeats_3.csv', index=False)

In [None]:
# df_test['Predicted'] = df_test['Data'].apply(seq_by_probas_no_repeats_fixed)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_seq_by_probas_no_repeats_fixed_2.csv', index=False)

In [56]:
# df_test['Predicted'] = df_test['Data'].apply(tops_by_proba_final)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_tops_by_proba_final_1.csv', index=False)

In [61]:
# df_test['Predicted'] = df_test['Data'].apply(get_top_codes_some)

# submission = df_test[['Id', 'Predicted']]
# submission['Predicted'] = submission['Predicted'].astype(str).str.replace(',', '')
# submission.to_csv('submission_get_top_codes_some_1.csv', index=False)