In [1]:
import pandas as pd
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/ma/ma_ma/ma_pbhattar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the test sessions (task 1, phase 2) and the training sessions from CSV.
sessions_test = pd.read_csv("../../data/test/sessions_test_task1_phase2.csv")
sessions_train = pd.read_csv("../../data/train/sessions_train.csv")

In [3]:
sessions_test.head(2)

Unnamed: 0,prev_items,locale
0,['B087VLP2RT' 'B09BRQSHYH' 'B099KW4ZLV'],DE
1,['B08XW4W667' 'B096VMCJYF' 'B096VMCJYF'],DE


In [4]:
def convert_to_text_list(df):
    """Render each session as one whitespace-separated string of item IDs.

    For every row, ``prev_items`` is converted with ``str``, its first and
    last characters are dropped (the enclosing brackets of a value such as
    ``"['A' 'B']"``), single quotes and newlines are removed, and
    ``str(next_item)`` is appended after a single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``prev_items`` and ``next_item`` columns.

    Returns
    -------
    list of str
        One string per row, in row order.

    Raises
    ------
    KeyError
        If either required column is missing.
    """
    # Walk the two columns directly: iterrows() materialises a Series for
    # every row, which is needlessly slow on large frames.
    return [
        str(prev_items)[1:-1].replace("'", "").replace("\n", "") + ' ' + str(next_item)
        for prev_items, next_item in zip(df['prev_items'], df['next_item'])
    ]

def convert_to_token_list(df):
    """Flatten all sessions into a single list of item-ID tokens.

    For every row, ``prev_items`` is converted with ``str``, its first and
    last characters are dropped (the enclosing brackets of a value such as
    ``"['A' 'B']"``), single quotes and newlines are removed, and the result
    is split on whitespace. ``str(next_item)`` is appended after the row's
    previous items.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``prev_items`` and ``next_item`` columns.

    Returns
    -------
    list of str
        Tokens of all rows concatenated in row order.

    Raises
    ------
    KeyError
        If either required column is missing.
    """
    tokens = []
    # Walk the two columns directly: iterrows() materialises a Series for
    # every row, which is needlessly slow on large frames.
    for prev_items, next_item in zip(df['prev_items'], df['next_item']):
        cleaned = str(prev_items)[1:-1].replace("'", "").replace("\n", "")
        tokens.extend(cleaned.split())
        tokens.append(str(next_item))
    return tokens

In [5]:
from collections import defaultdict

def create_ngram_model(texts, n, top_k=100):
    """Count which token follows each n-token context and keep the top ones.

    Every text is tokenized with ``word_tokenize``. For each window of ``n``
    consecutive tokens that has a successor, the successor's count under that
    context is incremented. Each context's followers are then sorted by
    descending count (ties keep first-seen order) and cut to ``top_k``.

    Parameters
    ----------
    texts : iterable of str
        Texts to learn from; each one is tokenized independently.
    n : int
        Number of tokens in a context.
    top_k : int or None, optional
        Maximum number of followers kept per context (default 100).
        ``None`` keeps all of them.

    Returns
    -------
    collections.defaultdict
        Maps ``tuple`` contexts to plain ``dict``s of ``{next_token: count}``
        ordered by descending count. Indexing a context that was never seen
        returns (and inserts) an empty mapping.
    """
    # Named `transitions` so the imported nltk.util.ngrams is not shadowed.
    transitions = defaultdict(lambda: defaultdict(int))

    # Tokenize one text at a time instead of holding every token list in memory.
    for text in texts:
        token_list = word_tokenize(text)
        for i in range(len(token_list) - n):
            context = tuple(token_list[i:i + n])
            transitions[context][token_list[i + n]] += 1

    # Only values of existing keys are replaced, so iterating while assigning is safe.
    for context, followers in transitions.items():
        ranked = sorted(followers.items(), key=lambda pair: pair[1], reverse=True)
        transitions[context] = dict(ranked[:top_k])

    return transitions

In [6]:
# Separate the training sessions by locale
sessions_train_DE = sessions_train.loc[sessions_train['locale'].eq('DE')]
sessions_train_JP = sessions_train.loc[sessions_train['locale'].eq('JP')]
sessions_train_UK = sessions_train.loc[sessions_train['locale'].eq('UK')]

In [7]:
# Build the DE model: each 1-token context maps to counts of the token that follows it
onegram_DE = create_ngram_model(
    texts=convert_to_text_list(sessions_train_DE),
    n=1,
)

In [8]:
%%timeit
onegram_JP = create_ngram_model(convert_to_text_list(sessions_train_JP), 1)

1min 44s ± 1.86 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
onegram_UK = create_ngram_model(convert_to_text_list(sessions_train_UK), 1)

1min 55s ± 171 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Index the per-locale n-gram models by locale code for easy lookup
ngram_models = {
    'DE': onegram_DE,
    'JP': onegram_JP,
    'UK': onegram_UK,
}

In [None]:
%%timeit
all_grams_dict = {'DE': Counter(convert_to_token_list(sessions_train_DE)),
                  'JP': Counter(convert_to_token_list(sessions_train_JP)),
                  'UK': Counter(convert_to_token_list(sessions_train_UK))}

In [None]:
# Replace each locale's Counter with its 100 most common (item, count) pairs
for locale, item_counts in all_grams_dict.items():
    all_grams_dict[locale] = item_counts.most_common(100)

In [None]:
# Make predictions based on locale
NUM_PREDICTIONS = 100  # number of items predicted per session

prediction_strings = []
for prev_items, locale in zip(sessions_test['prev_items'], sessions_test['locale']):
    # Same clean-up as used for the training texts: drop the enclosing
    # brackets, the quotes and any newlines, then tokenize.
    text = str(prev_items)[1:-1].replace("'", "").replace("\n", "")
    tokens = word_tokenize(text)

    # Items that followed the session's last item in training, most frequent
    # first. .get() avoids inserting an empty entry into the model for unseen
    # items; an empty session yields the key () and therefore no candidates.
    followers = ngram_models[locale].get(tuple(tokens[-1:]), {})
    predictions = list(followers)[:NUM_PREDICTIONS]

    # Pad with the locale's most popular items. Counting by len(predictions)
    # fixes the case of exactly NUM_PREDICTIONS - 1 followers, which used to
    # skip padding; items already predicted are not added a second time.
    if len(predictions) < NUM_PREDICTIONS:
        seen = set(predictions)
        for item, _count in all_grams_dict[locale]:
            if item in seen:
                continue
            predictions.append(item)
            seen.add(item)
            if len(predictions) == NUM_PREDICTIONS:
                break

    # Stored as the string form of the list, as the following cells expect.
    prediction_strings.append(str(predictions))

# Assign the whole column at once instead of one .at write per row.
sessions_test['next_item_prediction'] = prediction_strings

In [None]:
sessions_test.head(2)

In [None]:
sessions_test.iloc[0].next_item_prediction

In [None]:
# Remove the enclosing list brackets from each stringified prediction list.
# strip('[]') only removes bracket characters, so re-running this cell is a
# no-op, whereas slicing with [1:-1] would eat real characters each time.
sessions_test['next_item_prediction'] = [
    str(prediction).strip('[]')
    for prediction in sessions_test['next_item_prediction']
]

In [None]:
sessions_test.iloc[0].next_item_prediction

In [None]:
# Remove the single quotes around every item ID in the prediction strings
sessions_test['next_item_prediction'] = [
    prediction.replace("'", "")
    for prediction in sessions_test['next_item_prediction']
]

In [None]:
sessions_test.iloc[0].next_item_prediction

In [None]:
# Drop the input column from the frame. Non-inplace with errors='ignore' so
# re-running the cell does not raise KeyError once the column is already gone.
sessions_test = sessions_test.drop(columns='prev_items', errors='ignore')

In [None]:
# Turn each comma-separated prediction string into a list to match the submission format
sessions_test['next_item_prediction'] = sessions_test['next_item_prediction'].map(
    lambda joined: joined.split(', ')
)

In [None]:
sessions_test.head(2)

In [None]:
sessions_test.iloc[0].next_item_prediction

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq

# Destination of the predictions file
output_path = '../../outputs/' + 'task1_predictions.parquet'

# Convert the frame to an Arrow table and save it as gzip-compressed parquet
table = pa.Table.from_pandas(sessions_test)
pq.write_table(table, output_path, compression='gzip')