In [None]:
from abc import ABCMeta, abstractmethod
from sklearn.neighbors import NearestNeighbors


class ModelInterface(metaclass=ABCMeta):

    """
    Abstract base class for nearest-neighbour recommendation models.

    Holds a brute-force ``NearestNeighbors`` estimator and the data processor
    that turns the raw interactions into the matrix the estimator is fitted
    on. Subclasses must implement ``predictions_counter``; because the class
    is declared with ``metaclass=ABCMeta`` it cannot be instantiated directly.
    """

    def __init__(self, data, metrics, n_neighbours, data_processor):
        """
        :param data: raw interactions handed to ``data_processor.process_data``
        :param metrics: distance metric forwarded to ``NearestNeighbors``
        :param n_neighbours: number of neighbours requested for every sample
        :param data_processor: object exposing ``process_data(data=...)`` that
            returns ``(matrix, labels)``
        """
        self.data = data
        self.metrics = metrics
        self.n_neighbours = n_neighbours
        self.data_processor = data_processor
        # Labels of the fitted samples; filled in by ``model_fitter``.
        self.users_idx = []
        self.model = NearestNeighbors(
            metric=self.metrics,
            algorithm='brute',
            n_neighbors=self.n_neighbours,
            n_jobs=-1)

    def model_fitter(self):

        """
        Fitting NearestNeighbors model.
        Stores the sample labels returned by the data processor in
        ``self.users_idx``.
        :return: the matrix the model was fitted on
        """

        prepare_data, users_idx = self.data_processor.process_data(data=self.data)
        self.model.fit(prepare_data)
        self.users_idx = users_idx
        return prepare_data

    def model_nearest_neighbours_getter(self):

        """
        Fits the model and finds nearest neighbours for all items/users.
        The fitted matrix itself is used as the query set.
        :return: whatever ``self.model.kneighbors`` returns
            (callers unpack it as ``distances, neighbours``)
        """
        prepare_data = self.model_fitter()
        return self.model.kneighbors(prepare_data, n_neighbors=self.n_neighbours)

    @abstractmethod
    def predictions_counter(self):

        """
        Making predictions. Should be overwritten for each approach.
        :return:
        """
        return


In [None]:
from models.ModelInterface import ModelInterface


class UserToUserApproach(ModelInterface):

    """
    Prediction model for User to User approach.
    Checking distance between users with provided metric.
    Making predictions for each user with predictions_counter function
    """

    def predictions_counter(self):

        """
        Counting neighbours and distances for each user.
        Adding neighbours products in recommendations.
        :return: dictionary mapping every user label in ``self.users_idx`` to
            a list of recommended items (values of ``data['col']``)
        """
        _, neighbours = self.model_nearest_neighbours_getter()

        # Collect every user's items once, in their original order, instead of
        # filtering the whole dataframe again for each neighbour.
        items_by_user = {}
        for row_id, col_id in zip(self.data['row'].values, self.data['col'].values):
            items_by_user.setdefault(row_id, []).append(col_id)

        prediction_dict = dict()
        for idx, user in enumerate(self.users_idx):
            predictions = []
            # The first neighbour is skipped: the model is queried with its own
            # training matrix, so it is expected to be the user itself.
            for position in neighbours[idx][1:]:
                # kneighbors yields positions in the fitted matrix, not user
                # ids, so translate the position into the user label first.
                neighbour_user = self.users_idx[position]
                predictions += items_by_user.get(neighbour_user, [])
                if len(predictions) >= 10:
                    break
            prediction_dict[user] = predictions
        return prediction_dict


In [None]:
import ml_metrics as metrics


class SolutionAnalysis:

    """Counts MAP@10 score for model predictions"""

    def __init__(self, prediction, test_dataframe):
        """
        :param prediction: dictionary mapping a user to its list of predicted items
        :param test_dataframe: dataframe with ``row`` (user) and ``col`` (item) columns
        """
        self.prediction = prediction
        self.test = test_dataframe
        # Set once ``test_reconfiguration`` has converted both attributes.
        self._reconfigured = False

    def count_map_at_10(self):
        """
        Counts mapk10 from ml_metrics.
        :return: MAP@10 over the users present in both test data and predictions
        """
        self.test_reconfiguration()
        # ml_metrics.mapk takes (actual, predicted, k): ground truth goes first.
        return metrics.mapk(self.test, self.prediction, 10)

    def test_reconfiguration(self):

        """
        Reconfigure self.test and self.prediction into the valid form for
        score counting: two parallel lists holding, for every user found in
        both the test data and the predictions, the test items and the
        predicted items.
        Calling it again after the conversion is a no-op.
        :return:
        """

        if getattr(self, '_reconfigured', False):
            return

        test_results = []
        prediction_results = []
        # sort=False keeps users in order of first appearance in the test data.
        for user, items in self.test.groupby('row', sort=False)['col']:
            if user in self.prediction:
                test_results.append(list(items.values))
                prediction_results.append(self.prediction[user])
        self.test = test_results
        self.prediction = prediction_results
        self._reconfigured = True


In [None]:
from collections import Counter
from models.ModelInterface import ModelInterface


class ItemToItemApproach(ModelInterface):

    """
    Makes predictions with item to item approach.
    Finds nearest items for each userItem.
    Counts appearance of item for each users item recommendation with distance weight.
    Recommends top10 items with most weights for user.
    """

    def make_prediction_for_user(self, prediction_dict):

        """
        Builds recommendations for every user found in ``data['row']``.
        :param prediction_dict: dictionary mapping an item to a pair
            ``(neighbour items, distances to those neighbours)``
        :return: dictionary mapping a user to at most 10 items with the
            highest accumulated weight
        """

        # Collect every user's items once instead of filtering the dataframe per user.
        items_by_user = {}
        for row_id, col_id in zip(self.data['row'].values, self.data['col'].values):
            items_by_user.setdefault(row_id, []).append(col_id)

        return_dict = dict()
        for each_user in self.data['row'].unique():
            counter = Counter()
            for each_item in items_by_user.get(each_user, []):
                if each_item not in prediction_dict:
                    continue
                neighbour_items, neighbour_distances = prediction_dict[each_item]
                for each_predicted_item, item_distance in zip(neighbour_items, neighbour_distances):
                    # Every neighbour votes with weight (1 - distance).
                    counter[each_predicted_item] += 1 - item_distance
            return_dict[each_user] = [x[0] for x in counter.most_common(10)]
        return return_dict

    def predictions_counter(self):

        """
        Finds nearest neighbours for every fitted item and turns them into
        per-user recommendations.
        :return: dictionary mapping a user to a list of recommended items
        """

        distances, neighbours = self.model_nearest_neighbours_getter()
        prediction_dict = dict()
        for idx, item in enumerate(self.users_idx):
            # kneighbors yields positions in the fitted matrix, not item ids,
            # so translate them into the labels kept in self.users_idx.
            neighbour_items = [self.users_idx[position] for position in neighbours[idx]]
            prediction_dict[item] = (neighbour_items, distances[idx])
        return self.make_prediction_for_user(prediction_dict)


In [None]:
from models.DataProcessor import DataProcessor
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
import random


class DataProcessorForUser(DataProcessor):

    """
    Data processor for the user-user approach.
    Builds a user x item pivot table and appends region and age columns to it.
    """

    def __init__(self, user_region_data, user_age_data):
        self.user_region_data = user_region_data
        self.user_age_data = user_age_data

    @staticmethod
    def data_preparation(user_region_data):

        """
        Making user dictionary from user  region or age
        :param user_region_data: dataframe with ``row`` (user) and ``col`` (value) columns
        :return: dictionary with age or region information; for a user listed
            several times the last entry wins.
        """

        df = user_region_data.drop_duplicates(subset='row', keep="last")
        user_region_dict = pd.Series(df.col.values, index=df.row).to_dict()
        return user_region_dict

    @staticmethod
    def _known_or_random(lookup, key, fallback_values):
        """Return ``lookup[key]`` or, for an unknown key, a random known value (0 if there is none)."""
        if key in lookup:
            return lookup[key]
        if len(fallback_values) == 0:
            return 0
        return random.choice(fallback_values)

    def user_information_combiner(self, train_pivot_table, user_region_data, user_age_data):

        """
        Adding information about user region and age into users pivot table.
        Users without a known region/age get a randomly chosen known value.
        :param train_pivot_table: users pivot table
        :param user_region_data: user region dataframe
        :param user_age_data: user age dataframe
        :return: numpy matrix (uint8) with two extra columns: region, age
        """

        data_size = len(train_pivot_table.index)

        # uint8 storage: assigned values are truncated to whole numbers.
        regions = np.zeros(data_size, dtype=np.uint8)
        ages = np.zeros(data_size, dtype=np.uint8)

        user_region_dict = self.data_preparation(user_region_data)
        user_age_dict = self.data_preparation(user_age_data)

        regions_list = user_region_data['col'].unique()
        ages_list = user_age_data['col'].unique()

        for idx, i in enumerate(train_pivot_table.index):
            # The /10 scaling is applied to the random fallback as well, so
            # that the region column uses a single scale for every user.
            regions[idx] = self._known_or_random(user_region_dict, i, regions_list) / 10
            ages[idx] = self._known_or_random(user_age_dict, i, ages_list)

        numpy_pivot_table_in_unit8_format = train_pivot_table.values.astype(np.uint8)

        return np.column_stack((numpy_pivot_table_in_unit8_format, regions, ages))

    def process_data(self, data):

        """
        Data converter from pandas dataframe to sparse matrix
        :param data: dataframe with ``row``, ``col`` and ``data`` columns
        :return: csr_matrix with one row per user, and the user labels
            (index of the pivot table)
        """

        train_pivot_table = data.pivot(
            index='row',
            columns='col',
            values='data',
        ).fillna(0)

        numpy_pivot_table_in_unit8_format = self.user_information_combiner(
            train_pivot_table, self.user_region_data, self.user_age_data)
        csr_matrix_for_users = csr_matrix(numpy_pivot_table_in_unit8_format)
        return csr_matrix_for_users, train_pivot_table.index


In [None]:
from scipy.sparse import csr_matrix

from models.DataProcessor import DataProcessor
import pandas as pd
import numpy as np


class DataProcessorForItemsWithoutUserInfo(DataProcessor):

    """
    Item-item data processor that describes every item only by its asset,
    price and subclass (no user interaction vector).
    """

    def __init__(self, item_asset_data, item_price_data, item_subclass_data):
        self.item_subclass_data = item_subclass_data
        self.item_price_data = item_price_data
        self.item_asset_data = item_asset_data

    @staticmethod
    def data_preparation(data):

        """
        Maps every item (``row``) to its ``data`` value; used for assets and prices.
        For an item listed several times the last entry is kept.
        :param data: dataframe with ``row`` and ``data`` columns
        :return: dictionary of items
        """

        unique_items = data.drop_duplicates(subset='row', keep="last")
        return pd.Series(unique_items.data.values, index=unique_items.row).to_dict()

    @staticmethod
    def data_preparation_for_subclasses(data):

        """
        Maps every item (``row``) to its ``col`` value; used for subclasses.
        For an item listed several times the last entry is kept.
        :param data: dataframe with ``row`` and ``col`` columns
        :return: dictionary of items
        """

        unique_items = data.drop_duplicates(subset='row', keep="last")
        return pd.Series(unique_items.col.values, index=unique_items.row).to_dict()

    def process_data(self, data):

        """
        Builds a sparse feature matrix for the items that have an asset, a
        price and a subclass. The ``data`` argument is not read.
        :param data:
        :return: csr_matrix with items, and the list of item labels (one per matrix row)
        """

        assets = self.data_preparation(self.item_asset_data)
        prices = self.data_preparation(self.item_price_data)
        subclasses = self.data_preparation_for_subclasses(self.item_subclass_data)

        # Keep only items described by all three tables, in asset-table order.
        items_idx = [item for item in assets if item in prices and item in subclasses]
        feature_rows = [
            [assets[item] * 100, prices[item] * 100, subclasses[item] / 1000]
            for item in items_idx
        ]

        numpy_table = np.array(feature_rows, dtype=float)
        sparse_table = csr_matrix(numpy_table)
        print(numpy_table)
        return sparse_table, items_idx


In [None]:
from models.DataProcessor import DataProcessor
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd


class DataProcessorForItems(DataProcessor):

    """
    Data processor for items.
    Combines all information about items in csr_matrix
    This version includes users vector(information about users, who interacted items)

    """

    def __init__(self, item_asset_data, item_price_data, item_subclass_data):
        self.item_asset_data = item_asset_data
        self.item_price_data = item_price_data
        self.item_subclass_data = item_subclass_data

    @staticmethod
    def data_preparation(data):

        """
        Collects all items in one dict
        :param data: dataframe with ``row`` (item) and ``data`` (value) columns
        :return: dictionary containing items; for an item listed several
            times the last entry is kept
        """

        df = data.drop_duplicates(subset='row', keep="last")
        item_asset_dict = pd.Series(df.data.values, index=df.row).to_dict()
        return item_asset_dict

    def items_information_combiner(self, train_pivot_table, item_asset_data, item_price_data, item_subclass_data):

        """
        Adding asset, price and subclass information into item-users pivot table.
        Items missing from a table get that table's mean value.
        :param train_pivot_table: ready pivot table with users as columns, items as rows with values [0, 1]
        :param item_asset_data: item_asset dataframe
        :param item_price_data: item_price dataframe
        :param item_subclass_data: item_subclass dataframe
        :return: numpy matrix
        """

        data_size = len(train_pivot_table.index)

        # uint8 storage: assigned values are truncated to whole numbers.
        assets = np.zeros(data_size, dtype=np.uint8)
        prices = np.zeros(data_size, dtype=np.uint8)
        subclasses = np.zeros(data_size, dtype=np.uint8)

        item_assets_dict = self.data_preparation(item_asset_data)
        item_prices_dict = self.data_preparation(item_price_data)
        # NOTE(review): the subclass value is read from the 'data' column here,
        # like assets and prices - confirm against the item_subclass schema.
        item_subclasses_dict = self.data_preparation(item_subclass_data)

        assets_mean = item_asset_data['data'].mean()
        prices_mean = item_price_data['data'].mean()
        subclasses_mean = item_subclass_data['data'].mean()

        for idx, i in enumerate(train_pivot_table.index):
            # A missing dictionary key raises KeyError (not IndexError), so the
            # fallback to the mean is done with dict.get.
            assets[idx] = item_assets_dict.get(i, assets_mean)
            prices[idx] = item_prices_dict.get(i, prices_mean)
            subclasses[idx] = item_subclasses_dict.get(i, subclasses_mean)

        return np.column_stack((train_pivot_table.values, assets, prices, subclasses))

    def process_data(self, data):

        """
        Data converter from pandas dataframe to sparse matrix
        Only the item x user interaction table is used here;
        ``items_information_combiner`` is not called.
        :param data: dataframe with ``row``, ``col`` and ``data`` columns
        :return: csr_matrix with one row per item, and the item labels
            (index of the pivot table)
        """

        train_pivot_table = data.pivot(
            index='col',
            columns='row',
            values='data'
        ).fillna(0)
        numpy_pivot_table_in_unit8_format = train_pivot_table.values.astype(np.uint8)
        csr_matrix_for_users = csr_matrix(numpy_pivot_table_in_unit8_format)
        return csr_matrix_for_users, train_pivot_table.index


In [None]:
from abc import ABCMeta, abstractmethod


class DataProcessor(metaclass=ABCMeta):

    """
    Interface class for DataProcessors

    Declared with ``metaclass=ABCMeta`` so that ``@abstractmethod`` is
    enforced: a subclass has to implement ``process_data`` before it can be
    instantiated.
    """

    @abstractmethod
    def process_data(self, data):
        """
        Convert raw data into the form a model is fitted on.
        :param data: raw input data
        :return:
        """
        return


In [None]:
from models.CombinedApproach import CombinedApproach
from models.DataProcessorForItems import DataProcessorForItems
from models.DataProcessorForUser import DataProcessorForUser
from models.ItemToItemApproach import ItemToItemApproach
from models.SolutionAnalysis import SolutionAnalysis
from models.UserToUserApproach import UserToUserApproach


class CustomGridSearch:

    """
    Exhaustive search over every (metric, n_neighbours) pair.
    For each pair the user-to-user and item-to-item models are scored with
    MAP@10 and the best score seen so far is remembered together with the
    parameters that produced it.
    """

    def __init__(
            self,
            metrics_list,
            n_neighbours_list,
            data,
            test,
            item_asset,
            item_price,
            item_subclass,
            user_region,
            user_age):

        # Search space.
        self.metrics_list = metrics_list
        self.n_neighbours_list = n_neighbours_list

        # Train / test interactions.
        self.data = data
        self.test = test

        # Side information about items and users.
        self.item_asset = item_asset
        self.item_price = item_price
        self.item_subclass = item_subclass
        self.user_region = user_region
        self.user_age = user_age

        # Best result found so far: score and [metric, n_neighbours, approach].
        self.max_score = 0
        self.grid_result = []

    def grid_search(self):
        """
        Scores both approaches for every parameter pair, printing progress,
        and updates ``max_score`` / ``grid_result`` whenever a score is
        strictly higher than the best one so far.
        """
        print("Starting grid search")
        for metric in self.metrics_list:
            for n_neighbours in self.n_neighbours_list:
                print(' '.join(['Params:', metric, str(n_neighbours)]))

                # Both models are built before any of them is evaluated.
                # The combined approach is currently not part of the search.
                candidates = (
                    ('user_to_user', 'user_score: ', UserToUserApproach(
                        self.data,
                        metric,
                        n_neighbours,
                        DataProcessorForUser(self.user_region, self.user_age))),
                    ('item_to_item', 'item_score: ', ItemToItemApproach(
                        self.data,
                        metric,
                        n_neighbours,
                        DataProcessorForItems(self.item_asset, self.item_price, self.item_subclass))),
                )

                results = []
                for approach_name, caption, model in candidates:
                    analyzer = SolutionAnalysis(model.predictions_counter(), self.test)
                    results.append((approach_name, caption, analyzer.count_map_at_10()))

                for _, caption, score in results:
                    print(caption + str(score))

                for approach_name, _, score in results:
                    if score > self.max_score:
                        self.grid_result = [metric, n_neighbours, approach_name]
                        self.max_score = score

    def best(self):
        """Returns the best score and the parameters that produced it."""
        return self.max_score, self.grid_result


In [None]:
import random

from models.DataProcessorForItems import DataProcessorForItems
from models.DataProcessorForUser import DataProcessorForUser
from models.ItemToItemApproach import ItemToItemApproach
from models.UserToUserApproach import UserToUserApproach


class CombinedApproach:

    """
    Combines user_to_user approach with item_to_item approach.
    Counts predictions for user_to_user approach
    Counts predictions for item_to_item approach
    Finding intersections between predictions for each user
    If intersection is less than 10, randomly adding items from predictions union.
    """

    def __init__(self, data, metrics, n_neighbours, item_asset, item_price, item_subclass, user_region, user_age):
        self.item_to_item = ItemToItemApproach(
            data,
            metrics,
            n_neighbours,
            DataProcessorForItems(item_asset, item_price, item_subclass))
        self.user_to_user = UserToUserApproach(
            data,
            metrics,
            n_neighbours,
            DataProcessorForUser(user_region, user_age))

    def get_multiple_prediction(self):

        """
        Merges the predictions of both approaches for every user returned by
        the item-to-item model.
        Items predicted by both approaches come first, in item-to-item order.
        While fewer than 10 items are selected, the list is topped up with
        random items from the union of both predictions; when the union holds
        fewer than 10 distinct items, all of them are returned.
        :return: dictionary mapping a user to a list of recommended items
        """

        items_answer = self.item_to_item.predictions_counter()
        users_answer = self.user_to_user.predictions_counter()
        combined_answer = dict()

        for k, v in items_answer.items():
            item_based = list(v)
            # A user unknown to the user-to-user model contributes nothing.
            user_based = list(users_answer.get(k, []))
            user_based_set = set(user_based)

            # dict.fromkeys removes duplicates while keeping the ranking order.
            combined = [x for x in dict.fromkeys(item_based) if x in user_based_set]

            missing = 10 - len(combined)
            if missing > 0:
                already_chosen = set(combined)
                pool = [x for x in dict.fromkeys(item_based + user_based) if x not in already_chosen]
                # Sampling without replacement from a finite pool always
                # terminates, even when fewer than 10 distinct items exist.
                combined += random.sample(pool, min(missing, len(pool)))
            combined_answer[k] = combined
        return combined_answer

    def predictions_counter(self):
        """Same as ``get_multiple_prediction``; mirrors the model interface."""
        return self.get_multiple_prediction()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from models.CustomGridSearch import CustomGridSearch

"""
Reading data
"""
# Interaction triples plus the item / user side-information tables.
interactions = pd.read_csv('data/interactions.csv')
item_subclass = pd.read_csv('data/item_subclass.csv')
item_price = pd.read_csv('data/item_price.csv')
item_asset = pd.read_csv('data/item_asset.csv')
user_region = pd.read_csv('data/user_region.csv')
user_age = pd.read_csv('data/user_age.csv')

# Hold out 20% of the interactions for evaluation.
train, test = train_test_split(interactions, test_size=0.2, random_state=42, shuffle=True)

# Every item listed in item_subclass that has no interaction inside a split is
# appended to that split as a zero-valued interaction attached to row 0.
# NOTE(review): this also extends row 0's interactions in the test split with
# items it never interacted with - confirm that this is intended.
catalogue_items = set(item_subclass['row'])

mismatch_set = catalogue_items.difference(set(train['col'].values))
new_dataframe = pd.DataFrame(data={
    'row': [0] * len(mismatch_set),
    'col': list(mismatch_set),
    'data': [0] * len(mismatch_set)})

mismatch_set_test = catalogue_items.difference(set(test['col'].values))
new_dataframe_test = pd.DataFrame(data={
    'row': [0] * len(mismatch_set_test),
    'col': list(mismatch_set_test),
    'data': [0] * len(mismatch_set_test)})

train = pd.concat([train, new_dataframe])
test = pd.concat([test, new_dataframe_test])

# Parameter grid: distance metrics and neighbourhood sizes to try.
metrics = ['cityblock', 'cosine', 'euclidean']
n_neighbours_list = [3]

# Run the search and report the best score with its parameters.
grid = CustomGridSearch(
    metrics,
    n_neighbours_list,
    train,
    test,
    item_asset,
    item_price,
    item_subclass,
    user_region,
    user_age)

grid.grid_search()
print(grid.best())
