In [3]:
"""
Use content based to weight the ratings, the cluster would work here to impute

 use a weighting system controlled by alpha parameter

 combined_similarity = alpha*content_sim+(1-alpha)*collb_sim
 alpha = 1 then content_only
 alpha = 0 then collab only
 alpha can be tuned
 http://facweb.cs.depaul.edu/mobasher/research/papers/ewmf04-web/node9.html
"""

import os
from warnings import warn

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
    
from recipe_similarities.utils.clean_data import prepare_data
from recipe_similarities.config.defaults import raw_data_files

# Directory containing this module (symlinks resolved); the bundled default
# data files are looked up relative to it.
BASE_DIR = os.path.split(os.path.realpath(__file__))[0]


class SimilarityFactory:
    """Build recipe-to-recipe similarity matrices.

    Two kinds of similarity are produced:

    * content based (``recipes_jaccard_similarities`` and
      ``recipe_cosine_similarities``), computed from the categorical
      attributes held in ``recipes_df``;
    * score based (``scored_cosine_similarities``), computed from the
      pairwise scores held in ``sim_scores_df``.

    Call ``load_data`` before any of the similarity methods; it populates
    ``recipes_df`` and ``sim_scores_df``.
    """

    def __init__(self,
                 recipes_info_file_path=None,
                 similarity_score_file_path=None,
                 recipe_ids=None):
        """Store the data locations; nothing is read until ``load_data``.

        :param recipes_info_file_path: path to the recipes csv, or None to
            fall back to the packaged default data.
        :param similarity_score_file_path: path to the similarity scores csv,
            or None to fall back to the packaged default data.
        :param recipe_ids: stored on the instance; not used by any method in
            this class yet.
        """
        self.recipes_info_file_path = recipes_info_file_path
        self.similarity_score_file_path = similarity_score_file_path
        self.recipe_ids = recipe_ids
        self.recipes_df = None
        self.sim_scores_df = None

    def load_data(self):
        """Load ``recipes_df`` and ``sim_scores_df`` via ``prepare_data``.

        If either path is missing, BOTH paths are replaced by the packaged
        default raw data files (a warning is emitted). The data is then
        loaded in every case - previously caller-supplied paths were never
        read because the load only happened inside the fallback branch.
        """
        if self.recipes_info_file_path is None or self.similarity_score_file_path is None:
            warn("You have not provided a recipes.csv AND a similarity_scores.csv. Default raw data will be used. ")
            raw_data = raw_data_files()
            raw_data_dir = os.path.join(BASE_DIR, 'data', 'raw_data')

            self.recipes_info_file_path = os.path.join(raw_data_dir,
                                                       raw_data['recipes_info'])
            self.similarity_score_file_path = os.path.join(raw_data_dir,
                                                           raw_data['similarity_scores'])

        self.recipes_df, self.sim_scores_df = prepare_data(self.recipes_info_file_path,
                                                           self.similarity_score_file_path)

    @staticmethod
    def _prep_time_class(field):
        """Convert a preparation time to a class label.

        nb highly subjective, should ideally be validated by understanding
        customer perception.

        :param field: preparation time (compared against 20 and 40).
        :return: 'fast' (<= 20), 'medium' (<= 40), 'slow' (> 40), or
            'missing' when the value is not a number (e.g. the 'missing'
            placeholder written by ``_recipes_prepare_data``) or is NaN.
        """
        try:
            prep_time = float(field)
        except (TypeError, ValueError):
            return 'missing'

        if np.isnan(prep_time):
            return 'missing'
        if prep_time <= 20:
            return 'fast'
        if prep_time <= 40:
            return 'medium'
        return 'slow'

    def _recipes_prepare_data(self):
        """Return a cleaned, categorical copy of ``recipes_df``.

        The copy is indexed by ``recipe_id``, drops ``country_secondary``,
        replaces missing values with the literal 'missing', rewrites a few
        labels so they are self-describing and buckets ``prep_time`` with
        ``_prep_time_class``. ``self.recipes_df`` itself is left untouched.
        """
        df = self.recipes_df.copy()

        df = df.set_index('recipe_id')
        del df['country_secondary']

        # treat missing content as information
        df = df.fillna('missing')

        # Assign the result back rather than using inplace=True on a column
        # selection, which is not guaranteed to modify the parent frame.
        df['family_friendly'] = df['family_friendly'].replace({'no': 'family unfriendly',
                                                               'yes': 'family friendly'})
        df['dish_category'] = df['dish_category'].replace({'protein&veg': 'protein & veg'})

        df['prep_time'] = df['prep_time'].apply(self._prep_time_class)

        return df

    def recipes_jaccard_similarities(self):
        """Return the pairwise Jaccard similarity of the recipe attributes.

        Each recipe is treated as a set of (column, value) pairs, one per
        column of the prepared data.

        :return: square DataFrame indexed and labelled by recipe id.
        """
        # utilise numpys matrix operations for fast computation
        df = self._recipes_prepare_data()
        values = df.values

        # Broadcasting compares every recipe with every other recipe column
        # by column without first materialising a repeated copy of the data.
        intersect = np.sum(values[np.newaxis, :, :] == values[:, np.newaxis, :], axis=2)

        # Every recipe has exactly one value per column, so
        # |A u B| = |A| + |B| - |A n B| = 2 * n_columns - intersect
        # (identical to 2 * mismatches + intersect).
        union = 2 * values.shape[1] - intersect
        jaccard_sim = intersect / union

        return pd.DataFrame(jaccard_sim, index=df.index, columns=df.index)

    def _concat_to_pipe_delim_str(self, fields):
        """Join the values of one row into a single '||' delimited string.

        Values are converted with ``str`` so non-string cells do not break
        the join.
        """
        return "||".join(str(value) for value in fields.tolist())

    @staticmethod
    def _custom_tokeniser(doc):
        """Split a '||' delimited string back into its attribute tokens."""
        return doc.split("||")

    def recipe_cosine_similarities(self):
        """Return the pairwise cosine similarity of TF-IDF weighted attributes.

        Every prepared row is flattened to a '||' delimited string, each
        attribute value becomes one token, and the rows are compared on
        their TF-IDF vectors.

        :return: square DataFrame indexed and labelled by recipe id.
        """
        df = self._recipes_prepare_data()
        df['delim_str'] = df.apply(self._concat_to_pipe_delim_str, axis=1)
        # min_df=1 keeps every token (same effect as the former min_df=0,
        # which current scikit-learn rejects as an invalid integer value).
        tf_idf = TfidfVectorizer(analyzer='word',
                                 min_df=1,
                                 tokenizer=self._custom_tokeniser
                                 )
        tfidf_matrix = tf_idf.fit_transform(df['delim_str'])
        cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
        # Wrap the similarity matrix itself (not the prepared data frame).
        return pd.DataFrame(cos_sim, index=df.index, columns=df.index)

    def scored_cosine_similarities(self):
        """Return the pairwise cosine similarity derived from the scores.

        Duplicate (recipe_a, recipe_b) pairs are averaged, the pairs are
        turned into a graph whose edges are weighted by ``score``, and the
        rows of the resulting adjacency matrix are compared with cosine
        similarity.

        :return: square DataFrame indexed and labelled by the graph's node
            ids cast to int.
        """
        df = self.sim_scores_df
        # Average only the score column so unrelated (possibly non-numeric)
        # columns cannot break the aggregation.
        edges = df.groupby(['recipe_a', 'recipe_b'])['score']\
                  .mean()\
                  .reset_index()[['recipe_a', 'recipe_b', 'score']]

        # networkx renamed/removed these helpers across releases; prefer the
        # current names and fall back to the ones originally used here.
        build_graph = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
        to_adjacency = getattr(nx, 'to_numpy_array', None) or nx.to_numpy_matrix

        graph = build_graph(edges, 'recipe_a', 'recipe_b', 'score')
        adj_matrix = to_adjacency(graph, weight='score')
        nodes = [int(n) for n in graph.nodes()]
        adj_matrix_df = pd.DataFrame(adj_matrix, index=nodes, columns=nodes)
        cos_sim = cosine_similarity(adj_matrix_df, adj_matrix_df)
        return pd.DataFrame(cos_sim, index=adj_matrix_df.index, columns=adj_matrix_df.index)

    def _join_weights_to_recipes(self):
        """Placeholder - not implemented yet; currently does nothing."""

    def hybrid_weighted_similarity(self):
        """Placeholder - not implemented yet; currently returns None.

        NOTE(review): presumably meant to implement the alpha-weighted blend
        of content and collaborative similarity described in the module
        docstring - confirm before implementing.
        """

ImportError: No module named 'recipe_similarities'