<a href="https://colab.research.google.com/github/aleks-haksly/Simulative/blob/main/Other/Analysis/RFM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer

In [None]:
# Request pre-aggregated data: one row per card with days passed,
# number of cheques and total discounted spend.
# NOTE(review): `select` is not defined or imported in the visible cells —
# confirm where it comes from before relying on Restart & Run All.
# NOTE(review): Max(latest date - purchase date) per card is the number of days
# since that card's EARLIEST purchase; confirm this is what `days_passed` should mean.
sql = """
SELECT
  card as client_id,
  Max((SELECT max(datetime) :: date FROM bonuscheques) - datetime :: date) as days_passed,
  count(*) as cnt,
  sum(summ_with_disc) as summ
FROM
  bonuscheques
GROUP BY
  card
"""
data = select(sql)

In [None]:
# Take a look at what the data looks like
data.head()

Unnamed: 0,client_id,days_passed,cnt,summ
0,ece474c7-8a08-4853-8e6c-bc2d7043df7d,328,25,11577
1,2000200229544,234,2,1657
2,564a56ba-d8ae-43e4-ab1a-702198cb0c4c,317,14,12111
3,2000200171775,236,2,1033
4,2000200229094,165,2,1235


Видим аномальный максимум в признаке cnt (Кто-то покупает почти каждый день?)

In [3]:
# Load the raw cheque-level export from an absolute path.
# Note: this rebinds `data`, which the SQL cell above also assigns.
data = pd.read_csv('/content/bonuscheques.csv')

In [4]:
# Names of the raw columns that ColumnSelector should pick up:
# the client key, the purchase timestamp and the amount paid.
rfm_params = dict(
    groupby_col='card',
    recentcy_col='datetime',
    monetary_col='summ_with_disc',
)

In [5]:
# Preview the loaded cheque-level frame
data

Unnamed: 0,datetime,shop,card,bonus_earned,bonus_spent,summ,summ_with_disc,doc_id
0,2021-07-13 12:56:09.000000,Аптека 2,2000200195023,51,0,3400,3400,15#2002741#65938#2_29
1,2021-07-30 10:42:00.000000,Аптека 2,2000200193494,57,0,747,747,15#2002972#65955#2_5
2,2021-10-11 12:55:23.000000,Аптека 2,2000200199106,92,253,3077,3077,15#2004060#66028#2_29
3,2021-10-14 14:48:56.000000,Аптека 2,2000200168768,1,0,54,54,15#2004107#66031#2_57
4,2021-10-20 11:09:39.000000,Аптека 2,2000200226314,101,0,1733,1733,15#2004192#66037#2_16
...,...,...,...,...,...,...,...,...
38481,2022-06-09 20:45:32.000000,Аптека 11,2000200244974,32,0,1341,1341,15#18001790#66269#18_120
38482,2022-06-09 20:55:32.000000,Аптека 11,2000200239246,38,0,1445,1445,15#18001790#66269#18_123
38483,2022-06-09 20:59:41.000000,Аптека 11,2000200238867,57,0,1912,1912,15#18001790#66269#18_126
38484,2022-06-09 21:07:59.000000,Аптека 11,2000200165811,2,0,278,278,15#18001790#66269#18_129


In [6]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pick the three RFM source columns and give them canonical names.

    The transformed frame contains exactly the columns ``id``, ``date`` and
    ``summ`` (in that order); ``date`` is reduced to calendar dates.

    Parameters
    ----------
    groupby_col : str
        Column identifying the client; renamed to ``id``.
    recentcy_col : str
        Column holding the purchase timestamp; renamed to ``date``.
    monetary_col : str
        Column holding the purchase amount; renamed to ``summ``.
    bins_quantiles : list | dict
        Stored on the instance but not used by this transformer.
    """

    def __init__(
        self,
        groupby_col:str='card',
        recentcy_col:str='datetime',
        monetary_col:str='summ_with_disc',
        bins_quantiles:list|dict=[0.6]
    ):
        # Parameters are stored untouched; the list default is never mutated here.
        self.groupby_col = groupby_col
        self.recentcy_col = recentcy_col
        self.monetary_col = monetary_col
        self.bins_quantiles = bins_quantiles

    def fit(self, X, y=None):
        """Validate that ``X`` is DataFrame-like; nothing is learned.

        ``y`` is accepted (and ignored) so the step also works when a pipeline
        is fitted with a target.

        Raises
        ------
        ValueError
            If ``X`` has no ``iloc`` attribute.
        """
        if not hasattr(X, "iloc"):
            raise ValueError(
                "ColumnSelector can only be applied to pandas dataframes in X argument"
            )
        return self

    def transform(self, X):
        """Return a new frame with columns ``id``, ``date`` and ``summ``.

        Raises
        ------
        ValueError
            If any of the configured columns is absent from ``X``.
        """
        wanted = [self.groupby_col, self.recentcy_col, self.monetary_col]
        missing = [col for col in wanted if col not in X.columns]
        if missing:
            # Fail with a clear message instead of an UnboundLocalError further down.
            raise ValueError(f"ColumnSelector: columns not found in X: {missing}")

        X_ = X[wanted].rename(
            columns={
                self.groupby_col: 'id',
                self.recentcy_col: 'date',
                self.monetary_col: 'summ',
            }
        )
        # Keep only the calendar date; the time of day is dropped.
        X_['date'] = pd.to_datetime(X_['date']).dt.date
        return X_

In [7]:
class WithinGroupMeanImputer(BaseEstimator, TransformerMixin):
    """Fill gaps in ``summ`` and ``date`` from values of the same ``id``.

    A missing ``summ`` takes the mean ``summ`` of its ``id``; a missing
    ``date`` takes the smallest ``date`` of its ``id``. Anything still missing
    afterwards (rows without an ``id``, or ids with no known value) is filled
    with the column-wide mean / minimum.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; the estimator itself is returned."""
        return self

    def transform(self, X):
        """Return an imputed copy of ``X``; the input frame is left untouched."""
        filled = X.copy()
        key, amount, day = 'id', 'summ', 'date'

        # Only rows that carry a key can borrow a value from their own group.
        has_key = filled[key].notna()

        # Amount: per-id mean first, then the overall mean for the leftovers.
        per_id_mean = filled.groupby(key)[amount].mean()
        amount_gap = filled[amount].isna() & has_key
        filled.loc[amount_gap, amount] = filled[key].map(per_id_mean)
        overall_mean = filled[amount].mean()
        filled[amount] = filled[amount].fillna(overall_mean)

        # Date: per-id minimum first, then the overall minimum for the leftovers.
        per_id_min = filled.groupby(key)[day].min()
        day_gap = filled[day].isna() & has_key
        filled.loc[day_gap, day] = filled[key].map(per_id_min)
        overall_min = filled[day].min()
        filled[day] = filled[day].fillna(overall_min)

        return filled

    def set_output(self, **kwargs):
        """Accept any output configuration, ignore it and return the estimator."""
        return self

In [129]:
class RFMClassifier(BaseEstimator, TransformerMixin):
    """Append ``R``, ``F`` and ``M`` scores to a per-client RFM frame.

    The input must have ``recency``, ``frequency`` and ``monetary`` columns.
    Each column is cut into three quantile bins labelled 3, 2, 1, where 1 goes
    to the highest values. Recency is negated before cutting, so the fewest
    days get 1.

    Parameters
    ----------
    bins_quantiles : list, optional
        Quantile cut points; 0 and 1 are added when absent. When omitted (or
        empty) ``[0, 0.33, 0.66, 1]`` is used.
    statistics : dict, optional
        Mapping that receives, per scored column, the quantile edges that were
        finally used. When omitted, a fresh dict is created per instance on
        first use (a shared ``dict()`` default would leak between instances).
    """

    _DEFAULT_QUANTILES = (0, 0.33, 0.66, 1)
    _LABELS = (3, 2, 1)

    def __init__(
        self,
        bins_quantiles = None,
        statistics = None
    ):
        self.bins_quantiles = bins_quantiles
        self.statistics = statistics

    @staticmethod
    def _with_outer_edges(bins):
        """Return a new sorted list of unique quantiles that includes 0 and 1."""
        return sorted(set(bins) | {0, 1})

    def fit(self, X, y=None):
        """Validate ``X`` and normalise ``bins_quantiles``; ``y`` is ignored.

        Normalisation is idempotent, so fitting repeatedly does not pile up
        duplicate 0/1 edges.

        Raises
        ------
        ValueError
            If ``X`` has no ``iloc`` attribute.
        """
        if not hasattr(X, "iloc"):
            raise ValueError(
                "RFMClassifier can only be applied to pandas dataframes in X argument"
            )

        if isinstance(self.bins_quantiles, list):
            self.bins_quantiles = self._with_outer_edges(self.bins_quantiles)

        return self

    def qcut(self, df):
        """Score one column (a Series) into the labels 3, 2, 1.

        When ``pd.qcut`` rejects the current edges with a ``ValueError``, the
        first inner edge is moved up by 0.11 and the second one is placed
        halfway between it and 1, until a cut succeeds. If none succeeds, or
        the edges do not describe exactly three bins, the column is split at
        its median into 1 (above) and 2 (otherwise). The edges in effect are
        recorded in ``self.statistics`` under the Series name.
        """
        if self.statistics is None:
            self.statistics = {}

        # Always work on a private copy: the edges are adjusted below and must
        # not leak into self.bins_quantiles or into other columns' statistics.
        q = self._with_outer_edges(self.bins_quantiles or self._DEFAULT_QUANTILES)

        if len(q) == len(self._LABELS) + 1:
            while q[2] < 1:
                try:
                    result = pd.qcut(df, q, labels=list(self._LABELS))
                except ValueError:
                    q[1] += 0.11
                    q[2] = q[1] + (1 - q[1]) / 2
                else:
                    self.statistics[df.name] = q
                    return result

        self.statistics[df.name] = q
        return np.where(df > df.median(), 1, 2)

    def transform(self, X):
        """Return a copy of ``X`` with the columns ``R``, ``F`` and ``M`` added.

        Raises
        ------
        ValueError
            If ``X`` has no ``iloc`` attribute.
        """
        if not hasattr(X, "iloc"):
            raise ValueError(
                "RFMClassifier can only be applied to pandas dataframes in X argument"
            )

        df = X.copy()
        # Negate recency so that fewer days since the last purchase score 1.
        df['R'] = self.qcut(-df['recency'])
        df['F'] = self.qcut(df['frequency'])
        df['M'] = self.qcut(df['monetary'])
        return df

    def inverse_transform(self):
        # Placeholder: not implemented, returns None.
        pass

In [8]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Collapse purchase rows into one row per client with raw RFM values.

    Expects the columns ``id``, ``date`` and ``summ`` and returns a frame
    indexed by ``id`` with the columns ``monetary`` (sum of ``summ``),
    ``frequency`` (count of non-missing ``summ``) and ``recency`` (days between
    the latest date in the data and the client's latest date).

    Parameters
    ----------
    bins_quantiles : list | dict
        Normalised in ``fit`` (0 and 1 are added to a list) but not used by
        ``transform``.
    """

    def __init__(
        self,
        bins_quantiles:list|dict=[0.6]
    ):
        # Stored untouched; fit rebinds the attribute instead of mutating the default list.
        self.bins_quantiles = bins_quantiles

    def fit(self, X, y=None):
        """Validate ``X`` and normalise ``bins_quantiles``; ``y`` is ignored.

        Raises
        ------
        ValueError
            If ``X`` has no ``iloc`` attribute.
        """
        if not hasattr(X, "iloc"):
            raise ValueError(
                "CustomTransformer can only be applied to pandas dataframes in X argument"
            )

        if isinstance(self.bins_quantiles, list):
            # Set union keeps repeated fits from accumulating duplicate 0/1 edges.
            self.bins_quantiles = sorted(set(self.bins_quantiles) | {0, 1})

        return self

    def transform(self, X):
        """Return the per-client frame; ``X`` itself is not modified.

        Raises
        ------
        ValueError
            If ``X`` has no ``iloc`` attribute.
        """
        if not hasattr(X, "iloc"):
            raise ValueError(
                "CustomTransformer can only be applied to pandas dataframes in X argument"
            )

        # Reference point for recency: the most recent date in the whole data set.
        max_date = pd.to_datetime(X['date'].max())

        per_client = X.groupby('id').agg(
            monetary=('summ', 'sum'),
            frequency=('summ', 'count'),
            last_purchase=('date', 'max'),
        )
        per_client['recency'] = (
            max_date - pd.to_datetime(per_client['last_purchase'])
        ).dt.days

        return per_client.drop(columns=['last_purchase']).dropna()

    def inverse_transform(self):
        # Placeholder: not implemented, returns None.
        pass

In [130]:
# Assemble the RFM pipeline: select/rename the source columns, impute gaps,
# aggregate to one row per client, then attach the R/F/M scores.
pipe_dt = Pipeline(
    [
        ("col_selector_and_renamer", ColumnSelector(**rfm_params)),
        ("col_imputer", WithinGroupMeanImputer()),
        ("grouping_transformer", CustomTransformer()),
        ("RFMClassifier", RFMClassifier())
    ]
)

In [131]:
# Run all pipeline steps on the raw cheques and show the scored per-client frame
pipe_dt.fit_transform(data)

Unnamed: 0_level_0,monetary,frequency,recency,R,F,M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
001848d0-e56a-412e-be0a-423dfa0e564f,4391,7,26,1,1,1
001cb0a0-e62e-47a9-995e-599bdc7f8750,5887,12,27,1,1,1
00572591-c154-4b9f-83e4-3ce5b00bf9c8,557,1,244,3,3,3
00625996-f49f-4863-8132-40d08e99b2da,4755,4,56,2,2,1
00703805-f16d-4b83-b7ad-affb2736aa0d,1555,1,15,1,3,2
...,...,...,...,...,...,...
ff9695c5-6e8c-4446-adc6-944b2731b8f7,559,1,172,3,3,3
ffbf9d44-679e-412a-9927-ac6ce80c49a2,539,1,286,3,3,3
ffd22196-662c-4266-b3be-024a42cad78b,2052,3,4,1,2,2
ffd93478-ad6d-494f-a6ab-f7f0c8058488,2294,7,99,2,1,2


In [132]:
# Inspect the pipeline parameters; RFMClassifier__statistics holds the quantile
# edges recorded for each scored column
pipe_dt.get_params()

{'memory': None,
 'steps': [('col_selector_and_renamer', ColumnSelector()),
  ('col_imputer', WithinGroupMeanImputer()),
  ('grouping_transformer', CustomTransformer(bins_quantiles=[0, 0.6, 1])),
  ('RFMClassifier', RFMClassifier())],
 'verbose': False,
 'col_selector_and_renamer': ColumnSelector(),
 'col_imputer': WithinGroupMeanImputer(),
 'grouping_transformer': CustomTransformer(bins_quantiles=[0, 0.6, 1]),
 'RFMClassifier': RFMClassifier(),
 'col_selector_and_renamer__bins_quantiles': [0.6],
 'col_selector_and_renamer__groupby_col': 'card',
 'col_selector_and_renamer__monetary_col': 'summ_with_disc',
 'col_selector_and_renamer__recentcy_col': 'datetime',
 'grouping_transformer__bins_quantiles': [0, 0.6, 1],
 'RFMClassifier__bins_quantiles': None,
 'RFMClassifier__statistics': {'recency': [0, 0.33, 0.66, 1],
  'frequency': [0, 0.44, 0.72, 1],
  'monetary': [0, 0.33, 0.66, 1]}}