<a href="https://colab.research.google.com/github/aleks-haksly/Simulative/blob/main/Other/Analysis/RFM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [93]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer

In [None]:
# Query pre-aggregated data: one row per card with day distance, cheque count and total amount.
# NOTE(review): days_passed takes Max(...) of the per-cheque day difference, i.e. the distance
# from the card's earliest cheque to the latest date in the table; if recency (days since the
# last purchase) is intended, Min would be needed — confirm.
sql = """
SELECT
  card as client_id,
  Max((SELECT max(datetime) :: date FROM bonuscheques) - datetime :: date) as days_passed,
  count(*) as cnt,
  sum(summ_with_disc) as summ
FROM
  bonuscheques
GROUP BY
  card
"""
# NOTE(review): `select` is not defined or imported in the cells visible here — presumably a
# database helper that returns a DataFrame; confirm before a fresh-kernel run.
data = select(sql)

In [None]:
# Take a look at what the data looks like
data.head()

Unnamed: 0,client_id,days_passed,cnt,summ
0,ece474c7-8a08-4853-8e6c-bc2d7043df7d,328,25,11577
1,2000200229544,234,2,1657
2,564a56ba-d8ae-43e4-ab1a-702198cb0c4c,317,14,12111
3,2000200171775,236,2,1033
4,2000200229094,165,2,1235


Видим аномальный максимум в признаке `cnt` (кто-то покупает почти каждый день?).

In [23]:
# Load the raw cheque rows from a CSV at an absolute path; this rebinds `data`,
# which an earlier cell assigned from the SQL query result.
data = pd.read_csv('/content/bonuscheques.csv')

In [62]:
# Column-name mapping unpacked as keyword arguments into ColumnSelector when the pipeline is built.
rfm_params = {'groupby_col': 'card', 'recentcy_col': 'datetime', 'monetary_col': 'summ_with_disc'}

In [None]:
# Display the raw frame loaded from the CSV
data

Unnamed: 0,datetime,shop,card,bonus_earned,bonus_spent,summ,summ_with_disc,doc_id,date
0,2021-07-13 12:56:09.000000,Аптека 2,2000200195023,51,0,3400,3400,15#2002741#65938#2_29,2021-07-13
1,2021-07-30 10:42:00.000000,Аптека 2,2000200193494,57,0,747,747,15#2002972#65955#2_5,2021-07-30
2,2021-10-11 12:55:23.000000,Аптека 2,2000200199106,92,253,3077,3077,15#2004060#66028#2_29,2021-10-11
3,2021-10-14 14:48:56.000000,Аптека 2,2000200168768,1,0,54,54,15#2004107#66031#2_57,2021-10-14
4,2021-10-20 11:09:39.000000,Аптека 2,2000200226314,101,0,1733,1733,15#2004192#66037#2_16,2021-10-20
...,...,...,...,...,...,...,...,...,...
38481,2022-06-09 20:45:32.000000,Аптека 11,2000200244974,32,0,1341,1341,15#18001790#66269#18_120,2022-06-09
38482,2022-06-09 20:55:32.000000,Аптека 11,2000200239246,38,0,1445,1445,15#18001790#66269#18_123,2022-06-09
38483,2022-06-09 20:59:41.000000,Аптека 11,2000200238867,57,0,1912,1912,15#18001790#66269#18_126,2022-06-09
38484,2022-06-09 21:07:59.000000,Аптека 11,2000200165811,2,0,278,278,15#18001790#66269#18_129,2022-06-09


In [254]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pick the three RFM source columns and give them fixed names.

    ``transform`` returns a new frame holding only the client, date and amount
    columns, renamed to ``'id'``, ``'date'`` and ``'summ'``. The date column is
    parsed with ``pd.to_datetime`` and reduced to calendar dates.

    Parameters
    ----------
    groupby_col : str, default='card'
        Name of the client identifier column; renamed to ``'id'``.
    recentcy_col : str, default='datetime'
        Name of the purchase timestamp column; renamed to ``'date'``.
    monetary_col : str, default='summ_with_disc'
        Name of the purchase amount column; renamed to ``'summ'``.
    bins_quantiles : list | dict, default=[0.6]
        Stored on the instance but not used by this transformer.
    """

    def __init__(
        self,
        groupby_col:str='card',
        recentcy_col:str='datetime',
        monetary_col:str='summ_with_disc',
        bins_quantiles:list|dict=[0.6]
    ):
        # Parameters are stored as given; the list default is never mutated here.
        self.groupby_col = groupby_col
        self.recentcy_col = recentcy_col
        self.monetary_col = monetary_col
        self.bins_quantiles = bins_quantiles

    @staticmethod
    def _require_dataframe(X):
        """Raise ValueError unless X exposes the pandas DataFrame attributes used here."""
        if not (hasattr(X, "iloc") and hasattr(X, "columns")):
            raise ValueError(
                "ColumnSelector can only be applied to pandas dataframes in X argument"
            )

    def fit(self, X, y=None):
        """Validate the input type; nothing is learned.

        Parameters
        ----------
        X : pandas.DataFrame
        y : ignored
            Accepted so the transformer can be called as ``fit(X, y)``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not a pandas DataFrame.
        """
        self._require_dataframe(X)
        return self

    def transform(self, X):
        """Return a copy of the three configured columns under the names id/date/summ.

        Raises
        ------
        ValueError
            If ``X`` is not a pandas DataFrame.
        KeyError
            If any of the configured columns is absent from ``X``. Without this
            check the method would fail later with an unrelated unbound-variable error.
        """
        self._require_dataframe(X)

        wanted = [self.groupby_col, self.recentcy_col, self.monetary_col]
        absent = [name for name in wanted if name not in X.columns]
        if absent:
            raise KeyError(f"ColumnSelector: columns missing from X: {absent}")

        # Indexing with a list and renaming both produce new objects, so X is left untouched.
        selected = X[wanted].rename(
            columns={
                self.recentcy_col: 'date',
                self.monetary_col: 'summ',
                self.groupby_col: 'id',
            }
        )
        # Drop the time-of-day part: only the calendar date is kept.
        selected['date'] = pd.to_datetime(selected['date']).dt.date
        return selected

In [255]:
class WithinGroupMeanImputer(BaseEstimator, TransformerMixin):
    """Fill missing amounts and dates from per-group statistics.

    For rows that have a group key, a missing amount is replaced by the mean
    amount of its group and a missing date by the earliest date of its group.
    Values still missing afterwards (no group key, or a group with no observed
    values) fall back to the column-wide mean / minimum computed after the
    group-level fill. Despite the class name, dates are filled with the
    minimum, not the mean.

    Parameters
    ----------
    group_col : str, default='id'
        Column holding the group key.
    value_col : str, default='summ'
        Column imputed with the mean.
    date_col : str, default='date'
        Column imputed with the minimum.
    """

    def __init__(self, group_col:str='id', value_col:str='summ', date_col:str='date'):
        self.group_col = group_col
        self.value_col = value_col
        self.date_col = date_col

    def fit(self, X, y=None):
        """No-op fit; the statistics are computed from the frame passed to ``transform``."""
        return self

    def _fill_column(self, frame, column, statistic):
        """Fill ``column`` of ``frame`` in place: group-level ``statistic`` first, then column-wide."""
        keys = frame[self.group_col]
        fillable = frame[column].isna() & keys.notna()

        # Per-group statistic, mapped back onto every row through its group key.
        per_group = getattr(frame.groupby(self.group_col)[column], statistic)()
        frame.loc[fillable, column] = keys.map(per_group)

        # The fallback is computed after the group-level fill, so it includes the filled values.
        fallback = getattr(frame[column], statistic)()
        frame[column] = frame[column].fillna(fallback)

    def transform(self, X):
        """Return a copy of ``X`` with the amount and date columns imputed.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the configured group, value and date columns.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X``; the input frame is not modified.
        """
        X_ = X.copy()
        self._fill_column(X_, self.value_col, 'mean')
        self._fill_column(X_, self.date_col, 'min')
        return X_

    def set_output(self, **kwargs):
        """Accept and ignore any output-configuration keywords; returns ``self`` unchanged."""
        return self

In [240]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Aggregate raw purchase rows into one row per client.

    ``transform`` groups by the client column and returns, per client, the
    latest purchase date (``date_max``), the number of purchases
    (``summ_count``) and the total amount (``summ_sum``).

    Parameters
    ----------
    groupby_col : str, default='card'
        Name of the client identifier column; becomes the index of the result.
    recentcy_col : str, default='datetime'
        Name of the purchase timestamp column.
    monetary_col : str, default='summ_with_disc'
        Name of the purchase amount column.
    bins_quantiles : list | dict, default=[0.6]
        Inner quantile cut points. Left unchanged by ``fit``.

    Attributes
    ----------
    bins_quantiles_ : list | dict
        Set by ``fit``. For a list input: the sorted, de-duplicated cut points
        with 0 and 1 added. For any other input: the value as given.
    """

    def __init__(
        self,
        groupby_col:str='card',
        recentcy_col:str='datetime',
        monetary_col:str='summ_with_disc',
        bins_quantiles:list|dict=[0.6]
    ):
        # Parameters are stored as given; the list default is never mutated.
        self.groupby_col = groupby_col
        self.recentcy_col = recentcy_col
        self.monetary_col = monetary_col
        self.bins_quantiles = bins_quantiles

    @staticmethod
    def _require_dataframe(X):
        """Raise ValueError unless X looks like a pandas DataFrame."""
        if not hasattr(X, "iloc"):
            raise ValueError(
                "CustomTransformer can only be applied to pandas dataframes in X argument"
            )

    def fit(self, X, y=None):
        """Validate the input and derive the full list of quantile edges.

        The derived edges go to ``bins_quantiles_`` rather than overwriting the
        ``bins_quantiles`` parameter, so fitting repeatedly gives the same
        result instead of appending 0 and 1 again on every call.

        Raises
        ------
        ValueError
            If ``X`` is not a pandas DataFrame.
        """
        self._require_dataframe(X)

        if isinstance(self.bins_quantiles, list):
            self.bins_quantiles_ = sorted({0, 1, *self.bins_quantiles})
        else:
            self.bins_quantiles_ = self.bins_quantiles

        return self

    def transform(self, X):
        """Return per-client aggregates indexed by the client column.

        Returns
        -------
        pandas.DataFrame
            Columns ``date_max``, ``summ_count`` and ``summ_sum``.

        Raises
        ------
        ValueError
            If ``X`` is not a pandas DataFrame.
        """
        self._require_dataframe(X)

        source_cols = [self.groupby_col, self.recentcy_col, self.monetary_col]
        # The configured names are read from the instance, not from notebook globals.
        df = X[source_cols].rename(
            columns={self.recentcy_col: 'date', self.monetary_col: 'summ'}
        )
        df['date'] = pd.to_datetime(df['date']).dt.date

        df = df.groupby(self.groupby_col).agg({'date': 'max', 'summ': ['count', 'sum']})
        # Flatten the (column, aggregation) MultiIndex into names like 'summ_sum'.
        df.columns = ['_'.join(col).strip() for col in df.columns.values]

        return df

    def inverse_transform(self):
        """Placeholder: not implemented, returns None."""
        pass

In [252]:
# Two-step preprocessing pipeline: select/rename the RFM columns, then impute missing values.
pipe_dt = Pipeline(
    [
        ("col_selector_and_renamer", ColumnSelector(**rfm_params)),
        ("col_imputer", WithinGroupMeanImputer())
    ]
)

In [253]:
# Run the selection + imputation pipeline on the raw cheque data
pipe_dt.fit_transform(data)

Unnamed: 0,id,date,summ
0,2000200195023,2021-07-13,3400
1,2000200193494,2021-07-30,747
2,2000200199106,2021-10-11,3077
3,2000200168768,2021-10-14,54
4,2000200226314,2021-10-20,1733
...,...,...,...
38481,2000200244974,2022-06-09,1341
38482,2000200239246,2022-06-09,1445
38483,2000200238867,2022-06-09,1912
38484,2000200165811,2022-06-09,278


In [226]:
# Scratch check: bin the values 1..10 by quartile edges; labels=False returns integer bin codes
pd.qcut(range(1, 11), [0, 0.25, 0.5, 0.75, 1], labels=False)

array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3])

In [70]:
# Instantiate the aggregating transformer with its default column names
transformer = CustomTransformer()

In [71]:
# Inspect the constructor parameters stored on the transformer
transformer.get_params()

{'bins_quantiles': [0.6],
 'groupby_col': 'card',
 'monetary_col': 'summ_with_disc',
 'recentcy_col': 'datetime'}

In [73]:
# Aggregate the raw cheque rows into one row per card
transformer.fit_transform(data)

Unnamed: 0_level_0,date_max,summ_count,summ_sum
card,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
001848d0-e56a-412e-be0a-423dfa0e564f,2022-05-14,7,4391
001cb0a0-e62e-47a9-995e-599bdc7f8750,2022-05-13,12,5887
00572591-c154-4b9f-83e4-3ce5b00bf9c8,2021-10-08,1,557
00625996-f49f-4863-8132-40d08e99b2da,2022-04-14,4,4755
00703805-f16d-4b83-b7ad-affb2736aa0d,2022-05-25,1,1555
...,...,...,...
ff9695c5-6e8c-4446-adc6-944b2731b8f7,2021-12-19,1,559
ffbf9d44-679e-412a-9927-ac6ce80c49a2,2021-08-27,1,539
ffd22196-662c-4266-b3be-024a42cad78b,2022-06-05,3,2052
ffd93478-ad6d-494f-a6ab-f7f0c8058488,2022-03-02,7,2294


In [181]:
import pandas as pd
import numpy as np

# Build the synthetic columns: a random group label and a random integer in [0, 100].
# NOTE(review): no random seed is set in this cell, so the sample differs on every run.
groups = np.random.choice(['A', 'B', 'C'], size=100)
numbers = np.random.randint(0, 101, size=100)

# Blank out roughly 30% of the numbers (mask threshold is 0.3); cast to float first so NaN can be stored
mask = np.random.rand(100) < 0.3
numbers = numbers.astype(float)
numbers[mask] = np.nan
dates = pd.to_datetime(np.random.choice(pd.date_range(start='2023-01-01', end='2024-01-01'), size=100))

# Blank out roughly 30% of the dates using a fresh, independently drawn mask
mask = np.random.rand(100) < 0.3
dates = pd.Series(dates) # Convert DatetimeIndex to Series for mutability
dates[mask] = pd.NaT     # Now you can modify the Series
dates = pd.DatetimeIndex(dates) # Convert back to DatetimeIndex if needed


# Assemble the DataFrame; 'Test' is the 'Number' array in reverse order
df = pd.DataFrame({'Group': groups, 'Number': numbers, 'Test': numbers[::-1], 'dates': dates})

print(df)

   Group  Number  Test      dates
0      B     6.0   NaN        NaT
1      B     NaN   0.0 2023-12-31
2      A    31.0  94.0 2023-09-25
3      B     NaN  38.0 2023-06-08
4      A    15.0  90.0 2023-10-28
..   ...     ...   ...        ...
95     A    90.0  15.0 2023-09-26
96     A    38.0   NaN 2023-03-24
97     B    94.0  31.0 2023-02-25
98     A     0.0   NaN 2023-07-28
99     B     NaN   6.0 2023-03-13

[100 rows x 4 columns]


In [90]:
# NaN-aware mean: np.mean would propagate NaN here, np.nanmean skips the missing values
np.nanmean([np.nan, np.nan, 3])

3.0

In [167]:
# Apply ColumnSelector with its default column names directly, without calling fit first
ColumnSelector().transform(data)

Unnamed: 0,card,date,summ
0,2000200195023,2021-07-13,3400
1,2000200193494,2021-07-30,747
2,2000200199106,2021-10-11,3077
3,2000200168768,2021-10-14,54
4,2000200226314,2021-10-20,1733
...,...,...,...
38481,2000200244974,2022-06-09,1341
38482,2000200239246,2022-06-09,1445
38483,2000200238867,2022-06-09,1912
38484,2000200165811,2022-06-09,278


Index(['id', 'summ'], dtype='object')
Index(['id', 'date'], dtype='object')


Unnamed: 0,summ,id,date
0,3400,2000200195023,2021-07-13
1,747,2000200193494,2021-07-30
2,3077,2000200199106,2021-10-11
3,54,2000200168768,2021-10-14
4,1733,2000200226314,2021-10-20
...,...,...,...
38481,1341,2000200244974,2022-06-09
38482,1445,2000200239246,2022-06-09
38483,1912,2000200238867,2022-06-09
38484,278,2000200165811,2022-06-09


In [117]:
# Names of the card and timestamp columns.
# NOTE(review): this variable is not referenced by any cell visible here — confirm it is still needed.
cols_to_pass = ['card', 'datetime', ]

In [176]:
import pandas as pd
import numpy as np

# Build the synthetic columns: a random group label and a random date from the given range.
# NOTE(review): no random seed is set in this cell, so the sample differs on every run.
groups = np.random.choice(['A', 'B', 'C'], size=100)
dates = pd.to_datetime(np.random.choice(pd.date_range(start='2023-01-01', end='2024-01-01'), size=100))

# Blank out roughly 30% of the dates (mask threshold is 0.3)
mask = np.random.rand(100) < 0.3
dates = pd.Series(dates) # Convert DatetimeIndex to Series for mutability
dates[mask] = pd.NaT     # Now you can modify the Series
dates = pd.DatetimeIndex(dates) # Convert back to DatetimeIndex if needed

# Assemble the DataFrame
df = pd.DataFrame({'Group': groups, 'Date': dates})

print(df)

   Group       Date
0      A 2023-09-01
1      C 2023-09-21
2      B 2023-08-13
3      A 2023-02-18
4      A 2023-07-09
..   ...        ...
95     A 2023-07-31
96     B 2023-02-11
97     C 2023-05-03
98     C 2023-05-16
99     A 2023-05-28

[100 rows x 2 columns]


In [162]:
# NOTE(review): `WithinGroupMeanImputer_dates` is not defined in any cell visible here —
# presumably an earlier variant of WithinGroupMeanImputer; verify before re-running on a fresh kernel.
WithinGroupMeanImputer_dates().fit_transform(df)

Unnamed: 0,Group,Date
0,A,2023-10-12
1,C,2023-11-22
2,C,2023-06-17
3,A,2023-02-23
4,A,2023-01-17
...,...,...
95,B,2023-12-06
96,B,2023-01-23
97,B,2023-01-23
98,B,2023-05-26


In [161]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
Group,0
Date,35
