In [147]:
import json

import pandas as pd
import datetime

from dataclasses import dataclass

In [148]:
@dataclass
class DateParser:
    date_format: str = "%Y-%m-%d"

    def __call__(self, x):
        return datetime.datetime.strptime(x, self.date_format)

In [149]:
@dataclass
class CsvFileParser:
    file_path: str
    date_columns: list = None

    def parse(self):
        return pd.read_csv(self.file_path, parse_dates=self.date_columns, date_parser=DateParser())

In [150]:
class JsonParser:
    def __init__(self, file_path):
        self.file_path = file_path

    def parse(self):
        data = []
        with open(self.file_path, 'r') as file:
            for line_number, line in enumerate(file, start=1):
                try:
                    # Try to parse the line as JSON
                    json_data = json.loads(line)
                    data.append(json_data)
                except json.JSONDecodeError as e:
                    # Register the error and continue reading the file
                    print(f"Error parsing JSON on line {line_number}: {e.msg}")
                except Exception as e:
                    # Capture any other exception and break the loop
                    print(f"Unexpected error on line {line_number}: {str(e)}")
                    break  # break if there is an error an unexpected error
        return data

In [151]:
@dataclass
class DataProcessor:
    data: list
    date_columns: list = None

    def process(self):
        data = []
        for element in self.data:
            day = element['day']
            event_data = element['event_data']
            position = event_data['position']
            value_prop = event_data['value_prop']
            user_id = element['user_id']
            data.append([day, position, value_prop, user_id])
        
        df = pd.DataFrame(data, columns=['day', 'position', 'value_prop', 'user_id'])
        
        for column in self.date_columns:
            df[column] = pd.to_datetime(df[column])

        return df

In [152]:
json_parser = JsonParser('./data/prints.json')
prints_data = json_parser.parse()

data_processor = DataProcessor(prints_data, date_columns=['day'])
prints_df = data_processor.process()

prints_df["showed"] = 1

print(prints_df.head())

         day  position          value_prop  user_id  showed
0 2020-11-01         0  cellphone_recharge    98702       1
1 2020-11-01         1             prepaid    98702       1
2 2020-11-01         0             prepaid    63252       1
3 2020-11-01         0  cellphone_recharge    24728       1
4 2020-11-01         1          link_cobro    24728       1


In [153]:
json_parser = JsonParser('./data/taps.json')
taps_data = json_parser.parse()

taps_df = DataProcessor(taps_data, date_columns=['day']
                        ).process()

taps_df["clicked"] = 1

print(taps_df.head())

         day  position          value_prop  user_id  clicked
0 2020-11-01         0  cellphone_recharge    98702        1
1 2020-11-01         2               point     3708        1
2 2020-11-01         3          send_money     3708        1
3 2020-11-01         0           transport    93963        1
4 2020-11-01         1  cellphone_recharge    93963        1


In [154]:
csv_parser = CsvFileParser('./data/pays.csv', date_columns=['pay_date'])
pays_df = csv_parser.parse()

print(pays_df.head())

  return pd.read_csv(self.file_path, parse_dates=self.date_columns, date_parser=DateParser())


    pay_date  total  user_id          value_prop
0 2020-11-01   7.04    35994          link_cobro
1 2020-11-01  37.36    79066  cellphone_recharge
2 2020-11-01  15.84    19321  cellphone_recharge
3 2020-11-01  26.26    19321          send_money
4 2020-11-01  35.35    38438          send_money


In [155]:
# Merge the dataframes
class DataMerger:
    def __init__(self, prints_df, taps_df, pays_df):
        self.prints_df = prints_df
        self.taps_df = taps_df
        self.pays_df = pays_df

    def merge(self):
        # Merge prints and taps dataframes
        merged_df = pd.merge(self.prints_df, self.taps_df, on=[
            'day', 'position', 'value_prop', 'user_id'], how='outer')
        
        # Merge the previous merged dataframe with the pays dataframe
        merged_df = pd.merge(merged_df, self.pays_df, 
                            left_on=['day', 'user_id', 'value_prop'],
                            right_on=['pay_date', 'user_id', 'value_prop'],
            how='outer')
        

        return merged_df

In [156]:




class BusinessMetrics:
    def __init__(self, merged_df):
        self.merged_df = merged_df

    def calculate_metrics(self):

        # Fill NAN clicks with 0
        self.merged_df['clicked'] = self.merged_df['clicked'].fillna(0)

        # crear una columna que indique si "day" es NaN
        self.merged_df['is_day_nan'] = self.merged_df['day'].isna()
        # fill NAN with pay_date values
        self.merged_df['day'] = self.merged_df['day'].fillna(self.merged_df['pay_date'])
        # sort values by day
        self.merged_df.sort_values(by='day', inplace=True, ascending=True)
        # set day as index
        self.merged_df.set_index('day', inplace=True)
        # llenar con ceros los valores faltantes
        self.merged_df['clicked'] = self.merged_df['clicked'].fillna(0)
        self.merged_df['showed'] = self.merged_df['showed'].fillna(0)
        self.merged_df['pay_date'] = self.merged_df['pay_date'].fillna(0)
        self.merged_df['total'] = self.merged_df['total'].fillna(0)
        # agrupar por usuario y value_prop
        grouped = self.merged_df.groupby(['user_id', 'value_prop'])
        # crear un dataframe con los resultados
        result = pd.DataFrame()
        # contar las veces que se muestra un value_prop en las 3 semanas previas
        result['3_week_showed_count'] = grouped['showed'].rolling(window="21D").sum()
        # contar las veces que se clickea un value_prop en las 3 semanas previas
        result['3_week_clicked_count'] = grouped['clicked'].rolling(window="21D").sum()
        # contar las veces que se paga por un value_prop en las 3 semanas previas
        result['3_week_pay_count'] = grouped['pay_date'].rolling(window="21D").count()
        # contar los importes acumulados de un value_prop en las 3 semanas previas
        result['3_week_pay_amount'] = grouped['total'].rolling(window="21D").sum()
        # reset index
        self.merged_df.reset_index(inplace=True)
        
        return result


In [157]:
merged_df = DataMerger(prints_df, taps_df, pays_df).merge()

business_metrics = BusinessMetrics(merged_df)
calculated_df = business_metrics.calculate_metrics()

In [158]:
calculated_df.to_csv('./data/calculated_data.csv', index=True)

Unnamed: 0,level_0,index,user_id,value_prop,day,3_week_showed_count,3_week_clicked_count,3_week_pay_count,3_week_pay_amount
58,58,58,4,link_cobro,2020-11-20,2.0,1.0,2.0,0.00
59,59,59,4,link_cobro,2020-11-30,2.0,1.0,2.0,0.00
154,154,154,12,prepaid,2020-11-17,1.0,0.0,2.0,10.16
155,155,155,12,prepaid,2020-11-28,1.0,0.0,2.0,10.16
350,350,350,28,credits_consumer,2020-11-25,3.0,0.0,3.0,0.00
...,...,...,...,...,...,...,...,...,...
1245950,1245950,1245950,99941,credits_consumer,2020-11-23,1.0,0.0,1.0,0.00
1246117,1246117,1246117,99952,cellphone_recharge,2020-11-19,1.0,0.0,2.0,51.74
1246118,1246118,1246118,99952,cellphone_recharge,2020-11-28,1.0,0.0,2.0,51.74
1246475,1246475,1246475,99984,cellphone_recharge,2020-11-09,1.0,0.0,2.0,62.44
