In [26]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
"""
Estimate audience overlap across multiple demographic characteristics.

Steps:
1. Load and merge demographic and visitor data.
2. Calculate lower bounds for audience estimates.
3. Apply overlap estimation algorithm for each demographic characteristic.
4. Save results to a CSV file.
"""

'\nEstimate audience overlap across multiple demographic characteristics.\n\nSteps:\n1. Load and merge demographic and visitor data.\n2. Calculate lower bounds for audience estimates.\n3. Apply overlap estimation algorithm for each demographic characteristic.\n4. Save results to a CSV file.\n'

In [27]:
# Step 1: Data preprocessing -- load the two input datasets.
# Read the demographic and visitor CSV files, then join them on the shared
# 'tag' and 'date' columns. The join is an inner join (the pandas default),
# so rows present in only one of the files are not carried into merged_data.
demo_data = pd.read_csv("./input/demo.csv")
visitors_data = pd.read_csv('./input/all_visitors.csv')
merged_data = demo_data.merge(visitors_data, how='inner', on=['tag', 'date'])
print(merged_data)

               tag        date  \
0             AIF/  2022-12-01   
1             AIF/  2022-12-02   
2             AIF/  2022-12-03   
3             AIF/  2022-12-04   
4             AIF/  2022-12-05   
..             ...         ...   
88  kommersant.ru/  2022-12-27   
89  kommersant.ru/  2022-12-28   
90  kommersant.ru/  2022-12-29   
91  kommersant.ru/  2022-12-30   
92  kommersant.ru/  2022-12-31   

                                                 demo  \
0   {'Men above 45': 270578.0, 'Men 25-34': 248189...   
1   {'Men above 45': 242880.0, 'Men 25-34': 218607...   
2   {'Men above 45': 215736.0, 'Men 25-34': 195687...   
3   {'Men above 45': 205517.0, 'Men 25-34': 181038...   
4   {'Men above 45': 215976.0, 'Men 25-34': 207819...   
..                                                ...   
88  {'Men 25-34': 327395.0, 'Men 18-24': 273582.0,...   
89  {'Men 25-34': 318169.0, 'Men 18-24': 265664.0,...   
90  {'Men 25-34': 337351.0, 'Men 18-24': 299877.0,...   
91  {'Men 25-34': 315

In [31]:
# Step 2: Calculate Lower Bounds
"""
Calculate the lower bound of audience size for each row in the merged data.
"""
def calculate_lower_bound(row):
    """
    Calculate the lower bound of audience size for a given row.

    The 'demo' cell is parsed as a Python dict literal (for example
    "{'Men 25-34': 248189.0, ...}"); its values are summed and compared with
    the 'Visitors' figure.

    Args:
        row (pandas.Series): A row of data containing 'demo' and 'Visitors' values.

    Returns:
        float: The larger of the summed 'demo' values and 'Visitors'
        (an int is possible when every 'demo' value is an int). ``np.nan`` is
        returned when 'demo' cannot be parsed into a dict of numbers or when
        'Visitors' cannot be converted to float.
    """
    try:
        # literal_eval only evaluates Python literals, never arbitrary expressions.
        demo_dict = ast.literal_eval(row['demo'])

        # Sum every segment in the 'demo' dictionary.
        demo_value = sum(demo_dict.values())

        visitors_value = float(row['Visitors'])
    except (ValueError, TypeError, SyntaxError, AttributeError):
        # ValueError / SyntaxError: 'demo' is not a valid literal, or 'Visitors' is not numeric.
        # TypeError: non-numeric dict values or an unsupported 'Visitors' type.
        # AttributeError: 'demo' parsed to a literal that is not a dict (list, number, ...).
        return np.nan

    # Note: when 'Visitors' is NaN the demo sum is returned, because every
    # comparison against NaN is false and max() then keeps its first argument.
    return max(demo_value, visitors_value)

# Evaluate calculate_lower_bound once per row (axis=1) and store the outcome
# in a new 'lower_bound' column of merged_data.
row_lower_bounds = merged_data.apply(calculate_lower_bound, axis=1)
merged_data['lower_bound'] = row_lower_bounds
print(merged_data)

               tag        date  \
0             AIF/  2022-12-01   
1             AIF/  2022-12-02   
2             AIF/  2022-12-03   
3             AIF/  2022-12-04   
4             AIF/  2022-12-05   
..             ...         ...   
88  kommersant.ru/  2022-12-27   
89  kommersant.ru/  2022-12-28   
90  kommersant.ru/  2022-12-29   
91  kommersant.ru/  2022-12-30   
92  kommersant.ru/  2022-12-31   

                                                 demo  \
0   {'Men above 45': 270578.0, 'Men 25-34': 248189...   
1   {'Men above 45': 242880.0, 'Men 25-34': 218607...   
2   {'Men above 45': 215736.0, 'Men 25-34': 195687...   
3   {'Men above 45': 205517.0, 'Men 25-34': 181038...   
4   {'Men above 45': 215976.0, 'Men 25-34': 207819...   
..                                                ...   
88  {'Men 25-34': 327395.0, 'Men 18-24': 273582.0,...   
89  {'Men 25-34': 318169.0, 'Men 18-24': 265664.0,...   
90  {'Men 25-34': 337351.0, 'Men 18-24': 299877.0,...   
91  {'Men 25-34': 315

In [33]:
# Step 3: Algorithm to Estimate Audience Overlap
def estimate_overlap(df, demographic_column):
    """
    Estimate audience overlap for a given demographic characteristic.

    For every (tag, characteristic value) pair the maximum 'lower_bound' is
    taken; those maxima are averaged per characteristic value, and each row's
    bound is raised to that average whenever the average is larger.

    Side effect: writes (or overwrites) the 'refined_lower_bound' column of
    ``df`` in place. Nothing is returned.

    Args:
        df (pandas.DataFrame): The merged data with calculated lower bounds;
            must contain 'tag', 'lower_bound' and ``demographic_column``.
        demographic_column (str): The name of the demographic column to analyze.
    """
    pairs = df.groupby(['tag', demographic_column])[['lower_bound']].max().reset_index()

    # Estimate joint distribution: mean of the per-tag maxima for each characteristic value.
    joint_distribution = pairs.groupby(demographic_column)['lower_bound'].mean()

    lower_bound = df['lower_bound']

    # Vectorised lookup of each row's group estimate. groupby drops missing
    # keys, so rows whose characteristic is NaN have no estimate; they fall
    # back to their own lower bound instead of raising KeyError.
    group_estimate = df[demographic_column].map(joint_distribution).fillna(lower_bound)

    # Use the estimated joint distribution to refine the lower bound.
    # np.maximum propagates NaN, so rows without a lower bound stay NaN.
    df['refined_lower_bound'] = np.maximum(lower_bound, group_estimate)

In [34]:
# Apply the algorithm for each demographic characteristic.
# estimate_overlap() writes its result into merged_data['refined_lower_bound']
# and overwrites that column on every call, so each per-characteristic result
# is captured here; otherwise only the last characteristic in the list would
# influence the final column.
demographic_characteristics = ['browsers', 'resolutions', 'oses', 'languages']
refined_by_characteristic = {}
for characteristic in demographic_characteristics:
    estimate_overlap(merged_data, characteristic)
    refined_by_characteristic[characteristic] = merged_data['refined_lower_bound'].copy()

# Each per-characteristic result is already >= 'lower_bound', so the row-wise
# maximum keeps the largest refinement found across all characteristics.
merged_data['refined_lower_bound'] = pd.DataFrame(refined_by_characteristic).max(axis=1)

In [35]:
# Show only the identifying columns and the computed bounds.
report_columns = ['tag', 'date', 'demo', 'Visitors', 'lower_bound', 'refined_lower_bound']
print(merged_data[report_columns])

               tag        date  \
0             AIF/  2022-12-01   
1             AIF/  2022-12-02   
2             AIF/  2022-12-03   
3             AIF/  2022-12-04   
4             AIF/  2022-12-05   
..             ...         ...   
88  kommersant.ru/  2022-12-27   
89  kommersant.ru/  2022-12-28   
90  kommersant.ru/  2022-12-29   
91  kommersant.ru/  2022-12-30   
92  kommersant.ru/  2022-12-31   

                                                 demo   Visitors  lower_bound  \
0   {'Men above 45': 270578.0, 'Men 25-34': 248189...  1387448.0    1387448.0   
1   {'Men above 45': 242880.0, 'Men 25-34': 218607...  1224443.0    1224443.0   
2   {'Men above 45': 215736.0, 'Men 25-34': 195687...  1108391.0    1108391.0   
3   {'Men above 45': 205517.0, 'Men 25-34': 181038...  1071513.0    1071513.0   
4   {'Men above 45': 215976.0, 'Men 25-34': 207819...  1174182.0    1174182.0   
..                                                ...        ...          ...   
88  {'Men 25-34': 327395

In [38]:
# Step 4: Save the results to a CSV file
# The final results are written to 'result.csv' without the index column.
result_file_path = './output/result.csv'

# DataFrame.to_csv does not create missing parent directories, so make sure
# './output' exists before writing (no-op when it is already there).
Path(result_file_path).parent.mkdir(parents=True, exist_ok=True)

merged_data.to_csv(result_file_path, index=False)