In [1]:
%%capture
# Install the third-party packages imported by the cells below (pandas, requests, pytz).
# `%pip` installs into the environment of the running kernel, whereas `!pip3` runs
# whichever pip3 is first on PATH and may target a different Python.
%pip install pandas requests pytz

In [2]:
import pandas as pd
from pprint import pprint


# Known [utm_source, utm_campaign] pairs.
known_utm_combinations = [
    # linkedin
    ["linkedin", "apply_directly_linkedin"],  # Applying for a job directly through LinkedIn
    ["linkedin", "linkedin_profile_page"],
    ["linkedin", "msg_linkedin"],  # Finding someone in a company that is hiring
    # email
    ["email", "msg_email"],  # Messaging someone directly
    # github
    ["github", "cv_repository_readme"],  # Link placed in readme.md
    # pdf
    ["pdf", "backend-developer-v1"],  # Link placed in the pdf
    ["pdf", "product-manager-v1"],
]

known_utm_combinations_df = pd.DataFrame(
    data=known_utm_combinations,
    columns=['utm_source', 'utm_campaign'],
)

# Distinct sources / campaigns, in order of first appearance in the list above.
unique_utm_sources, unique_utm_campaigns = (
    known_utm_combinations_df[column_name].unique()
    for column_name in ('utm_source', 'utm_campaign')
)

# Length, in days, of the period the analytics are computed for.
days_ago = 30

# Set to True when no real data is available; fake data is generated instead.
mock = True

# Timezone name used when requesting and dating the analytics.
timezone = "Etc/UTC"

In [3]:
import requests
import pytz

from datetime import datetime, timedelta
from random import randrange


# Stats fields requested from Simple Analytics; the names are comma-joined into
# the `fields` query parameter of the request URL.
fields_stats_for_simple_analytics = [
    "pageviews",  # total number of page views in the specified period
    "visitors",  # total number of visitors (unique page views) in the specified period
    "histogram",  # array with page views and visitors per day
    "countries",  # list of country codes
    "utm_sources",
    "utm_campaigns",
    "referrers",
    "seconds_on_page",  # median number of seconds a visitor spent on the page
]


def mock_simple_analytics_stats(stats):
    """Return a copy of ``stats`` with its traffic numbers replaced by random fake values.

    Every ``histogram`` entry keeps its other keys but gets random ``pageviews``
    and ``visitors``; the overall ``pageviews``/``visitors`` totals are the sums of
    those per-day values. ``seconds_on_page``, ``utm_campaigns`` and ``utm_sources``
    are replaced with random data, one entry per known UTM campaign / source.

    The histogram entries are rebuilt as new dicts instead of being edited in
    place, so the ``stats`` object passed in is left unmodified (a plain
    ``dict.copy()`` is shallow and would share the per-day dicts with the input).

    Args:
        stats: Stats dict; must contain a ``histogram`` list of dicts.

    Returns:
        dict: A new stats dict holding the mocked values.

    Raises:
        KeyError: If ``stats`` has no ``histogram`` key.
    """
    def mock_utm_entries(utm_values):
        # One fake stats row per known UTM value.
        return [
            {
                'pageviews': randrange(1, 35),
                'seconds_on_page': randrange(1, 30),
                'value': utm_value,
                'visitors': randrange(10, 50),
            }
            for utm_value in utm_values
        ]

    mocked_stats = stats.copy()

    mocked_histogram = []
    for day in stats['histogram']:
        page_views = randrange(0, 100)
        # Visitors are a third or a quarter of the page views, so never exceed them.
        visitors = page_views // randrange(3, 5)
        mocked_histogram.append({**day, 'pageviews': page_views, 'visitors': visitors})

    mocked_stats['histogram'] = mocked_histogram

    # Overall totals are derived from the per-day numbers so they stay consistent.
    mocked_stats['pageviews'] = sum(day['pageviews'] for day in mocked_histogram)
    mocked_stats['visitors'] = sum(day['visitors'] for day in mocked_histogram)

    mocked_stats['seconds_on_page'] = randrange(1, 35)

    # It is possible to populate these using the average number of visitors,
    # distributed over the known utm tags and non-utm visitors; for now the
    # numbers are independent random values.
    mocked_stats['utm_campaigns'] = mock_utm_entries(unique_utm_campaigns)
    mocked_stats['utm_sources'] = mock_utm_entries(unique_utm_sources)

    return mocked_stats
    


def convert_and_filter_utm_params(stats_utm, known_utm_values):
    """Convert UTM stats rows to a DataFrame, keeping only rows with a known ``value``.

    Args:
        stats_utm: Rows (e.g. a list of dicts) that each carry a ``value`` field.
            May be empty or ``None``.
        known_utm_values: Collection of UTM values to keep.

    Returns:
        pandas.DataFrame | None: The filtered rows, or ``None`` when there is
        nothing to keep (no input rows, no ``value`` column, or no known value).
    """
    df = pd.DataFrame(stats_utm)

    # An empty/None input yields a frame without a 'value' column; report
    # "no data" instead of failing with a KeyError.
    if 'value' not in df.columns:
        return None

    df = df[df['value'].isin(known_utm_values)]
    return None if df.empty else df


def fetch_simple_analytics_stats(timeout=30):
    """Download the stats for the last ``days_ago`` days from Simple Analytics.

    The requested period ends on the current date in ``timezone`` and starts
    ``days_ago`` days earlier. When ``mock`` is set, the downloaded payload is
    still used as the template and its numbers are replaced with fake values.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up. Without
            a timeout ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        dict: The decoded JSON stats (mocked when ``mock`` is true).

    Raises:
        ValueError: If the response payload does not report a truthy ``ok``.
    """
    requested_fields = ','.join(fields_stats_for_simple_analytics)

    current_date = datetime.now(pytz.timezone(timezone)).date()
    days_before = current_date - timedelta(days=days_ago)

    url = (
        f"https://simpleanalytics.com/artbred.io.json?info=false&version=5&fields={requested_fields}&timezone={timezone}"
        f"&start={days_before}&end={current_date}"
    )

    response = requests.get(
        url,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )

    stats = response.json()
    # A payload without an 'ok' key is treated like an explicit failure so the
    # caller gets the payload in the error instead of a bare KeyError.
    if not stats.get('ok'):
        raise ValueError(stats)

    if mock:
        stats = mock_simple_analytics_stats(stats)

    return stats


def get_simple_analytics_stats():
    """Fetch the Simple Analytics stats and convert their tabular parts to DataFrames.

    - ``countries`` is dropped when present.
    - ``histogram`` becomes a DataFrame indexed by the parsed ``date`` column.
    - ``utm_sources`` / ``utm_campaigns`` are reduced to the known UTM values
      (whatever ``convert_and_filter_utm_params`` returns for them).
    - ``referrers`` becomes a DataFrame.

    Returns:
        dict: The stats dict with the entries above replaced.
    """
    stats = fetch_simple_analytics_stats()

    stats.pop('countries', None)

    histogram_df = pd.DataFrame(stats['histogram'])
    histogram_df['date'] = pd.to_datetime(histogram_df['date'])
    stats['histogram'] = histogram_df.set_index('date')

    # Delete unknown utm params and convert to pandas data frames
    known_values_by_field = (
        ('utm_sources', unique_utm_sources),
        ('utm_campaigns', unique_utm_campaigns),
    )
    for field, known_values in known_values_by_field:
        stats[field] = convert_and_filter_utm_params(stats[field], known_values)

    stats['referrers'] = pd.DataFrame(stats['referrers'])

    return stats


simple_analytics_stats = get_simple_analytics_stats()

# Show the daily histogram and the filtered UTM tables, separated by a dashed rule.
for _section in ('histogram', 'utm_campaigns', 'utm_sources'):
    if _section != 'histogram':
        print('-' * 40)
    pprint(simple_analytics_stats[_section])

            pageviews  visitors
date                           
2023-01-16          8         2
2023-01-17         68        17
2023-01-18         81        20
2023-01-19         60        15
2023-01-20         70        17
2023-01-21         95        31
2023-01-22         53        13
2023-01-23         65        21
2023-01-24         22         5
2023-01-25          8         2
2023-01-26         75        18
2023-01-27         54        18
2023-01-28         51        17
2023-01-29         95        31
2023-01-30         37        12
2023-01-31         58        14
2023-02-01         13         4
2023-02-02         20         5
2023-02-03         79        26
2023-02-04         52        17
2023-02-05         77        25
2023-02-06         19         6
2023-02-07         99        24
2023-02-08         68        22
2023-02-09         30         7
2023-02-10         26         8
2023-02-11         54        13
2023-02-12         55        13
2023-02-13          1         0
2023-02-

In [4]:
%%capture
# `%pip` installs into the environment of the running kernel, whereas `!pip3` runs
# whichever pip3 is first on PATH and may target a different Python.
%pip install redis python-dotenv

In [5]:
import sys
import time
import json
import string
import pandas as pd
import requests
import random

sys.path.append('../app')

from storage import create_redis_connection, labels_prefix_key, utm_params_set_prefix_key, decode_redis_data, downloads_by_label_id_set_key, query_positions_set_key

def api_call(query_position, **kwargs):
    """POST a position to the local ``/score`` endpoint and, on success, to ``/download``.

    Keyword arguments whose value is not ``None`` are sent as query-string
    parameters on both requests. They are passed through ``params=`` so that
    ``requests`` URL-encodes them; building the query string by hand breaks on
    values containing characters such as ``&``, ``=``, ``#`` or spaces.

    Args:
        query_position: Position text sent as ``{"position": ...}`` in the request body.
        **kwargs: Query-string parameters (e.g. ``utm_source``, ``utm_campaign``);
            entries set to ``None`` are omitted.

    Returns:
        None. The ``/download`` request is only made when ``/score`` answers 200,
        using the ``token`` field of its JSON response.
    """
    params = {name: value for name, value in kwargs.items() if value is not None}

    # Bound every request so a hung local server cannot block the notebook forever.
    timeout_seconds = 30

    response = requests.post(
        "http://127.0.0.1:4567/score",
        params=params,
        data=json.dumps({"position": query_position}),
        timeout=timeout_seconds,
    )
    if response.status_code == 200:
        requests.post(
            "http://127.0.0.1:4567/download",
            params=params,
            data=json.dumps({"token": response.json()['token']}),
            timeout=timeout_seconds,
        )

def fill_redis_with_fake_data():
    """Optionally generate 100 fake API calls so that redis holds data to analyse.

    Nothing happens unless ``mock`` is set and the user answers ``yes`` to the
    prompt. Each generated call uses either a real position (possibly with a few
    characters replaced by random letters) or a random string, and usually carries
    UTM parameters. The campaign and the source are drawn independently, so the
    resulting pair is not necessarily one of the known combinations.
    """
    if not (mock and input("Do you want to fill redis with fake data?") == "yes"):
        return

    # Probabilities
    position_weights = {"backend developer": 0.4, "product manager": 0.6}
    p_real_position = 0.7
    p_modify_real_position = 0.85
    p_utm_params = 0.8

    for _ in range(100):
        utm_campaign = utm_source = None

        if random.random() < p_real_position:
            position = random.choices(
                list(position_weights),
                weights=list(position_weights.values()),
            )[0]

            if random.random() < p_modify_real_position:
                # Replace fewer than a quarter of the characters with random letters.
                replaced_count = random.randrange(0, len(position) // 4)
                replaced_indices = random.sample(range(len(position)), replaced_count)
                replacement_letters = ''.join(random.choices(string.ascii_letters, k=replaced_count))
                letter_by_index = dict(zip(replaced_indices, replacement_letters))
                position = "".join(
                    letter_by_index.get(index, char) for index, char in enumerate(position)
                )
        else:
            # Completely random "position" of 5 to 44 letters.
            position = ''.join(random.choices(string.ascii_letters, k=random.randrange(5, 45)))

        if random.random() < p_utm_params:
            utm_campaign = random.choice(unique_utm_campaigns)
            # A source is only ever sent together with a campaign.
            if random.random() < 0.9:
                utm_source = random.choice(unique_utm_sources)

        api_call(position, utm_source=utm_source, utm_campaign=utm_campaign)

def get_labels_data_from_redis(conn):
    """Load every label hash from redis into a DataFrame.

    Args:
        conn: Connection object providing ``keys(pattern)`` and ``hgetall(key)``.

    Returns:
        pandas.DataFrame: One row per key matching ``labels_prefix_key + "*"``,
        restricted to the columns ``id`` and ``position``.
    """
    matching_keys = conn.keys(labels_prefix_key + "*")
    # Each hash is fetched and decoded in turn.
    decoded_labels = [decode_redis_data(conn.hgetall(key)) for key in matching_keys]
    return pd.DataFrame(decoded_labels, columns=["id", "position"])

def get_utm_params_from_redis(conn):
    """Collect the UTM parameters recorded per endpoint during the last ``days_ago`` days.

    For every sorted set whose key matches ``utm_params_set_prefix_key + "*"`` the
    members scored between ``now - days_ago`` days and ``now`` are loaded, reduced
    to the rows whose ``utm_source`` OR ``utm_campaign`` is a known value, and
    indexed by the UTC date derived from their ``timestamp`` (seconds).

    Args:
        conn: Connection object providing ``keys(pattern)`` and
            ``zrangebyscore(key, min, max)``.

    Returns:
        dict: Maps the endpoint name (the key with the prefix removed) to its
        filtered DataFrame. Endpoints without any entry in the time window are
        omitted. The dict is empty when no key matches.
    """
    utm_params = {}

    time_now = int(time.time())
    start_time = time_now - (days_ago * 24 * 60 * 60)

    for utm_param_key_byte in conn.keys(utm_params_set_prefix_key + "*"):
        utm_param_key = decode_redis_data(utm_param_key_byte)
        endpoint = utm_param_key.replace(utm_params_set_prefix_key, "")

        raw_entries = conn.zrangebyscore(utm_param_key_byte, start_time, time_now)
        endpoint_df = pd.DataFrame(decode_redis_data(raw_entries))

        # No entries in the window means no columns to filter on; skip the
        # endpoint instead of failing with a KeyError.
        if endpoint_df.empty:
            continue

        known_mask = (
            endpoint_df["utm_source"].isin(unique_utm_sources)
            | endpoint_df["utm_campaign"].isin(unique_utm_campaigns)
        )
        filtered_df = endpoint_df.loc[known_mask].copy()
        filtered_df["date"] = pd.to_datetime(filtered_df["timestamp"], utc=True, unit='s').dt.date

        utm_params[endpoint] = filtered_df.set_index("date")

    # Return only after ALL endpoints have been processed (returning inside the
    # loop would stop after the first key and yield None when there are no keys).
    return utm_params
    

def get_downloads_data_from_redis(conn):
    """Load the downloads recorded during the last ``days_ago`` days.

    Reads the members of the ``downloads_by_label_id_set_key`` sorted set scored
    between ``now - days_ago`` days and ``now`` and indexes them by the UTC date
    derived from their ``timestamp`` (seconds).

    Args:
        conn: Connection object providing ``zrangebyscore(key, min, max)``.

    Returns:
        pandas.DataFrame: The downloads indexed by ``date``. When nothing was
        recorded in the window an empty frame with a ``date`` index is returned
        (instead of failing with ``KeyError: 'timestamp'``).
    """
    time_now = int(time.time())
    start_time = time_now - (days_ago * 24 * 60 * 60)

    downloads_bytes = conn.zrangebyscore(downloads_by_label_id_set_key, start_time, time_now)
    downloads_df = pd.DataFrame(decode_redis_data(downloads_bytes))

    # An empty result has no 'timestamp' column to derive the dates from.
    if downloads_df.empty:
        return pd.DataFrame(index=pd.Index([], name="date"))

    downloads_df["date"] = pd.to_datetime(downloads_df["timestamp"], utc=True, unit='s').dt.date
    return downloads_df.set_index("date")


def get_data_from_redis():
    """Read labels, UTM parameters and downloads over a single redis connection.

    Returns:
        tuple: ``(labels_df, utm_params_endpoints, downloads_df)`` as produced by
        the three ``get_*_from_redis`` loaders, called in that order.
    """
    loaders = (
        get_labels_data_from_redis,
        get_utm_params_from_redis,
        get_downloads_data_from_redis,
    )
    with create_redis_connection() as conn:
        # The tuple is built eagerly, i.e. while the connection is still open.
        return tuple(load(conn) for load in loaders)
    

# Optionally seed redis with fake traffic, then load everything back.
fill_redis_with_fake_data()

labels_df, utm_params_endpoints, downloads_df = get_data_from_redis()

# Print the three results, separated by a dashed rule.
print(labels_df, utm_params_endpoints, downloads_df, sep='\n' + '-' * 40 + '\n')

                  id           position
0  labels:3a909ae2c4    product manager
1  labels:0fc3c8919f  backend developer
----------------------------------------
{'score':              timestamp utm_source             utm_campaign
date                                                      
2023-02-15  1676503971        pdf     cv_repository_readme
2023-02-15  1676503973        NaN                msg_email
2023-02-15  1676503973        pdf     cv_repository_readme
2023-02-15  1676503974      email     backend-developer-v1
2023-02-15  1676503975      email  apply_directly_linkedin
...                ...        ...                      ...
2023-02-15  1676504021        pdf       product-manager-v1
2023-02-15  1676504022     github  apply_directly_linkedin
2023-02-15  1676504022     github       product-manager-v1
2023-02-15  1676504024      email     backend-developer-v1
2023-02-15  1676504024   linkedin                msg_email

[72 rows x 3 columns]}
--------------------------------------

In [6]:
import pickle

# Mocked runs are stored separately so they never overwrite real analytics.
file_name = 'analytics' if not mock else 'analytics_fake'

# NOTE: the target directory must already exist; open() does not create it.
# Pickle files must only ever be loaded back from a trusted source.
with open(f'../data/analytics/{file_name}.pickle', 'wb') as f:
    analytics_snapshot = {
        "simple_analytics": simple_analytics_stats,
        "labels": labels_df,
        "utm_params_endpoints": utm_params_endpoints,
        "downloads": downloads_df,
    }
    pickle.dump(analytics_snapshot, f)