In [1]:
import json
import math
import multiprocessing
import os

import httpx
import pandas as pd
import matplotlib.pyplot as plt

from get_random_sample import get_video
from map_funcs import process_amap

def read_result_path(result_path):
    with open(result_path, 'r') as f:
        try:
            results = json.load(f)
        except:
            return []
    return [r for r in results]

data_dir_path = os.path.join("..", "data")

all_videos = []
fetched_videos = []
result_paths = []
for dir_path, dir_names, filenames in os.walk(os.path.join(data_dir_path, 'results', 'hours', '19')):
    for filename in filenames:
        if filename == 'results.json':
            result_paths.append(os.path.join(dir_path, filename))
# for result_path in tqdm.tqdm(result_paths, desc="Reading result files"):

all_results = process_amap(read_result_path, result_paths, num_workers=multiprocessing.cpu_count() - 1, pbar_desc="Reading result files")
all_results = [v for res in all_results for v in res]
fetched_results = [r for r in all_results if r['result']['return']]
fetched_video_ids = [r['args'] for r in fetched_results]
video_bits = [format(v, '064b') for v in fetched_video_ids]

# get distribution of number of videos per millisecond
bit_sections = [{'time_bits': int(b[:32], 2) + int(b[32:42], 2) / 1000, 'counter_bits': b[32+10:32+18], 'geo_bits': b[32+18:]} for b in video_bits]
df = pd.DataFrame(bit_sections)
df['result'] = fetched_results
df['id'] = df['result'].apply(lambda x: x['args'])
df = df.drop_duplicates(subset='id')
df['statusMsg'] = df['result'].apply(lambda x: x['result']['return']['statusMsg'] if 'statusMsg' in x['result']['return'] else 'success')
df['counter_vals'] = df['counter_bits'].apply(lambda x: int(x, 2))

Reading result files: 100%|██████████| 51/51 [01:40<00:00,  1.98s/it]


In [3]:
def get_groups(df):
    geo_groups_df = df.groupby('geo_bits')
    geo_groups = {}
    for geo_group in geo_groups_df.groups:
        geo_df = geo_groups_df.get_group(geo_group)
        geo_groups[geo_group] = geo_df[['time_bits', 'counter_vals']].groupby('time_bits').agg(list)['counter_vals'].to_dict()
    return geo_groups

success_groups = get_groups(df[df['statusMsg'] != "item doesn't exist"])

with open(os.path.join('..', 'figs', 'all_videos', 'all_two_segments_combinations.json'), 'r') as f:
    all_two_segments_combinations = json.load(f)

requested_ids = all_two_segments_combinations['(10, 31)']

In [13]:
import tqdm

valid_ids = []
error_missing_ids = []
missing_ids = []
pbar = tqdm.tqdm(total=sum([len(success_groups[geo_bits]) for geo_bits in success_groups]), desc="Checking missing values")
with httpx.Client() as client:
    # test missing counts
    for geo_bits in success_groups:
        # check if we find a sequence of successful requests with missing vals, where we didn't make the request
        time_groups = success_groups[geo_bits]
        missing_vals_top_contendors = set()
        for time in time_groups:
            pbar.update(1)
            success_vals = time_groups[time]
            min_val = min(success_vals)
            max_val = max(success_vals)
            missing_vals = [val for val in range(min_val, max_val) if val not in success_vals]
            if len(missing_vals) / len(success_vals) < 0.2: # suspicious if less than of vals are missing
                for missing_val in missing_vals:
                    missing_bits = format(missing_val, '08b')
                    all_bits = missing_bits + geo_bits
                    missing_id = int(all_bits, 2)
                    if missing_id not in requested_ids:
                        missing_vals_top_contendors.add(missing_val)
                    else:
                        missing_bits = format(missing_val, '08b')
                        timestamp = math.floor(time)
                        timestamp_bits = format(timestamp, '032b')
                        milliseconds = math.floor((time - timestamp) * 1000)
                        milliseconds_bits = format(milliseconds, '010b')
                        missing_id = int(timestamp_bits + milliseconds_bits + missing_bits + geo_bits, 2)
                        error_missing_ids.append(missing_id)

        for missing_val in missing_vals_top_contendors:
            for time in time_groups:
                success_vals = time_groups[time]
                min_val = min(success_vals)
                max_val = max(success_vals)
                time_missing_vals = [val for val in range(min_val, max_val) if val not in success_vals]
                if missing_val not in time_missing_vals:
                    continue
                if len(time_missing_vals) / len(success_vals) >= 0.2:
                    continue
                # create a tiktok id from these missing vals
                missing_bits = format(missing_val, '08b')
                timestamp = math.floor(time)
                timestamp_bits = format(timestamp, '032b')
                milliseconds = math.floor((time - timestamp) * 1000)
                milliseconds_bits = format(milliseconds, '010b')
                missing_id = int(timestamp_bits + milliseconds_bits + missing_bits + geo_bits, 2)
                missing_ids.append(missing_id)
                # res = get_video(missing_id, client)
                # if 'statusCode' not in res:
                #     valid_ids.append(int(missing_bits + geo_bits, 2))
pbar.close()
print(missing_ids)

Checking missing values: 100%|██████████| 20159/20159 [00:00<00:00, 97384.22it/s]

[7341456322283982086, 7341456322283998470, 7341456322284014854, 7341456322284031238, 7341456322284047622, 7341456322283949318]
[7341456318705716481, 7341456320488361217, 7341456329170521345, 7341456329451588865, 7341456332001660161, 7341456334199540993, 7341456334363151617, 7341456335013285121, 7341456335139065089, 7341456335499939073, 7341456338892934401, 7341456339295587585, 7341456340793036033, 7341456341212450049, 7341456341296368897, 7341456341833108737, 7341456341854080257, 7341456342584118529, 7341456342927887617, 7341456344182066433, 7341456347789118721, 7341456349076753665, 7341456349760359681, 7341456350137879809, 7341456352264391937, 7341456355384937729, 7341456357721148673, 7341456359352716545, 7341456361366031617, 7341456364541234433, 7341456365061278977, 7341456368076950785, 7341456370996301057, 7341456386733198593, 7341456387504983297, 7341456388159212801, 7341456390189288705, 7341456393016200449, 7341456395931290881, 7341456397047008513, 7341456398846283009, 73414564045




: 

In [4]:
request_groups = get_groups(df)

In [7]:
import tqdm

valid_ids = []
error_missing_ids = []
missing_ids = []
pbar = tqdm.tqdm(total=sum([len(request_groups[geo_bits]) for geo_bits in request_groups]), desc="Checking missing values")
# test missing counts
for geo_bits in request_groups:
    # check if we find a sequence of made requests with missing vals, where we didn't make the request
    time_groups = request_groups[geo_bits]
    missing_vals_top_contendors = set()
    for time in time_groups:
        pbar.update(1)
        success_vals = time_groups[time]
        min_val = min(success_vals)
        max_val = max(success_vals)
        missing_vals = [val for val in range(min_val, max_val) if val not in success_vals]
        if len(missing_vals) / len(success_vals) < 0.2: # suspicious if less than of vals are missing
            for missing_val in missing_vals:
                missing_bits = format(missing_val, '08b')
                all_bits = missing_bits + geo_bits
                missing_id = int(all_bits, 2)
                if missing_id not in requested_ids:
                    missing_vals_top_contendors.add(missing_val)
                else:
                    missing_bits = format(missing_val, '08b')
                    timestamp = math.floor(time)
                    timestamp_bits = format(timestamp, '032b')
                    milliseconds = math.floor((time - timestamp) * 1000)
                    milliseconds_bits = format(milliseconds, '010b')
                    missing_id = int(timestamp_bits + milliseconds_bits + missing_bits + geo_bits, 2)
                    error_missing_ids.append(missing_id)

    for missing_val in missing_vals_top_contendors:
        for time in time_groups:
            success_vals = time_groups[time]
            min_val = min(success_vals)
            max_val = max(success_vals)
            time_missing_vals = [val for val in range(min_val, max_val) if val not in success_vals]
            if missing_val not in time_missing_vals:
                continue
            if len(time_missing_vals) / len(success_vals) >= 0.2:
                continue
            # create a tiktok id from these missing vals
            missing_bits = format(missing_val, '08b')
            timestamp = math.floor(time)
            timestamp_bits = format(timestamp, '032b')
            milliseconds = math.floor((time - timestamp) * 1000)
            milliseconds_bits = format(milliseconds, '010b')
            missing_id = int(timestamp_bits + milliseconds_bits + missing_bits + geo_bits, 2)
            missing_ids.append(missing_id)
            
pbar.close()
missing_ids = list(set(missing_ids))
print(len(missing_ids))
print(missing_ids[:20])

Checking missing values: 100%|██████████| 1195973/1195973 [00:08<00:00, 140109.58it/s]

192072
[7341456339111152927, 7341456385818955050, 7341456337777364255, 7341456333209931055, 7341456350389800239, 7341456367569669423, 7341456384749538607, 7341456401929407791, 7341456419109276975, 7341456436289146159, 7341456453469015343, 7341456470648884527, 7341456487828753711, 7341456333775998239, 7341456377816223018, 7341456377149328682, 7341456375148645674, 7341456328440843551, 7341456324439477535, 7341456323772583199]





In [13]:
results = []
with httpx.Client() as client:
    for missing_id in tqdm.tqdm(missing_ids[:1000]):
        try:
            res = get_video(missing_id, client)
            if 'statusCode' not in res:
                results.append(res)
            elif res['statusMsg'] != "item doesn't exist":
                results.append(res)
        except:
            pass
print(results)

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [09:54<00:00,  1.68it/s]

[{'statusCode': 100004, 'statusMsg': 'RPCError{PSM:[tiktok.item.core] Method:[MultiQueryItemDomainByScene] ErrType:[RPC_FAILED] OriginalErr:[remote or network error[remote]: error_code=1115 cds_key=THRIFT_INGRESS|tiktok.web.core:default:useast2a:|tiktok.item.core:default:useast2a:|MultiQueryItemDomainByScene|prod| get connection failed, reason=POOL_FAILURE_RemoteConnectionFailure instance=10.106.25.157,dp-395f1cba93-769cff6dbc-pptmp connect_time=0us] BizStatusCode:[0] BizStatusMessage:[]}'}]



