In [1]:
import pandas as pd

In [2]:
import json
import pytz
import numpy as np

from urllib.parse import urlparse
from datetime import datetime
from token_info import get_candles_poo_coin
import matplotlib.pyplot as plt

# Timezone object handed to datetime.now() / datetime.fromtimestamp() further
# down so that the resulting datetimes are timezone-aware and expressed in UTC.
UTC = pytz.timezone('UTC')

In [3]:
# Regex reference: https://www.regextester.com/96504

In [4]:
# Load the corpus that the fetch loop iterates over. The encoding is pinned to
# UTF-8 (the JSON interchange encoding) so decoding does not depend on the
# platform's locale default.
with open("data/corpus_v3.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [5]:
import re

# "0x" followed by exactly 40 hex digits and then a word boundary, so a longer
# hex run (e.g. 64 digits) does not match on its first 40 digits.
hash_pattern = re.compile(r"0x[0-9A-Fa-f]{40}\b", re.DOTALL)

# "$" immediately followed by one or more ASCII letters; digits end the match.
ticker_pattern = re.compile(r"\$[A-Za-z]+", re.DOTALL)

# Markdown link "[label](target)"; the single capture group is the target.
# DOTALL lets "." cross newlines inside the label and the target.
markdown_url_pattern = re.compile(r"(?:\[.*?\])\((.*?)\)", re.DOTALL)

# Bare http:// or https:// URL.
# NOTE(review): inside "[$-_@.&+#]" the "$-_" part is a character *range*
# (0x24-0x5F), which is what admits "/", ":", "?", "=" and also ")" or "]".
# Confirm the trailing-punctuation matches are acceptable.
url_pattern = re.compile(r"http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")

# Exact hostnames a URL must have to be kept (compared against
# urlparse(...).hostname, so subdomains such as "www." are not included).
accepted_domains = {"exchange.pancakeswap.finance", "bscscan.com", "dextools.io"}

In [6]:
    
def normalize_hashes(hashes):
    """Return the lower-cased, de-duplicated set of the given hash strings.

    Parameters
    ----------
    hashes : iterable of str, or None
        Hash strings in any letter case.

    Returns
    -------
    set of str
        Always a set, so callers get one type back regardless of input;
        empty when ``hashes`` is ``None`` or has no items.
    """
    if hashes is None:
        return set()

    return {_hash.lower() for _hash in hashes}

def is_url_allowed(url):
    """Tell whether ``url`` has a hostname listed in ``accepted_domains``.

    The hostname is taken from ``urlparse(url).hostname`` and must match an
    entry exactly. Any exception raised along the way is treated as
    "not allowed" and yields ``False``.
    """
    try:
        return urlparse(url).hostname in accepted_domains
    except Exception:
        return False

def filter_urls(urls):
    """Return, in their original order, the URLs that ``is_url_allowed`` accepts."""
    return list(filter(is_url_allowed, urls))

def parse_hashes_from_urls(urls):
    """Collect the first ``hash_pattern`` match of every URL.

    URLs without a match contribute nothing. The collected matches are passed
    through ``normalize_hashes`` before being returned.
    """
    matches = (hash_pattern.search(url) for url in urls)
    found = {match.group(0) for match in matches if match is not None}
    return normalize_hashes(found)

def parse_hashes_from_submission(submission):
    """Extract the candidate hashes mentioned in a submission's text.

    Parameters
    ----------
    submission : mapping
        Must provide a ``"text"`` entry; a ``None`` text is treated as empty.

    Returns
    -------
    set of str
        * empty set when the text contains no ``hash_pattern`` match;
        * the single hash found inside accepted-domain URLs, when exactly one
          such hash exists (the link disambiguates between several hashes
          mentioned in the text);
        * otherwise every normalized hash found in the text.

    Notes
    -----
    URLs are taken from markdown links first; bare URLs are only searched for
    when the text contains no markdown link at all.
    """
    text = submission["text"] or ""

    hashes = normalize_hashes(hash_pattern.findall(text))
    if not hashes:
        return set()

    urls = markdown_url_pattern.findall(text) or url_pattern.findall(text)
    urls_hashes = parse_hashes_from_urls(filter_urls(urls))

    if len(urls_hashes) == 1:
        return urls_hashes

    # Zero or several URL hashes: the links do not single one out, so fall
    # back to everything found in the text (non-empty at this point).
    return hashes

def is_response_valid(payload):
    """Report whether a response payload carries no errors.

    A payload counts as valid when its ``"errors"`` entry is missing or is
    ``None``; any other value (including an empty list) makes it invalid.
    """
    errors = payload.get("errors")
    return errors is None

In [7]:
# Resume from the series cached by a previous run, if there is one; otherwise
# start with an empty cache.
try:
    with open("data/series_v3.json", "r", encoding="utf-8") as f:
        series = json.load(f)
except (OSError, ValueError):
    # OSError: file missing or unreadable. ValueError: invalid JSON
    # (json.JSONDecodeError) or undecodable bytes. Anything else, notably
    # KeyboardInterrupt, is deliberately left to propagate.
    series = {}

In [8]:
import time
from tqdm import tqdm

# Pause, in seconds, applied after every request to the candle endpoint
# (presumably to stay under a rate limit -- TODO confirm the required value).
REQUEST_DELAY_SECONDS = 5

should_brake = False  # not read anywhere in this cell
for submission in tqdm(data):
    delay = False
    _id = submission["id"]

    # Already resolved in a previous run (cached entry has a base_token): skip.
    if series.get(_id, {}).get("base_token") is not None:
        continue

    end_time = datetime.now(UTC)
    init_time = datetime.fromtimestamp(submission["created_utc"], UTC)

    hashes = parse_hashes_from_submission(submission)

    payload = {}

    # Iterate in sorted order: set iteration order for strings can differ
    # between interpreter runs, which would make the stored token arbitrary.
    for base_token in sorted(hashes):

        response = get_candles_poo_coin(base_token, init_time=init_time, end_time=end_time)
        if not is_response_valid(response):
            # This hash is not re-requested in this run; the next candidate
            # (if any) is tried after the pause.
            time.sleep(REQUEST_DELAY_SECONDS)
            print("trying again")
            continue

        delay = True
        payload["series"] = response
        payload["base_token"] = base_token
        # First valid response wins. Continuing would fire further requests
        # with no pause in between and overwrite this result.
        break

    # An empty payload is stored when nothing resolved; since it has no
    # base_token, the submission is attempted again on the next run.
    series[_id] = payload

    if delay:
        time.sleep(REQUEST_DELAY_SECONDS)

  3%|▎         | 24/713 [01:43<51:50,  4.51s/it]  

trying again


  9%|▊         | 61/713 [04:20<35:45,  3.29s/it]  

trying again


  9%|▉         | 64/713 [04:43<57:11,  5.29s/it]

trying again
trying again


  9%|▉         | 65/713 [04:54<1:14:07,  6.86s/it]

trying again


 11%|█         | 76/713 [05:34<48:56,  4.61s/it]  

trying again


 13%|█▎        | 94/713 [06:46<35:04,  3.40s/it]

trying again


 17%|█▋        | 120/713 [08:15<41:55,  4.24s/it]

trying again


 18%|█▊        | 129/713 [09:03<54:24,  5.59s/it]

trying again
trying again
trying again


 19%|█▊        | 132/713 [09:44<1:28:24,  9.13s/it]

trying again


 19%|█▊        | 133/713 [09:55<1:35:38,  9.89s/it]

trying again


 19%|█▉        | 138/713 [10:28<1:08:43,  7.17s/it]

trying again


 19%|█▉        | 139/713 [10:40<1:21:16,  8.50s/it]

trying again


 20%|█▉        | 142/713 [10:53<59:31,  6.25s/it]  

trying again


 20%|██        | 144/713 [11:06<59:00,  6.22s/it]

trying again


 21%|██        | 151/713 [11:31<37:05,  3.96s/it]

trying again


 24%|██▍       | 172/713 [13:07<52:56,  5.87s/it]

trying again
trying again
trying again


 26%|██▌       | 185/713 [14:31<50:42,  5.76s/it]  

trying again


 32%|███▏      | 227/713 [17:16<22:42,  2.80s/it]  

trying again


 35%|███▍      | 248/713 [18:56<38:28,  4.96s/it]

trying again


 35%|███▍      | 249/713 [19:09<53:32,  6.92s/it]

trying again


 36%|███▌      | 254/713 [19:34<45:10,  5.90s/it]

trying again


 36%|███▋      | 260/713 [20:09<47:30,  6.29s/it]

trying again


 37%|███▋      | 263/713 [20:28<47:40,  6.36s/it]

trying again


 38%|███▊      | 274/713 [21:35<40:14,  5.50s/it]

trying again


 39%|███▉      | 278/713 [21:59<38:07,  5.26s/it]

trying again
trying again


 41%|████      | 294/713 [22:59<30:00,  4.30s/it]

trying again


 42%|████▏     | 302/713 [23:18<18:19,  2.68s/it]

trying again


 42%|████▏     | 303/713 [23:28<26:44,  3.91s/it]

trying again


 44%|████▍     | 317/713 [24:36<32:28,  4.92s/it]

trying again


 47%|████▋     | 332/713 [25:37<19:56,  3.14s/it]

trying again


 52%|█████▏    | 373/713 [29:13<29:36,  5.22s/it]

trying again


 55%|█████▍    | 389/713 [30:33<21:43,  4.02s/it]

trying again


 57%|█████▋    | 404/713 [31:18<12:46,  2.48s/it]

trying again


 57%|█████▋    | 406/713 [31:30<17:23,  3.40s/it]

trying again


 58%|█████▊    | 412/713 [32:03<25:19,  5.05s/it]

trying again


 63%|██████▎   | 449/713 [33:44<11:06,  2.52s/it]

trying again


 64%|██████▍   | 456/713 [34:19<18:55,  4.42s/it]

trying again


 64%|██████▍   | 457/713 [34:34<29:04,  6.81s/it]

trying again


 65%|██████▍   | 460/713 [34:47<22:23,  5.31s/it]

trying again


 67%|██████▋   | 481/713 [35:49<09:05,  2.35s/it]

trying again


 68%|██████▊   | 482/713 [36:01<14:08,  3.67s/it]

trying again


 78%|███████▊  | 553/713 [39:29<13:36,  5.10s/it]

trying again


 79%|███████▉  | 564/713 [40:13<08:31,  3.43s/it]

trying again


 79%|███████▉  | 565/713 [40:18<09:20,  3.79s/it]

trying again


 80%|████████  | 571/713 [40:40<08:36,  3.64s/it]

trying again


 85%|████████▍ | 605/713 [42:23<06:25,  3.57s/it]

trying again


 85%|████████▌ | 607/713 [42:34<07:22,  4.17s/it]

trying again


 87%|████████▋ | 619/713 [43:23<07:21,  4.70s/it]

trying again


 87%|████████▋ | 621/713 [43:29<06:14,  4.08s/it]

trying again
trying again
trying again


 88%|████████▊ | 628/713 [44:08<05:56,  4.20s/it]

trying again


 91%|█████████ | 650/713 [45:00<01:53,  1.81s/it]

trying again


 96%|█████████▌| 681/713 [46:53<01:40,  3.14s/it]

trying again
trying again


100%|██████████| 713/713 [48:32<00:00,  4.09s/it]


In [9]:
# Serialize before opening the file: opening with "w" truncates it, so a
# serialization error raised afterwards would wipe the existing cache.
serialized_series = json.dumps(series)
with open("data/series_v3.json", "w", encoding="utf-8") as f:
    f.write(serialized_series)