In [None]:
import pandas as pd

In [None]:
import json
import pytz
import numpy as np

from urllib.parse import urlparse
from datetime import datetime
from token_info import get_candles_poo_coin

# UTC timezone object from pytz; it is passed as the tz argument to
# datetime.now / datetime.fromtimestamp in the fetch loop.
UTC = pytz.timezone('UTC')

In [None]:
#https://www.regextester.com/96504

In [None]:
# Read the JSON corpus from disk into `data`; later cells iterate over it
# one submission at a time.
with open("data/corpus_v2.json", "r") as corpus_file:
    data = json.load(corpus_file)

In [None]:
import re
# "0x" followed by exactly 40 hex digits, ending on a word boundary.
hash_pattern = re.compile(
    r"0x[0-9A-Fa-f]{40}\b",
    flags=re.DOTALL,
)

# "$" followed by one or more ASCII letters.
ticker_pattern = re.compile(
    r"\$[A-Za-z]+",
    flags=re.DOTALL,
)

# Markdown link "[label](target)"; the single capture group is the target.
# DOTALL lets the label and the target span line breaks.
markdown_url_pattern = re.compile(
    r"(?:\[.*?\])\((.*?)\)",
    flags=re.DOTALL,
)

# Bare http:// or https:// URL.
url_pattern = re.compile(
    r"http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)

# Hostnames that is_url_allowed accepts (exact match on the parsed hostname).
accepted_domains = {
    "exchange.pancakeswap.finance",
    "bscscan.com",
    "dextools.io",
}

In [None]:
    
def normalize_hashes(hashes):
    """Lower-case and de-duplicate a collection of hash strings.

    Args:
        hashes: iterable of strings, or None.

    Returns:
        A set of the lower-cased strings; an empty set when `hashes` is None.
    """
    if hashes is None:
        # Return the same type as the non-None path so callers never have to
        # distinguish a list from a set.
        return set()

    return {_hash.lower() for _hash in hashes}

def is_url_allowed(url):
    """Tell whether `url` points at one of the accepted domains.

    Returns True when the hostname parsed from `url` is a member of
    `accepted_domains`. Returns False otherwise, including when parsing or
    the lookup raises any Exception.
    """
    try:
        return urlparse(url).hostname in accepted_domains
    except Exception:
        return False

def filter_urls(urls):
    """Return, in order, the entries of `urls` that `is_url_allowed` accepts."""
    return list(filter(is_url_allowed, urls))

def parse_hashes_from_urls(urls):
    """Collect the hashes embedded in a sequence of URLs.

    Only the first `hash_pattern` match in each URL is taken; URLs without a
    match contribute nothing. The result is passed through
    `normalize_hashes` before being returned.
    """
    matches = (hash_pattern.search(url) for url in urls)
    found = {match.group(0) for match in matches if match is not None}
    return normalize_hashes(found)

def parse_hashes_from_submission(submission):
    """Extract the candidate hashes mentioned in a submission's text.

    Args:
        submission: mapping with a "text" entry holding the body to scan.

    Returns:
        A set of normalized hashes:
        * empty when the text contains no hash at all;
        * the single hash found in links to accepted domains, when exactly
          one such hash exists;
        * otherwise every hash found anywhere in the text.
    """
    text = submission["text"]

    hashes = normalize_hashes(hash_pattern.findall(text))
    if len(hashes) == 0:
        return set()

    # Prefer markdown link targets; fall back to bare URLs only when the text
    # has no markdown links at all.
    urls = markdown_url_pattern.findall(text)
    if len(urls) == 0:
        urls = url_pattern.findall(text)

    urls_hashes = parse_hashes_from_urls(filter_urls(urls))

    # A single hash backed by an accepted link is the most specific answer.
    if len(urls_hashes) == 1:
        return urls_hashes

    # `hashes` is guaranteed non-empty here, so it is the fallback in every
    # remaining case.
    return hashes

def is_response_valid(payload):
    """Report whether `payload` is free of errors.

    A payload counts as valid when it has no "errors" entry, or when that
    entry is None.
    """
    errors = payload.get("errors")
    return errors is None

In [None]:
# Resume from the series written by a previous run, if there is one.
try:
    with open("data/series_v2.json", "r") as f:
        series = json.load(f)
except (OSError, ValueError):
    # OSError: the cache file is missing or unreadable.
    # ValueError: its contents are not valid JSON (json.JSONDecodeError is a
    # ValueError subclass).
    # Either way start from an empty cache. Unlike a bare `except`, this lets
    # KeyboardInterrupt and programming errors surface.
    series = {}

In [None]:
import time
from tqdm import tqdm

should_brake = False  # NOTE(review): never read in the visible cells - confirm before removing

# Seconds to pause after every candle request.
REQUEST_DELAY_SECONDS = 5

for submission in tqdm(data):
    _id = submission["id"]

    # Skip submissions that already have a resolved token in the cache.
    if series.get(_id, {}).get("base_token") is not None:
        continue

    end_time = datetime.now(UTC)
    init_time = datetime.fromtimestamp(submission["created_utc"], UTC)

    hashes = parse_hashes_from_submission(submission)

    # Stays empty when no hash yields a valid response, so the submission is
    # attempted again on the next run.
    payload = {}

    for base_token in hashes:
        response = get_candles_poo_coin(base_token, init_time=init_time, end_time=end_time)

        # Throttle after every request, valid or not, so that consecutive
        # requests for the same submission are spaced out as well.
        time.sleep(REQUEST_DELAY_SECONDS)

        if not is_response_valid(response):
            # No retry happens here: the loop moves on to the next hash.
            print("invalid response for", base_token, "- skipping")
            continue

        # If several hashes yield valid responses, the last one iterated wins.
        payload["series"] = response
        payload["base_token"] = base_token

    series[_id] = payload

In [None]:
# Serialize before opening the file: opening with "w" truncates it, so a
# serialization error raised in the middle of a dump would otherwise leave a
# truncated cache behind and lose the previously saved series.
serialized_series = json.dumps(series)
with open("data/series_v2.json", "w") as f:
    f.write(serialized_series)