In [1]:
from bs4 import BeautifulSoup
import pandas as pd 
import numpy as np 

In [2]:
import re

In [3]:
from datetime import datetime

In [4]:
with open ("dog.html", "r") as f_:
    result_page = BeautifulSoup(f_, "html.parser")

In [5]:
s = "%s-%s-%s" % (datetime.today().day, datetime.today().month, datetime.today().year)
date = datetime.strptime(s, "%d-%m-%Y")

In [107]:
def normalize(element, t_):
    """
        Title: Normalize function
        Description: parse texts elements and treats it.
    """
    try:
        if t_ == "text":
            return element.text
        if t_ == "date":
            s = element.text.replace(" ", "")
            return datetime.strptime(s, "%d%b%y")
        if t_ == "distance":
            return int(element.text.replace("m", ""))
        if t_ == "only_digits":
            return int(re.sub("\D", "", element.text))
        if t_ == "bends":
            element = element.text.replace("-", "")
            return np.average(np.array(list(element)).astype(int))
        if t_ == "remarks":
            return element.text.lower().split(",")
        if t_ == "float":
            return np.float(element.text)
        if t_ == "int":
            return np.int(element.text)
        if t_ == "by":
            s = element.text
            if len(s) == 1:
                return float(s)
            elif len(s) == 2:
                return float(unicodedata.numeric(s[0])) * float(unicodedata.numeric(s[1]))
            elif len(s) > 2:
                return float(s[:2]) * float(unicodedata.numeric(s[-1]))
    except Exception as e:
        return float("NaN")

In [108]:
stats = list()
for tr in result_page.find("table", {"id":"sortableTable"}).find("tbody").find_all("tr", class_="row"):
    c = tr.find_all("td")
    stats.append([
        normalize(c[0],  "date"),
        normalize(c[1],  "text"),
        normalize(c[2],  "distance"),
        normalize(c[3],  "only_digits"),
        normalize(c[4],  "float"),
        normalize(c[5],  "bends"),
        normalize(c[6],  "only_digits"),
        normalize(c[7],  "by"),
        normalize(c[9],  "remarks"),
        normalize(c[10], "float"),
        normalize(c[12], "float"),
        normalize(c[14], "text"),
        normalize(c[15], "float"),
    ])
df = pd.DataFrame(stats, columns=[
    "date",
    "local",
    "distance",
    "trap",
    "split",
    "bends",
    "position",
    "by",
    "remarks",
    "win_time",
    "weight",
    "grade",
    "cal_time"
])
df = df[df["date"]  < date]


In [109]:
df["split"] = df["split"].fillna(df["split"].mean())
df["bends"] = df["bends"].fillna(df["bends"].mean())
df = df.dropna(subset=["position"], axis=0)

In [110]:
local, distance,grade, trap = "Sland", 450, "A3", 2

In [111]:
s = "4½"

In [112]:
import unicodedata

In [113]:
total = len(df)
df.head(13)

Unnamed: 0,date,local,distance,trap,split,bends,position,by,remarks,win_time,weight,grade,cal_time
0,2019-05-21,Sland,450,3,5.15,5.25,4.0,2.0,[bmp1],28.01,33.2,A3,28.37
1,2019-05-16,Sland,450,2,5.19,2.5,2.0,2.0,"[bmp1, rnon]",27.94,33.0,A3,28.31
2,2019-05-12,Sland,450,2,5.16,1.75,3.0,1.0,[evch],27.64,33.4,A3,28.17
3,2019-05-05,Sland,450,2,5.11,3.0,3.0,0.75,[clrrn],27.87,33.1,A3,28.13
4,2019-04-30,Sland,450,2,5.13,2.75,4.0,4.0,[crd1],27.9,33.2,A3,28.32
5,2019-04-21,Sland,450,2,5.1,3.0,3.0,4.0,[bmp1],27.78,33.6,A3,28.2
6,2019-04-12,Sland,450,2,5.15,4.75,5.0,5.0,[crd1],27.63,32.7,A3,28.5
7,2019-04-04,Sland,450,2,5.17,1.25,1.0,0.5,[ld1],28.24,32.2,A3,28.34
8,2019-03-31,Sland,450,2,5.18,3.0,4.0,2.0,[clrrn],27.98,32.9,A3,28.13
9,2019-03-22,Sland,450,2,5.13,2.75,3.0,2.0,[crd1],27.85,32.8,A3,28.52


In [114]:
# Total of wons at track, distance, trap and grade
stats = {
    "full"       : len(df[(df["local"] == local) & (df["distance"] == distance) & (df["grade"] == grade) & (df["trap"] == trap)]),
    "trap"       : len(df[(df["trap"] == trap)]),
    "distance"   : len(df[(df["distance"] == distance)]),
    "local"      : len(df[(df["local"] == local)]),
    "grade"      : len(df[(df["grade"] == grade)]),
    "dist_trap"  : len(df[(df["distance"] == distance) & (df["trap"] == trap)]),
    "dist_grade" : len(df[(df["distance"] == distance) & (df["grade"] == grade)]),
    "first_full" : len(df[(df["local"] == local) & (df["distance"] == distance) & (df["grade"] == grade) & (df["trap"] == trap) & (df["position"] <= 2)]),
    "first_trap" : len(df[(df["trap"] == trap) & (df["position"] <= 2)]),
    "first_dist" : len(df[(df["distance"] == distance) & (df["position"] <= 2)]),
    "first_local": len(df[(df["local"] == local) & (df["position"] <= 2)]),
    "first_grade": len(df[(df["grade"] == grade) & (df["position"] <= 2)]),
    "tree_full"  : len(df[(df["local"] == local) & (df["distance"] == distance) & (df["grade"] == grade) & (df["trap"] == trap) & (df["position"] >= 3)]),
    "tree_trap"  : len(df[(df["trap"] == trap) & (df["position"] >= 3)]),
    "tree_dist"  : len(df[(df["distance"] == distance) & (df["position"] >= 3)]),
    "tree_local" : len(df[(df["local"] == local) & (df["position"] >= 3)]),
    "tree_grade" : len(df[(df["grade"] == grade) & (df["position"] >= 3)]),  
}

In [96]:
# for value, key in zip(stats.values(), stats.keys()):
#     if total == 0:
#         stats[key] = 0.0
#     else:        
#         stats[key] = round(float(value)/total, 3)

In [141]:
whelping = datetime.strptime(result_page.find("table", class_="pedigree").find_all("td")[-1].text.replace(" ", ""), "%d%b%y")

In [152]:
stats["mean_time"]      = df[(df["position"] != 1) & (df["distance"] == distance) & (df["grade"] == grade) & (df["local"] == local)]["cal_time"].mean()
stats["min_time"]       = df[(df["position"] != 1) & (df["distance"] == distance) & (df["grade"] == grade) & (df["local"] == local)]["cal_time"].min()
stats["max_time"]       = df[(df["position"] != 1) & (df["distance"] == distance) & (df["grade"] == grade) & (df["local"] == local)]["cal_time"].max()
stats["by_mean_lost"]   = df[(df["position"] != 1) & (df["distance"] == distance) & (df["grade"] == grade) & (df["local"] == local)]["by"].mean()
stats["by_mean_win"]    = df[(df["position"] == 1) & (df["distance"] == distance) & (df["grade"] == grade) & (df["local"] == local)]["by"].mean()
stats["bends_mean"]     = df[(df["distance"] == distance) & (df["grade"] == grade) & (df["local"] == local)]["bends"].mean()
stats["position"]       = df[(df["distance"] == distance) & (df["grade"] == grade) & (df["local"] == local)]["position"].mean()
stats["days_lr"]        = float((date - df["date"].iloc[0]).days)
stats["whelping"]       = (date - whelping).days

In [154]:
for value, key in zip(stats.values(), stats.keys()):
    stats[key] = round(float(value), 3)

In [155]:
stats

{'full': 12.0,
 'trap': 23.0,
 'distance': 32.0,
 'local': 32.0,
 'grade': 13.0,
 'dist_trap': 21.0,
 'dist_grade': 13.0,
 'first_full': 2.0,
 'first_trap': 5.0,
 'first_dist': 8.0,
 'first_local': 8.0,
 'first_grade': 2.0,
 'tree_full': 10.0,
 'tree_trap': 18.0,
 'tree_dist': 24.0,
 'tree_local': 24.0,
 'tree_grade': 11.0,
 'mean_time': 28.243,
 'min_time': 27.83,
 'max_time': 28.52,
 'by_mean': 2.562,
 'by_mean_lost': 2.562,
 'by_mean_win': 0.5,
 'bends_mean': 3.135,
 'position': 3.308,
 'days_lr': 2.0,
 'whelping': 1066.0}