参考链接：https://zhuanlan.zhihu.com/p/369531344

文件操作：https://blog.csdn.net/Baozijiaruqing/article/details/103900387

关于 `VOT` 数据集下载，直接看官方源码：https://github.com/votchallenge/toolkit

下载链接：https://data.votchallenge.net/vot2019/longterm/description.json

In [None]:
!pip freeze > requirements.txt

In [2]:
# !pip install requests pandas
# !pip install -U "urllib3<1.25"

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple, https://pypi.ngc.nvidia.com


In [None]:
import os
import requests
from urllib.parse import urlparse, urljoin

# Dataset identifier -> URL of the dataset's entry point. Most entries point at
# a description.json manifest; a few point directly at a .zip archive.
VOT_DATASETS = {
    "vot2013": "http://data.votchallenge.net/vot2013/dataset/description.json",
    "vot2014": "http://data.votchallenge.net/vot2014/dataset/description.json",
    "vot2015": "http://data.votchallenge.net/vot2015/dataset/description.json",
    "vot-tir2015": "http://www.cvl.isy.liu.se/research/datasets/ltir/version1.0/ltir_v1_0_8bit.zip",
    "vot2016": "http://data.votchallenge.net/vot2016/main/description.json",
    "vot-tir2016": "http://data.votchallenge.net/vot2016/vot-tir2016.zip",
    "vot2017": "http://data.votchallenge.net/vot2017/main/description.json",
    "vot-st2018": "http://data.votchallenge.net/vot2018/main/description.json",
    "vot-lt2018": "http://data.votchallenge.net/vot2018/longterm/description.json",
    "vot-st2019": "http://data.votchallenge.net/vot2019/main/description.json",
    "vot-lt2019": "http://data.votchallenge.net/vot2019/longterm/description.json",
    "vot-rgbd2019": "http://data.votchallenge.net/vot2019/rgbd/description.json",
    "vot-rgbt2019": "http://data.votchallenge.net/vot2019/rgbtir/meta/description.json",
    "vot-st2020": "https://data.votchallenge.net/vot2020/shortterm/description.json",
    "vot-rgbt2020": "http://data.votchallenge.net/vot2020/rgbtir/meta/description.json",
    "vot-st2021": "https://data.votchallenge.net/vot2021/shortterm/description.json",
    "vot-lt2022": "https://data.votchallenge.net/vot2022/lt/description.json",
    "test": "http://data.votchallenge.net/toolkit/test.zip",
    "segmentation": "http://box.vicos.si/tracking/vot20_test_dataset.zip",
}
# Dataset to process; must be one of the keys of VOT_DATASETS.
stack = "vot-lt2022"
url = VOT_DATASETS[stack]
# Relative archive links in the manifest are resolved against the directory
# that holds the manifest itself.
base_url = url.rsplit("/", 1)[0] + "/"
try:
    # The timeout keeps the cell from hanging forever on a dead connection;
    # raise_for_status() reports an HTTP error as such instead of letting an
    # error page fail later as a confusing JSON decoding problem.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    meta = response.json()
except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers a malformed JSON body on requests versions whose
    # JSON decoding error is not a RequestException subclass.
    raise Exception("Unable to read JSON file {}".format(e)) from e

# Three parallel lists: entry i of each one describes the same sequence.
fnames, color_url, groundtruth_url = [], [], []
for entry in meta["sequences"]:
    # sequence name
    fnames.append(entry["name"])

    # Ground-truth archive: a link that already names a host is used as is,
    # anything else is resolved against base_url.
    annotation_link = entry["annotations"]["url"]
    gt_url = (
        annotation_link
        if urlparse(annotation_link).netloc
        else urljoin(base_url, annotation_link)
    )

    # Colour-frames archive, resolved the same way.
    frames_link = entry["channels"]["color"]["url"]
    pic_url = (
        frames_link
        if urlparse(frames_link).netloc
        else urljoin(base_url, frames_link)
    )

    color_url.append(pic_url)
    groundtruth_url.append(gt_url)

# Show which colour archive belongs to which sequence.
for fname, link in zip(fnames, color_url):
    print(f"{fname}: {link}")


In [None]:
# 将数据写入 csv 文件
import csv
import os
import re

import pandas as pd


# Method 1: the standard-library csv writer; also suitable when rows differ in length
def write2csv1(csvfile, fnames, urls):
    """Write (file name, download link) pairs to a fresh CSV file.

    Args:
        csvfile: Path of the CSV file to create; an existing file is replaced.
        fnames: Iterable of names, written to the first column.
        urls: Iterable of links, written to the second column.  It is paired
            with ``fnames`` by ``zip``, so surplus items of the longer one
            are ignored.
    """
    if os.path.exists(csvfile):
        print(f"deleting {csvfile}...")
        os.remove(csvfile)

    # newline="" is what the csv module asks for: without it the writer's own
    # "\r\n" terminators are translated a second time on Windows, which leaves
    # a blank line after every row.  The explicit UTF-8 encoding keeps the
    # non-ASCII header readable by pandas.read_csv (UTF-8 by default) on every
    # platform.  The handle has its own name so the ``csvfile`` path argument
    # is no longer shadowed.
    with open(csvfile, "w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(["文件名", "下载链接"])  # header row
        writer.writerows(zip(fnames, urls))  # one row per (name, link) pair

# Method 2: build the table with pandas
def write2csv2(csvfile, fnames, urls):
    """Save file names and download links as a two-column CSV via pandas.

    A file already present at ``csvfile`` is reported and removed first.
    """
    stale = os.path.exists(csvfile)
    if stale:
        print(f"deleting {csvfile}...")
        os.remove(csvfile)

    # Column label -> column values; each (name, link) pair becomes one row.
    columns = {"文件名": fnames, "下载链接": urls}
    table = pd.DataFrame(columns)
    # index=False leaves the row labels out of the file (pandas writes them
    # by default).
    table.to_csv(csvfile, sep=",", index=False)


In [None]:
# Only the digits of the dataset identifier, e.g. "vot-lt2022" -> "2022".
version = re.sub("[^0-9]", "", stack)
# One CSV file per archive type: colour frames ("color") and ground truth ("gt").
csvfile = {
    kind: "votlt" + version + suffix
    for kind, suffix in (("color", "_color.csv"), ("gt", "_gt.csv"))
}


def run_writer():
    """Write the colour and the ground-truth link tables to their CSV files."""
    for kind, links in (("color", color_url), ("gt", groundtruth_url)):
        write2csv1(csvfile[kind], fnames, links)


run_writer()


In [None]:
import pandas as pd
import time

import requests, os
from tqdm import tqdm

# Silence urllib3 warnings (the resumable downloader in this notebook sends
# its requests with verify=False).
requests.packages.urllib3.disable_warnings()
# Proxy used for both HTTP and HTTPS requests: a proxy listening on
# 127.0.0.1:7890. Replace the addresses with your own proxy.
proxies = {
    "http": "http://127.0.0.1:7890",
    "https": "http://127.0.0.1:7890",
}
# Usage example: response = requests.get(url, proxies=proxies)
# Request headers carrying a desktop Chrome User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
}

## Basic version: plain streaming download
def download_v0(url, folder_path, fname):
    """Download ``url`` to ``<folder_path>/<fname>.zip`` unless that file exists.

    The request is sent through the module-level ``proxies`` with the
    module-level ``headers``.

    Args:
        url: Address of the archive to fetch.
        folder_path: Destination directory; created when missing.
        fname: Archive name without the ``.zip`` extension.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status.
    """
    fname = os.path.join(folder_path, f"{fname}.zip")

    os.makedirs(folder_path, exist_ok=True)  # one folder per image sequence

    if os.path.isfile(fname):
        # The path is printed unchanged; str.title() used to re-capitalise it.
        print(f"{fname} exists and have totaly been downloaded!")
        return

    # Stream into a temporary ".part" file and rename it only after the last
    # chunk has been written: an interrupted run then never leaves a truncated
    # archive that the isfile() check above would take for a finished one.
    partial = fname + ".part"
    with requests.get(url, stream=True, proxies=proxies, headers=headers) as response:
        # Fail loudly instead of saving an HTTP error page as a .zip file.
        response.raise_for_status()
        with open(partial, "wb") as code:
            # Write each chunk to disk as it arrives.
            for chunk in response.iter_content(chunk_size=1024 * 32):
                if chunk:
                    code.write(chunk)
    os.replace(partial, fname)
    time.sleep(0.1)  # brief pause after each download


## Advanced version: resumable download with a tqdm progress bar
def download_v1(url, folder_path, fname):
    """Download ``url`` to ``<folder_path>/<fname>.zip``, resuming a partial file.

    A first request reads the total size from ``Content-Length``.  If the local
    file is already at least that large nothing is downloaded; otherwise the
    missing part is requested with an HTTP ``Range`` header and appended.  When
    the size is unknown, or the server does not honour the range, the file is
    downloaded again from the beginning.

    Both requests go through the module-level ``proxies``.

    Args:
        url: Address of the archive to fetch.
        folder_path: Destination directory; created when missing.
        fname: Archive name without the ``.zip`` extension.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status.
    """
    fname = os.path.join(folder_path, f"{fname}.zip")
    os.makedirs(folder_path, exist_ok=True)  # one folder per image sequence

    # Label for the messages and the progress bar: "<containing folder>.zip".
    # os.path copes with every path separator and with a file that has no
    # parent folder, where fname.split('/')[-2] picked the wrong component or
    # raised IndexError.
    label = os.path.basename(os.path.dirname(fname)) + ".zip"
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"

    ############# resumable download ##################
    # NOTE(review): verify=False switches off TLS certificate verification for
    # both requests. Kept from the original; confirm it is really required.
    # The first request only serves to read the total size. The context
    # manager closes it immediately so the connection is not left open.
    with requests.get(
        url,
        stream=True,
        verify=False,
        headers={"User-Agent": user_agent},
        proxies=proxies,
    ) as probe:
        probe.raise_for_status()
        # A missing header yields 0 ("unknown") instead of a KeyError.
        total_size = int(probe.headers.get("Content-Length", 0))

    # Size of what is already on disk.
    temp_size = os.path.getsize(fname) if os.path.isfile(fname) else 0
    if total_size <= 0:
        # Without a usable total size the local file can neither be declared
        # complete nor be resumed safely: download it from scratch.
        temp_size = 0
    elif temp_size >= total_size:
        print(f"{label} exists and have totaly been downloaded!")
        return

    # Report how much is already there (guarding the division for size 0).
    remaining = 1 - temp_size / total_size if total_size else 1.0
    print(
        f"{label} downloaded: {temp_size/(1024*1024):.2f}MB || Total size: {total_size/(1024*1024):.2f}MB || Remaining download rate {remaining:.2f}"
    )

    request_headers = {"User-Agent": user_agent}
    if temp_size:
        # Open-ended range starting at the first missing byte. Range ends are
        # inclusive, so the former "bytes={temp_size}-{total_size}" pointed
        # one byte past the end of the file.
        request_headers["Range"] = f"bytes={temp_size}-"

    with requests.get(
        url, stream=True, verify=False, headers=request_headers, proxies=proxies
    ) as response:
        response.raise_for_status()
        # Append only when the server really sent the requested range
        # (206 Partial Content). A plain 200 carries the whole file, and
        # appending that to the partial file would corrupt the archive.
        mode = "ab" if temp_size and response.status_code == 206 else "wb"
        # The file is opened in its own context so it is closed explicitly;
        # tqdm.wrapattr only wraps its write() to drive the progress bar.
        with open(fname, mode) as raw, tqdm.wrapattr(
            raw,
            "write",
            miniters=1,
            desc=label,
            total=int(response.headers.get("content-length", 0)),
        ) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                if chunk:
                    fout.write(chunk)
    ############################################


断点续传参考链接：
- https://blog.csdn.net/qq_35203425/article/details/80987880
- https://blog.csdn.net/thewindkee/article/details/80189434
- https://huyi-aliang.blog.csdn.net/article/details/120926552?spm=1001.2101.3001.6650.1&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-1.pc_relevant_paycolumn_v3&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-1.pc_relevant_paycolumn_v3&utm_relevant_index=2 这篇比较准确

In [None]:
# Every sequence gets its own sub-folder below this directory.
root = "./VOT2022_LT/sequences"

# Rows of [sequence name, link], read from the CSV files named in `csvfile`
# (first row is the header, only the first two columns are used).
color_data = pd.read_csv(
    csvfile["color"],
    header=0,
    usecols=[0, 1],
).values.tolist()
gt_data = pd.read_csv(
    csvfile["gt"],
    header=0,
    usecols=[0, 1],
).values.tolist()


def download_color():
    """Download the colour archive of every sequence into ``<root>/<name>``."""
    for fname, url in color_data:
        download_v1(url, os.path.join(root, fname), "color")
        print("Done!")


def download_gt():
    """Download the ground-truth archive of every sequence into ``<root>/<name>``."""
    for fname, url in gt_data:
        download_v1(url, os.path.join(root, fname), "groundtruth")
        print("Done!")


# download_gt()
download_color()


In [None]:
# !find -name "*.zip" | xargs rm -r

`Python` 爬虫教程：http://c.biancheng.net/view/2011.html

`Python` 文件读写：
  - http://www.itheima.com/news/20210412/113009.html
  - https://www.cnblogs.com/zdz8207/p/python-updateFile-re-sub.html


`VOT2022-LT`: https://data.votchallenge.net/vot2022/lt/description.json

`sequence` 文件：
```
channels.color=color/%08d.jpg
format=default
fps=30
name=agility

```

In [None]:
# Write the "sequence" description file of every sequence
sequence = ["channels.color=color/%08d.jpg\r\n", "format=default\r\n", "fps=30\r\n"]

for fname in fnames:
    sequence_path = os.path.join(root, fname, "sequence")
    if os.path.exists(sequence_path):
        continue  # an existing description file is never overwritten

    # newline="" writes the "\r\n" line endings exactly as given; the default
    # text mode on Windows would turn them into "\r\r\n". The with-statement
    # closes the file even if a write fails.
    with open(sequence_path, encoding="utf-8", mode="w", newline="") as source_file:
        source_file.writelines(sequence)
        # write(), not writelines(): a str handed to writelines() is iterated
        # one character at a time.
        source_file.write(f"name={fname}\r\n")


In [None]:
!tree -L 4

## 补充：有关多进程下载以及下载进度条显示

## 一、下载进度条显示

### Python tqdm 工具包使用

> 官网：https://pypi.org/project/tqdm/#examples-and-advanced-usage

> 有关 `tqdm` 用法参考链接：https://pypi.org/project/tqdm/#examples-and-advanced-usage


```python
import urllib, os
from tqdm import tqdm

# Example archive to download.
eg_link = "https://caspersci.uk.to/matryoshka.zip"
# Uses `urllib.request` when that attribute exists, otherwise `urllib` itself.
response = getattr(urllib, 'request', urllib).urlopen(eg_link)
# tqdm.wrapattr wraps the file object's write() to drive the progress bar.
# The data goes to os.devnull, so nothing is kept on disk. The bar is labelled
# with the last path component of the URL; its total is the response's
# `length` attribute, or None when the response has none.
with tqdm.wrapattr(open(os.devnull, "wb"), "write",
                   miniters=1, desc=eg_link.split('/')[-1],
                   total=getattr(response, 'length', None)) as fout:
    for chunk in response:
        fout.write(chunk)
```

还可以使用：
```python
import requests, os
from tqdm import tqdm

# Example archive to download.
eg_link = "https://caspersci.uk.to/matryoshka.zip"
# stream=True: the body is fetched while iterating, not all at once up front.
response = requests.get(eg_link, stream=True)
# tqdm.wrapattr wraps the file object's write() to drive the progress bar.
# The data goes to os.devnull, so nothing is kept on disk. The bar's total is
# the Content-Length header, or 0 when the header is missing.
with tqdm.wrapattr(open(os.devnull, "wb"), "write",
                   miniters=1, desc=eg_link.split('/')[-1],
                   total=int(response.headers.get('content-length', 0))) as fout:
    # Read the body in 4096-byte chunks.
    for chunk in response.iter_content(chunk_size=4096):
        fout.write(chunk)
```

## 二、Python 多进程下载

## 三、使用 `MD5` 进行文件完整性校验

`MD5` 是一种哈希（消息摘要）算法，并不是加密手段；对文件计算 `MD5` 值并与发布方提供的值比较，即可进行完整性校验。

> 参考链接：https://blog.csdn.net/python_neophyte/article/details/102645477

```python
import hashlib
import os


# Directory to check, typed in by the user.
f_path = input('File path: ')
# Files to verify: every entry with the .bin extension, plus every .exe entry
# whose name (without the extension) contains no '%'.
SETUP_FILE = [file for file in os.listdir(f_path) if os.path.splitext(file)[1] == '.bin' or
              (os.path.splitext(file)[1] == '.exe' and '%' not in os.path.splitext(file)[0])]
# Entries with the .md5 extension; they hold the reference checksums.
MD5_FILE = [file for file in os.listdir(f_path) if os.path.splitext(file)[1] == '.md5']

# Show both lists before the verification starts.
print('所有安装文件：', SETUP_FILE)
print('MD5储存文件：', MD5_FILE)


def get_correct_md5():
    """Return every line of every file listed in ``MD5_FILE``, in listing order.

    The files are looked up inside ``f_path``; the lines are returned as read,
    including their line endings.
    """
    collected = []
    for md5_name in MD5_FILE:
        with open(os.path.join(f_path, md5_name)) as handle:
            collected += handle.readlines()
    return collected

def get_file_md5(file):
    """Return the upper-case hexadecimal MD5 digest of ``file`` inside ``f_path``.

    Prints the file name with its size first, then one status line per read.
    """
    target = os.path.join(f_path, file)
    digest = hashlib.md5()
    size_text = format(os.path.getsize(target) / (1024 ** 2), '.2f')
    print('正在验证文件名称：%s， 文件大小：%s Mb' % (file, size_text))
    with open(target, 'rb') as stream:
        reading = True
        while reading:
            block = stream.read(99999999)
            # The figure printed is the size of the block just read (in units
            # of 1024 ** 2 bytes); it is printed for the final empty read too.
            print('验证速度：%.2f Mb/s' % (len(block) / (1024 ** 2)), end='\r')
            reading = bool(block)
            if reading:
                digest.update(block)

    return digest.hexdigest().upper()


def main():
    """Check every file in ``SETUP_FILE`` against the reference MD5 lines.

    For each file the first reference line that contains the file name decides
    the result: the first whitespace-separated field of that line is taken as
    the expected digest.  Files without a matching line are skipped.  A summary
    with the number of damaged files is printed at the end.
    """
    all_md5 = get_correct_md5()
    bad_file = 0
    print('开始验证：')
    for file in SETUP_FILE:
        md5 = get_file_md5(file)
        for m in all_md5:
            # NOTE(review): substring match - a line for "data.bin" also
            # matches the file "a.bin"; confirm the checksum-file layout
            # before tightening this.
            if file in m:
                # Hex digests are case-insensitive and checksum files are
                # commonly lower-case (md5sum output), while get_file_md5()
                # returns upper case. Normalise before comparing so intact
                # files are not reported as damaged.
                expected = m.split()[0].upper()
                if md5 == expected:
                    print(file, '\n验证通过！\n')
                else:
                    print(file, '\n文件损坏！\n')
                    bad_file += 1
                break  # only the first matching line is considered
        else:
            # for/else: reached only when no reference line named the file.
            print('此文件没有找到对应的md5，因此跳过验证。')

    print('所有文件验证完成！')

    if bad_file != 0:
        print('共有 %s 个文件损坏，请重新下载损坏文件！' % bad_file)
    else:
        print('所有文件全部通过验证，可以直接安装！')


main()
```

## `Git` 使用教程

- VSCode上传本地项目到github https://www.cxyzjd.com/article/Le___Le/103585617
- https://blog.csdn.net/qq_32578989/article/details/87994300

### Step1: 在 GitHub 创建一个新的仓库，用于存储要提交的项目
```bash
cd workspace
```

### Step2: 与 GitHub 远程仓库建立联系
```bash
git init
git remote rm origin
git remote add origin https://github.com/blainetse/dataset_toolkits.git  # 此处填写 ssh/https 地址（要保存在 GitHub 的仓库位置）
git remote -v  # 查看状态
```

### Step3: push 到主分支
```bash
git pull origin master
```

注意：如果项目里已经有东西了，就可能会出现什么远程仓库和本地仓库不相关的错误，所以要
```shell
git pull origin master --allow-unrelated-histories
```
将 README等已有的文件强行拉下来！

```bash
git commit -m "注释内容"  # 注释内容为字符串格式，用于说明提交的状态等信息
```

如果有什么 nothing added to commit but untracked files present 的事，就直接 git add xxx.txt 或者 git add xxx/ 或者直接 git add -A 加所有，再 commit 

然后再 push 上去，git push -u origin master

## `Git` 配置问题记录

`ERROR: Repository not found. Fatal: Could not read from remote repository.`
  - https://blog.csdn.net/weixin_40886892/article/details/80725071