In [505]:
import os
import sys
# Make the local ../spiders/ modules importable from this notebook.
module_path = os.path.abspath('../spiders/')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [538]:
import pandas as pd
import numpy as np
import copy
import networkx as nx
import matplotlib.pyplot as plt
import logging

# Use 'Arial Unicode MS' as the sans-serif font; presumably chosen so the
# Chinese node names used below render in figures (assumes the font is
# installed on this machine -- TODO confirm).
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline

In [507]:
import categories
import images_downloader
import categories_service
import importlib

# Re-import the local modules so that edits made to their source files are
# picked up without restarting the kernel. The last call's return value is
# what the cell displays.
importlib.reload(categories)
importlib.reload(images_downloader)
importlib.reload(categories_service)

<module 'categories_service' from '/Users/ccuulinay/github_proj/garbage_classification/spiders/categories_service.py'>

In [508]:
from categories import SH_GARBAGE_CLS_CAT, GZ_GARBAGE_CLS_CAT

In [509]:
# Parse the raw 'sh' and 'gz' category definitions with the project helper.
# NOTE(review): parse_cls_cats is not visible here; the cells below treat each
# returned entry as a sequence of names ordered root -> leaf that may end with
# the marker 'SMLP' -- confirm against categories_service.
sh_names = categories_service.parse_cls_cats(SH_GARBAGE_CLS_CAT)
gz_names = categories_service.parse_cls_cats(GZ_GARBAGE_CLS_CAT)

In [510]:
len(gz_names)

163

In [511]:
# 'gz' category paths whose last element is the 'SMLP' marker.
_gz_samples = [path for path in gz_names if path[-1] == 'SMLP']

In [512]:
# 'sh' category paths whose last element is the 'SMLP' marker.
_sh_samples = [path for path in sh_names if path[-1] == 'SMLP']

In [513]:
len(_gz_samples)

130

In [560]:
# Root categories: paths that consist of a single name.
gz_root_nodes = [path[0] for path in gz_names if len(path) == 1]
sh_root_nodes = [path[0] for path in sh_names if len(path) == 1]

# Sample nodes: the name sitting right before the trailing 'SMLP' marker.
gz_sample_nodes = [path[-2] for path in gz_names if path[-1] == 'SMLP']
sh_sample_nodes = [path[-2] for path in sh_names if path[-1] == 'SMLP']

In [561]:
sh_root_nodes

['可回收物', '有害垃圾', '湿垃圾', '干垃圾', '大件垃圾', '电子废弃物']

In [562]:
def parse_path_list_into_uvd(ops_ls):
    """Flatten category paths into edge records for a relationship graph.

    Parameters
    ----------
    ops_ls : iterable of sequences of str
        Each entry is a path of node names ordered from the root category
        down to the leaf. A path may end with the marker string ``"SMLP"``,
        which flags the last real node as a sample of its direct parent.

    Returns
    -------
    list of list
        Rows of ``[source, target, name, value, key]`` where ``key`` is
        ``"source->name->target"``:

        * one ``"is_sample_of"`` row (leaf -> parent, value ``"valid"``) for
          every marked path whose leaf has a parent;
        * for every adjacent ``(parent, child)`` pair, a ``"set_relation"``
          row child -> parent with value ``"子类"`` and a row
          parent -> child with value ``"父类"``.

        Paths with fewer than two elements produce no rows. Duplicate rows
        are not removed here.
    """
    sample_marker = "SMLP"
    uvd_list = []
    for item in ops_ls:
        if len(item) < 2:
            # Empty paths and lone root names carry no relationship.
            continue
        if item[-1] == sample_marker:
            item = item[:-1]
            # A marker placed directly after a single name (["root", "SMLP"])
            # leaves no parent to be a sample of; indexing item[-2] there
            # used to raise IndexError, so the sample row is skipped instead.
            if len(item) >= 2:
                v, u = item[-2], item[-1]
                name = "is_sample_of"
                uvd_list.append([u, v, name, "valid", "->".join([u, name, v])])
        for parent, child in zip(item, item[1:]):
            name = "set_relation"
            uvd_list.append([child, parent, name, "子类", "->".join([child, name, parent])])
            uvd_list.append([parent, child, name, "父类", "->".join([parent, name, child])])
    return uvd_list

# Edge records ([source, target, name, value, key] rows) for each city's paths.
gz_uvds = parse_path_list_into_uvd(gz_names)
sh_uvds = parse_path_list_into_uvd(sh_names)

In [563]:
# Tabulate the edge records and drop exact duplicate rows (the same
# parent/child pair can occur in several paths), renumbering the index.
gz_uvd_df = (
    pd.DataFrame(gz_uvds, columns=["source", "target", "name", "value", "key"])
    .drop_duplicates()
    .reset_index(drop=True)
)
sh_uvd_df = (
    pd.DataFrame(sh_uvds, columns=["source", "target", "name", "value", "key"])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [564]:
def build_garbage_MDG(ops_df):
    """Build a MultiDiGraph in which every relationship is its own edge.

    ``ops_df`` must expose the columns ``source``, ``target``, ``name``,
    ``value`` and ``key``. Each row becomes an edge ``source -> target``
    stored under the row's ``key`` as the edge key, with ``name`` and
    ``value`` as edge data, so several relationships between the same pair
    of nodes live side by side as parallel edges.

    Edges are added one at a time on purpose. Per the original author's
    note, calling::

        nx.from_pandas_edgelist(
            ops_df, "source", "target",
            edge_attr=["name", "value", "key"],
            create_using=nx.MultiDiGraph(),
        )

    first adds each (source, target) edge under a default key and then only
    updates that edge's attributes, so the "key" column is not used as the
    edge key.
    """
    graph = nx.MultiDiGraph()
    for row in ops_df.itertuples():
        graph.add_edge(
            row.source,
            row.target,
            key=row.key,
            name=row.name,
            value=row.value,
        )

    return graph

def build_garbage_DG(ops_ls):
    """Build a DiGraph that encodes relationships as edge attributes.

    Parameters
    ----------
    ops_ls : iterable of sequences of str
        Paths of node names ordered from the root category down to the
        leaf, optionally terminated by the marker string ``"SMLP"``.

    Returns
    -------
    networkx.DiGraph
        * a single-name path adds that name as an isolated node;
        * every adjacent ``(parent, child)`` pair adds the edge
          child -> parent with ``set_relation="子类"`` and the edge
          parent -> child with ``set_relation="父类"``;
        * a marked path additionally sets ``is_sample="Y"`` on the edge
          leaf -> parent. A DiGraph keeps one edge per ordered node pair, so
          that edge ends up carrying both ``is_sample`` and ``set_relation``.
    """
    sample_marker = "SMLP"
    garbage_graph = nx.DiGraph()
    for item in ops_ls:
        if len(item) >= 2 and item[-1] == sample_marker:
            item = item[:-1]
            # ["root", "SMLP"] trims to a single name with no parent;
            # indexing item[-2] there used to raise IndexError, so the
            # sample edge is only added when a parent exists.
            if len(item) >= 2:
                sc, sample = item[-2], item[-1]
                garbage_graph.add_edge(sample, sc, is_sample="Y")
        if len(item) == 1:
            garbage_graph.add_node(item[0])
            continue
        # Empty paths fall through: zip over nothing adds no edges.
        for parent, child in zip(item, item[1:]):
            garbage_graph.add_edge(child, parent, set_relation="子类")
            garbage_graph.add_edge(parent, child, set_relation="父类")
    return garbage_graph


In [565]:
# DiGraph variant (build_garbage_DG), currently disabled:
# gz_garbage_graph = build_garbage_DG(gz_names)
# sh_garbage_graph = build_garbage_DG(sh_names)
# MultiDiGraph variant: one keyed edge per relationship row.
gz_garbage_mdg = build_garbage_MDG(gz_uvd_df)
sh_garbage_mdg = build_garbage_MDG(sh_uvd_df)

In [566]:
# gz_garbage_graph.get_edge_data("玉米衣", "水果硬壳")
gz_garbage_mdg.get_edge_data("玉米衣", "水果硬壳")

{'玉米衣->is_sample_of->水果硬壳': {'name': 'is_sample_of', 'value': 'valid'},
 '玉米衣->set_relation->水果硬壳': {'name': 'set_relation', 'value': '子类'}}

In [567]:
# Set node attributes for root domains and searchable samples.

def _tag_nodes(graph, node_names, flag, city):
    """Set ``flag`` to True and ``city`` on every listed node present in ``graph``."""
    for node in node_names:
        if not graph.has_node(node):
            continue
        graph.nodes[node][flag] = True
        graph.nodes[node]['city'] = city


_tag_nodes(gz_garbage_mdg, gz_root_nodes, 'is_root_domain', "gz")
_tag_nodes(sh_garbage_mdg, sh_root_nodes, 'is_root_domain', "sh")
_tag_nodes(gz_garbage_mdg, gz_sample_nodes, 'is_searchable_sample', "gz")
_tag_nodes(sh_garbage_mdg, sh_sample_nodes, 'is_searchable_sample', "sh")

In [568]:
print(gz_garbage_mdg.nodes['可回收物'])
print(sh_garbage_mdg.nodes['可回收物'])
print(gz_garbage_mdg.nodes['LED灯'])
print(sh_garbage_mdg.nodes['废不锈钢'])

{'is_root_domain': True, 'city': 'gz'}
{'is_root_domain': True, 'city': 'sh'}
{'is_searchable_sample': True, 'city': 'gz'}
{'is_searchable_sample': True, 'city': 'sh'}


In [569]:
def get_in_edges_from_dg(ops_g, watch_node_name, watch_edge_attr, watch_edge_attr_val):
    """Return the source nodes of matching edges that point at a node.

    Iterates ``ops_g.in_edges(watch_node_name, data=watch_edge_attr)`` and
    keeps the first element of every edge tuple whose last element (the
    value of ``watch_edge_attr`` on that edge) equals
    ``watch_edge_attr_val``. The result is a list in iteration order.
    """
    sources = []
    for edge in ops_g.in_edges(watch_node_name, data=watch_edge_attr):
        if edge[-1] == watch_edge_attr_val:
            sources.append(edge[0])
    return sources

def get_out_edges_from_dg(ops_g, watch_node_name, watch_edge_attr, watch_edge_attr_val):
    """Return the target nodes of matching edges that leave a node.

    Iterates ``ops_g.out_edges(watch_node_name, data=watch_edge_attr)`` and
    keeps the second element of every edge tuple whose last element (the
    value of ``watch_edge_attr`` on that edge) equals
    ``watch_edge_attr_val``. The result is a list in iteration order.
    """
    targets = []
    for edge in ops_g.out_edges(watch_node_name, data=watch_edge_attr):
        if edge[-1] == watch_edge_attr_val:
            targets.append(edge[1])
    return targets


def get_in_edges_from_mdg(ops_g, watch_node_name, watch_edge_name, watch_edge_val, **kwargs):
    """Return the source nodes of matching in-edges of a directed multigraph.

    An edge matches when its data dict has ``"name" == watch_edge_name`` and
    ``"value" == watch_edge_val``. Extra keyword arguments are accepted and
    ignored.

    Raises
    ------
    Exception
        If ``ops_g`` is not both a multigraph and directed.
    """
    is_multi = ops_g.is_multigraph()
    is_directed = ops_g.is_directed()
    if not (is_multi and is_directed):
        raise Exception("Input graph is either not a directed graph nor multi graph.")

    sources = []
    for edge in ops_g.in_edges(watch_node_name, data=True):
        attrs = edge[-1]
        if attrs.get("name") == watch_edge_name and attrs.get("value") == watch_edge_val:
            sources.append(edge[0])
    return sources
        
        
def get_out_edges_from_mdg(ops_g, watch_node_name, watch_edge_name, watch_edge_val, **kwargs):
    """Return the target nodes of matching out-edges of a directed multigraph.

    An edge matches when its data dict has ``"name" == watch_edge_name`` and
    ``"value" == watch_edge_val``. Extra keyword arguments are accepted and
    ignored.

    Raises
    ------
    Exception
        If ``ops_g`` is not both a multigraph and directed.
    """
    is_multi = ops_g.is_multigraph()
    is_directed = ops_g.is_directed()
    if not (is_multi and is_directed):
        raise Exception("Input graph is either not a directed graph nor multi graph.")

    targets = []
    for edge in ops_g.out_edges(watch_node_name, data=True):
        attrs = edge[-1]
        if attrs.get("name") == watch_edge_name and attrs.get("value") == watch_edge_val:
            targets.append(edge[1])
    return targets

In [570]:
def get_root_domain(ops_g, node_name):
    """Find the root-domain category that a node belongs to.

    Starting at ``node_name``, follows out-edges whose ``name`` is
    ``"set_relation"`` and whose ``value`` is ``"子类"`` (node -> parent)
    until it reaches a node whose ``is_root_domain`` attribute is truthy.
    When a node has several parents, the first one returned by
    ``get_out_edges_from_mdg`` is followed.

    Parameters
    ----------
    ops_g : directed multigraph
        Graph with ``has_node``, ``nodes`` and the edge layout expected by
        ``get_out_edges_from_mdg``.
    node_name : hashable
        Node to start from.

    Returns
    -------
    The root-domain node name, or ``None`` (after logging an error) when the
    start node is not in the graph, when the chain ends at a node that has
    no parent and is not a root, or when the parent chain loops.
    """
    # Check that the given node is in the graph.
    if not ops_g.has_node(node_name):
        logging.error("{} is not in graph.".format(node_name))
        return None

    # Iterative walk with a visited set, so cyclic data cannot recurse forever.
    visited = set()
    current = node_name
    while current not in visited:
        visited.add(current)

        # A falsy is_root_domain must not stop the walk: .get() treats a
        # missing flag and an explicit False the same way.
        if ops_g.nodes[current].get("is_root_domain"):
            return current

        parents = get_out_edges_from_mdg(ops_g, current, "set_relation", "子类")
        if not parents:
            logging.error(
                "{} has no parent category and is not a root domain.".format(current)
            )
            return None
        current = parents[0]

    logging.error(
        "Parent chain of {} loops back to {}; no root domain found.".format(node_name, current)
    )
    return None

# Test
print(get_root_domain(gz_garbage_mdg, "咖啡渣"))
print(get_root_domain(sh_garbage_mdg, "咖啡渣"))

厨余垃圾
湿垃圾


In [407]:
# nx.get_edge_attributes(gz_garbage_graph,"is_sample")

In [457]:
# Given a node, list its child classes: the sources of in-edges whose
# set_relation is "子类" (child -> this node).
# The DiGraph is built here because `gz_garbage_graph` is otherwise only
# created by a commented-out line in an earlier cell, which makes this cell
# fail with NameError on a fresh kernel.
ops_g = build_garbage_DG(gz_names)
watch_node_name = "骨头贝壳"
watch_edge_attr = "set_relation"
watch_edge_attr_val = "子类"


# The DiGraph and the MultiDiGraph lookups are expected to agree.
print(get_in_edges_from_dg(ops_g, watch_node_name, watch_edge_attr, watch_edge_attr_val))
print(get_in_edges_from_mdg(gz_garbage_mdg, watch_node_name, watch_edge_attr, watch_edge_attr_val))

['动物筒骨头骨', '蚝壳', '贝壳', '螺蛳壳']
['动物筒骨头骨', '蚝壳', '贝壳', '螺蛳壳']


In [475]:
# nx.single_source_shortest_path(sh_garbage_mdg, "电子废弃物")

In [474]:
# Check if there is a path to root class
nx.has_path(gz_garbage_mdg,"路面清扫的灰土", "其他垃圾")

True

In [435]:
# NOTE(review): the second argument of nx.get_node_attributes is an attribute
# *name*, and the result maps node -> value for nodes carrying that attribute.
# A node name is passed here, so the result is {} as shown below. To inspect
# this node's own attributes use gz_garbage_mdg.nodes["路面清扫的灰土"].
nx.get_node_attributes(gz_garbage_mdg, "路面清扫的灰土")

{}

In [436]:
nx.single_source_shortest_path(gz_garbage_mdg, "路面清扫的灰土")

{'路面清扫的灰土': ['路面清扫的灰土'],
 '清扫渣土': ['路面清扫的灰土', '清扫渣土'],
 '其他垃圾': ['路面清扫的灰土', '清扫渣土', '其他垃圾'],
 '路面清扫的树叶': ['路面清扫的灰土', '清扫渣土', '路面清扫的树叶'],
 '混杂': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '混杂'],
 '污损': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '污损'],
 '易混淆的纸类': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '易混淆的纸类'],
 '塑料': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '塑料'],
 '废旧衣服及其他纺织品': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '废旧衣服及其他纺织品'],
 '废弃日用品': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '废弃日用品'],
 '骨头贝壳': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '骨头贝壳'],
 '水果硬壳': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '水果硬壳'],
 '坚果': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '坚果'],
 '陶瓷制品': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '陶瓷制品'],
 '猫砂': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '混杂', '猫砂'],
 '宠物粪便': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '混杂', '宠物粪便'],
 '烟头': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '混杂', '烟头'],
 '干燥剂': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '混杂', '干燥剂'],
 '废弃化妆品': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '混杂', '废弃化妆品'],
 '毛发': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '混杂', '毛发'],
 '破损碗碟': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '混杂', '破损碗碟'],
 '创可贴': ['路面清扫的灰土', '清扫渣土', '其他垃圾', '混杂', '创可贴'],

In [477]:
gz_garbage_mdg.has_node('电子废弃物')

False

In [114]:
# Load the existing image-file metadata table for SH.
# NOTE(review): the preview below shows an "Unnamed: 0" column, presumably an
# index written out when the CSV was saved; consider index_col=0 if nothing
# downstream relies on that column -- confirm before changing.
_sh_images_meta_f = "../_filename_dict.csv"
_sh_images_meta_df = pd.read_csv(_sh_images_meta_f)

In [499]:
_sh_images_meta_df.head()

Unnamed: 0,_filename,_format,sample,spider_id,level0,level1,is_readable_image,is_completed_image
0,干垃圾_编织袋_36.jpeg,.jpeg,编织袋,36,干垃圾,,True,True
1,湿垃圾_食材废料_水产_91.jpg,.jpg,水产,91,湿垃圾,食材废料,True,True
2,湿垃圾_过期食品_肉干_7.jpg,.jpg,肉干,7,湿垃圾,过期食品,True,True
3,干垃圾_橡皮泥_20.jpg,.jpg,橡皮泥,20,干垃圾,,True,True
4,有害垃圾_废含汞温度计、废含汞血压计_水银体温计_84.jpg,.jpg,水银体温计,84,有害垃圾,废含汞温度计、废含汞血压计,True,True
