# Signatures Features Visualization using Deep Convolutional Autoencoder 

## Signatures Property Value Extraction from WikiData

In [1]:
import bz2
import json
import pandas as pd

In [2]:
def filter_wikidata_with_signatures(json_bz2_file, test_num_line_condition=False, 
                                    num_lines_to_read=10, print_json=False, 
                                    spacing_between_json=5, 
                                    verbose=False):
    """
    
    Filter the wikidata objects which have the signature property 'P109'.
    
    Parameters:
    -----------
    
    json_bz2_file: (string) Path to wikidata JSON file which is zipped in bz2 format.
    
    test_num_line_condition: (bool) Whether to test the condition for num_lines_to_read.
    
    num_lines_to_read: (integer) The number of lines to read from json_bz2_file. 
                       It includes lines which contain and do not contain the P109 property.
                       
    print_json: (bool) Print the JSON which contains the 'P109' property.
    
    spacing_between_json: (integer) The number of new lines to print between successing JSONs 
                          when print_json is True.
                          
    verbose: (bool) Whether to print the 'index of signature', id, name, and 'signature property value' 
             as and when they are filtered.
    
    Returns:
    --------
    
    Pandas DataFrame containing the object id, name and 'signature property value'.
    
    """
    if num_lines_to_read <= 0:
        return
    df = pd.DataFrame()
    wikidata = bz2.BZ2File(json_bz2_file, "r")    
    count = 0
    signature_count = 0
    for line in wikidata:
        if line.startswith("{") and (line.endswith("},\n") or line.endswith("}\n")):
            count += 1
            if "P109" in line:
                if line.endswith("},\n"):
                    line = line[:-2]
                elif line.endswith("}\n"):
                    line = line[:-1]            
                j = json.loads(line)                 
                try:
                    id = j['id']
                    name = j['labels']['en']['value']
                    signature = str(j['claims']['P109'][0]['mainsnak']['datavalue']['value'])   
                    signature_count += 1
                    if verbose:
                        print signature_count, id, name, signature
                    df = df.append([[id, name, signature]])
                except:
                    if test_num_line_condition:                        
                        if count == num_lines_to_read:
                            break
                    continue
                if print_json:
                    print json.dumps(j, sort_keys=True, indent=8, separators=(',', ':'))
                    print "\n" * spacing_between_json
        if test_num_line_condition:            
            if count == num_lines_to_read:
                break
    wikidata.close()
    df.columns = ['id', 'name', 'signature']
    return df

In [None]:
signatures_df = filter_wikidata_with_signatures("./data/wikidata-20161212-all.json.bz2")

In [None]:
signatures_df.to_csv("./data/signatures.csv", index=False, encoding='utf-8')

In [3]:
signatures_df = pd.read_csv("./data/signatures.csv")
signatures_df

Unnamed: 0,id,name,signature
0,Q23,George Washington,George Washington signature.svg
1,Q207,George W. Bush,GeorgeWBush Signature.png
2,Q326,Eduardo Frei Ruiz-Tagle,Firma Frei Ruiz-Tagle.png
3,Q368,Augusto Pinochet,Signature of Augusto Pinochet.svg
4,Q475,Eduardo Frei Montalva,Firma Frei Montalva.svg
5,Q501,Charles Baudelaire,Baudelaire signatur .svg
6,Q815,Gabriel Gonzáles Videla,Firma G. Gonzalez Videla.jpg
7,Q873,Meryl Streep,Meryl Streep Signature.svg
8,Q905,Franz Kafka,Franz Kafka's signature.svg
9,Q909,Jorge Luis Borges.,Jorge Luis Borges firma.svg


## Signature Filenames URL Extraction from WikiData using the Signature Property Values

In [4]:
from urllib import urlopen
from bs4 import BeautifulSoup as bs
import pickle

In [6]:
def extract_signature_url_from_wikidata(signatures_df, verbose=False):
    """
    
    Extract the URLs of the signature files of people.
    
    Parameters:
    -----------
    
    signatures_df: (Pandas Dataframe) columns - id, name, and 'signature property value' 
    
    verbose: (bool) If verbose is True, it prints the 'count of signature' and 'url of signature' 
             as and when they are found. 
             
             
    Returns:
    --------
    
    List of the URLs of the signatures in the order of rows in the signatures_df.
    
    
    """

    count = 0
    signature_url_list = []
    for row in xrange(signatures_df.shape[0]):
        count += 1
        id = signatures_df.iloc[row, 0]
        signature = signatures_df.iloc[row, 2]
        try:
            html = urlopen("https://www.wikidata.org/wiki/" + id)
            bsobj = BeautifulSoup(html.read(), "html")
            links = bsobj.findAll("a", {"class": "extiw"}, href=True)
            for link in links:
                link_content = link.get_text()
                if link_content == signature:
                    link_href = "https:" + link['href']
                    title = link_href[link_href.find("File:") + 5:]                
                    html2 = urlopen(link_href)
                    bsobj2 = BeautifulSoup(html2.read(), "html")
                    links2 = bsobj2.findAll("a", {"class": "internal"}, href=True)
                    link2 = links2[0]
                    signature_url = link2['href']
                    if verbose:
                        print count, signature_url
                    signature_url_list.append(signature_url)                             
                    break
        except BaseException as e:
            print id, signature, e
            
    if count == len(signature_url_list):
        print "Successfully found all file URLs"
    else:
        print "Failure in finding all file URLs"
            
    return signature_url_list
        

In [74]:
signature_url_list = extract_signature_url_from_wikidata(signatures_df, verbose=True)

1 https://upload.wikimedia.org/wikipedia/commons/2/2e/George_Washington_signature.svg
2 https://upload.wikimedia.org/wikipedia/commons/f/fd/GeorgeWBush_Signature.png
3 https://upload.wikimedia.org/wikipedia/commons/5/59/Firma_Frei_Ruiz-Tagle.png
4 https://upload.wikimedia.org/wikipedia/commons/8/8e/Signature_of_Augusto_Pinochet.svg
5 https://upload.wikimedia.org/wikipedia/commons/8/83/Firma_Frei_Montalva.svg
6 https://upload.wikimedia.org/wikipedia/commons/5/53/Baudelaire_signatur_.svg
7 https://upload.wikimedia.org/wikipedia/commons/1/14/Firma_G._Gonzalez_Videla.jpg
8 https://upload.wikimedia.org/wikipedia/commons/6/68/Meryl_Streep_Signature.svg
9 https://upload.wikimedia.org/wikipedia/commons/1/1f/Franz_Kafka%27s_signature.svg
10 https://upload.wikimedia.org/wikipedia/commons/b/b5/Jorge_Luis_Borges_firma.svg
11 https://upload.wikimedia.org/wikipedia/commons/8/8e/Roald_Amundsen_signature.jpg
12 https://upload.wikimedia.org/wikipedia/commons/d/d7/Albert_Einstein_signature_1934.svg
13 h

It is evident that due to network failure 6 links failed to be recorded. So, let's process these missing links manually.

In [81]:
temp_id = ["Q1936526", "Q1960369", "Q1964821", "Q2896509", "Q3570142", "Q5069656"]
temp_signature = ["Rasmus Malling-Hansen signature.jpg", 
                  "Handtekening Vincent van Gogh (1789-1874).jpg", 
                  "Karel VI. Schwarzenberg - podpis.gif", 
                  "Signature feroumont.png", 
                  "Wu Tao.jpg", 
                  "Amayak Kobulov Signature 1939.png"]
temp_name = ["", "", "", "", "", ""]
temp_df = pd.DataFrame()
temp_df['id'] = pd.Series(temp_id)
temp_df['name'] = pd.Series(temp_name)
temp_df['signature'] = pd.Series(temp_signature)
temp_df

Unnamed: 0,id,name,signature
0,Q1936526,,Rasmus Malling-Hansen signature.jpg
1,Q1960369,,Handtekening Vincent van Gogh (1789-1874).jpg
2,Q1964821,,Karel VI. Schwarzenberg - podpis.gif
3,Q2896509,,Signature feroumont.png
4,Q3570142,,Wu Tao.jpg
5,Q5069656,,Amayak Kobulov Signature 1939.png


In [82]:
temp_signature_url_list = extract_signature_url_from_wikidata(temp_df, verbose=True)

1 https://upload.wikimedia.org/wikipedia/commons/b/b8/Rasmus_Malling-Hansen_signature.jpg
2 https://upload.wikimedia.org/wikipedia/commons/3/3b/Handtekening_Vincent_van_Gogh_%281789-1874%29.jpg
3 https://upload.wikimedia.org/wikipedia/commons/d/d9/Karel_VI._Schwarzenberg_-_podpis.gif
4 https://upload.wikimedia.org/wikipedia/commons/d/d1/Signature_feroumont.png
5 https://upload.wikimedia.org/wikipedia/commons/2/21/Wu_Tao.jpg
6 https://upload.wikimedia.org/wikipedia/commons/d/df/Amayak_Kobulov_Signature_1939.png
Successfully found all file URLs


In [92]:
for i in xrange(len(temp_id)):
    index = signatures_df.loc[signatures_df['id'] == temp_id[i]].index[0]
    print index
    signature_url_list.insert(index, temp_signature_url_list[i])

1151
1152
1153
1250
1359
1462


In [94]:
with open("./data/signatures_url_list.pkl", "wb") as p:
    pickle.dump(signature_url_list, p)

In [7]:
with open("./data/signatures_url_list.pkl", "rb") as p:
    signature_url_list = pickle.load(p)

In [8]:
signature_url_list[:5]

['https://upload.wikimedia.org/wikipedia/commons/2/2e/George_Washington_signature.svg',
 'https://upload.wikimedia.org/wikipedia/commons/f/fd/GeorgeWBush_Signature.png',
 'https://upload.wikimedia.org/wikipedia/commons/5/59/Firma_Frei_Ruiz-Tagle.png',
 'https://upload.wikimedia.org/wikipedia/commons/8/8e/Signature_of_Augusto_Pinochet.svg',
 'https://upload.wikimedia.org/wikipedia/commons/8/83/Firma_Frei_Montalva.svg']

In [9]:
len(signature_url_list)

7223

In [98]:
del temp_df, temp_id, temp_name, temp_signature, temp_signature_url_list

## Download Images of Signatures from the found URLs

In [10]:
from urllib import urlretrieve
import os

In [13]:
def download_signatures(original_path_dir, signature_url_list, verbose=False):
    """
    
    Download images of signatures from the list of URLs provided.
    
    Parameters:
    -----------
    
    original_path_dir: (string) path of directory where the files 
                       are to be stored after downloading
                       
    signature_url_list: (list) list of urls of signatures which
                        are to be downloaded
                        
    verbose: (bool) whether to print download updates
    
    Returns:
    --------
    
    retvalue: (integer) 0 if success and 1 if failure
    
    urls_failed_to_download: (list) list of urls which failed due 
                             to some error possibly network connectivity problem
    
    """
    count = 0
    urls_failed_to_download = []    
    for url in signature_url_list:
        count += 1
        filename = url.split("/")[-1]
        path = original_path_dir + os.path.sep + filename
        try:
            if verbose:
                print "Downloading (%d) -> %s" % (count, url)
            urlretrieve(url, filename=path)
        except BaseException as e:
            urls_failed_to_download.append(url)
            print count, url, e
    
    retvalue = 0
    if count == signatures_df.shape[0]:
        print "Successfully downloaded all images of signature"
    else:
        print "Failure to download all images of signatures"
        retvalue = 1
       
    return retvalue, urls_failed_to_download

In [14]:
original_path_dir = "./data/signatures_images/original/"
retvalue, urls_failed_to_download = download_signatures(original_path_dir, 
                                                        signature_url_list,
                                                        verbose=True)

Downloading (1) -> https://upload.wikimedia.org/wikipedia/commons/2/2e/George_Washington_signature.svg
Downloading (2) -> https://upload.wikimedia.org/wikipedia/commons/f/fd/GeorgeWBush_Signature.png
Downloading (3) -> https://upload.wikimedia.org/wikipedia/commons/5/59/Firma_Frei_Ruiz-Tagle.png
Downloading (4) -> https://upload.wikimedia.org/wikipedia/commons/8/8e/Signature_of_Augusto_Pinochet.svg
Downloading (5) -> https://upload.wikimedia.org/wikipedia/commons/8/83/Firma_Frei_Montalva.svg
Downloading (6) -> https://upload.wikimedia.org/wikipedia/commons/5/53/Baudelaire_signatur_.svg
Downloading (7) -> https://upload.wikimedia.org/wikipedia/commons/1/14/Firma_G._Gonzalez_Videla.jpg
Downloading (8) -> https://upload.wikimedia.org/wikipedia/commons/6/68/Meryl_Streep_Signature.svg
Downloading (9) -> https://upload.wikimedia.org/wikipedia/commons/1/1f/Franz_Kafka%27s_signature.svg
Downloading (10) -> https://upload.wikimedia.org/wikipedia/commons/b/b5/Jorge_Luis_Borges_firma.svg
Download

After downloading the images of signatures, I observed that some signatures had text other than signature, contained seals, stamps, photos of face, etc. So, such images were rejected by manually going through all the images. 

In [16]:
accepted_path_dir = "./data/signatures_images/accepted/"
accepted_signature_filenames = os.listdir(accepted_path_dir)

In [28]:
len(accepted_signature_filenames)

6373

I have accepted 6373 images from 7223 images.