# Examples on parsing files using `dictionary` and `json`

The folder `images` contains Picasso's paintings in `jpg` format.

## Example 1: Parse image path 

The following function parses an image path (a string) and returns the year (int), title (str), and number (int) that are contained in the file name.


In [26]:
def parse_image_path(image_filename):
    '''
    Split an image file name of the form `<year>-<title>-<num>.<ext>` into its parts.

    Any leading directories are ignored; both '/' and '\\' are accepted as
    directory separators so POSIX- and Windows-style paths parse the same way.

    input:
        image_filename - str, name of (or path to) the image file
    output:
        year - int, year of painting (text before the first '-')
        title - str, name of painting (everything between the first and last '-',
                so hyphens inside the title are preserved)
        num - int, final number on filename (text between the last '-' and the
              next '.')
    raises:
        ValueError - if the year or number field is not an integer

    '''
    # Keep only the last path component, whichever separator style was used.
    base_name = image_filename.replace('\\', '/').split('/')[-1]
    fields = base_name.split('-')

    year = int(fields[0])
    # The last field looks like '35.jpg'; the number is the part before the first dot.
    num = int(fields[-1].split('.')[0])
    # Re-join the middle fields so a title containing '-' survives intact.
    title = "-".join(fields[1:-1])

    return year, title, num


In [11]:
# test function
# Each assignment below overwrites the previous one, so only the last
# filename (the one prefixed with the './images/' directory) is parsed.
image_filename = '2022-test.-5.jpg'
image_filename = '1906-Glassware._Still_Life_with_a_Porro.-35.jpg'
image_filename = '1908-Flowers_in_a_Grey_Jug_and__Wine-Glass_with_Spoon.-1.jpg'
image_filename='./images/1906-Glassware._Still_Life_with_a_Porro.-35.jpg'
# Prints the (year, title, num) tuple returned by the parser.
print(parse_image_path(image_filename))

(1906, 'Glassware._Still_Life_with_a_Porro.', 35)


## Example 2: Create a dictionary counting the paintings made in each year

This function looks in a directory and finds the names of all the jpg images inside it, parses the name of the images, and counts how many paintings were painted each year.

In [33]:
def image_year(directory):
    '''
    Count how many `.jpg` images in a directory belong to each year.

    The year of each image is taken from its file name via parse_image_path.

    input:
        directory - str or path-like, directory with images in it
                    (a trailing separator is optional)
    output:
        year_dict - dict, {year (int): number of paintings made in that year},
                    with keys in ascending order.
    '''
    import collections
    import glob
    import os

    # os.path.join inserts the separator only when it is missing, so both
    # './images' and './images/' produce the same pattern.
    filenames = glob.glob(os.path.join(directory, '*.jpg'))
    # Debug output kept from the original cell: reports the container type
    # returned by glob.
    print(type(filenames))

    # parse_image_path returns (year, title, num); only the year is tallied.
    year_counts = collections.Counter(
        parse_image_path(os.path.basename(filename))[0]
        for filename in filenames
    )

    # glob gives no ordering guarantee, so sort to get a reproducible result.
    year_dict = dict(sorted(year_counts.items()))
    return year_dict

In [32]:
# Path is only needed by the commented-out pathlib alternative below;
# the active code passes the directory as a plain string.
from pathlib import Path
# directory = Path.cwd() / 'images'
# Relative path of the image folder, written with a trailing separator.
directory = './images/'
# Last expression of the cell: the returned dict is shown as the cell output.
image_year(directory)

<class 'list'>


{1895: 2,
 1896: 2,
 1897: 2,
 1899: 1,
 1900: 1,
 1901: 11,
 1902: 1,
 1903: 8,
 1904: 2,
 1905: 12,
 1906: 9,
 1907: 7,
 1908: 13,
 1909: 9,
 1910: 3,
 1911: 1,
 1912: 6,
 1913: 4,
 1914: 2,
 1915: 1,
 1917: 1,
 1918: 2,
 1919: 3,
 1920: 2,
 1921: 4,
 1922: 3}

## Example 3: Count the word occurrences in Picasso titles

This function looks in a directory and finds the names of all the jpg images inside it, parses the names of the images, and counts the number of occurrences of each word in the titles. The characters `(`, `)`, `'`, and `,` are stripped from the titles before the words are added to the dictionary.


In [36]:
def image_word_frequency(directory):
    '''
    Count how often each word occurs in the titles of the `.jpg` images in a directory.

    Titles are obtained from the file names via parse_image_path. The
    characters ( ) ' and , are deleted from each title, which is then split
    into words on '_' and '-'. Periods are NOT removed, so 'Portrait' and
    'Portrait.' are counted as different words.

    input:
        directory - str or path-like, directory that contains the image files
                    (a trailing separator is optional)
    output:
        word_dict - dict, {'word' : occurrences, ...} - occurrence of word in title
    '''
    import collections
    import glob
    import os
    import re

    # Translation table that deletes ( ) ' and , in a single pass.
    strip_table = str.maketrans('', '', "()',")

    word_counts = collections.Counter()
    # os.path.join inserts the separator only when it is missing.
    for filename in glob.glob(os.path.join(directory, '*.jpg')):
        # parse_image_path returns (year, title, num); index 1 is the title.
        image_title = parse_image_path(os.path.basename(filename))[1]
        # NOTE: consecutive separators (e.g. '__') yield '' tokens, which are
        # counted like any other token.
        words = re.split(r'_|-', image_title.translate(strip_table))
        # Updating the counter in place avoids rebuilding an ever-growing list.
        word_counts.update(words)

    word_dict = dict(word_counts)
    return word_dict

In [37]:
# Relative path of the image folder, written with a trailing separator.
directory = './images/'
# Last expression of the cell: the returned word-count dict is shown as output.
image_word_frequency(directory)

{'First': 1,
 'Communion.': 1,
 'The': 16,
 'Barefoot': 1,
 'Girl.': 1,
 'Portrait': 12,
 'of': 20,
 'the': 6,
 'Artists': 1,
 'Mother.': 1,
 'Self': 4,
 'Portrait.': 2,
 'Matador': 1,
 'Luis': 1,
 'Miguel': 1,
 'Dominguin.': 1,
 'Science': 1,
 'and': 16,
 'Charity.': 1,
 'Lola': 1,
 'Picassos': 1,
 'Sister.': 1,
 'A': 1,
 'Spanish': 1,
 'Couple': 1,
 'in': 10,
 'front': 1,
 'an': 1,
 'Inn.': 1,
 'Death': 1,
 'Casagemas.': 1,
 'LAbsinthe.': 1,
 'Leaning': 1,
 'Harlequin.': 3,
 'Le': 1,
 'Gourmet.': 1,
 'Art': 1,
 'Dealer': 1,
 '': 11,
 'Pedro': 1,
 'Manach.': 1,
 'Blue': 2,
 'Period.': 1,
 'Absinthe': 2,
 'Drinker.': 2,
 'Woman': 8,
 'a': 24,
 'Hat.': 1,
 'with': 20,
 'Cigarette.': 1,
 'Chignon.': 1,
 'Visit': 1,
 'Two': 4,
 'Sisters.': 1,
 'Breakfast': 1,
 'Blind': 1,
 'Man.': 1,
 'Lascete.': 1,
 'La': 4,
 'Vie': 1,
 'Life.': 4,
 'Old': 2,
 'Beggar': 1,
 'Boy.': 1,
 'Young': 4,
 'Woman.': 4,
 'Soler.': 1,
 'Guitarist.': 1,
 'Tragedy.': 1,
 'Catalan': 1,
 'Sculptor': 1,
 'Manolo': 1,
 

## Example 4: Read JSON files

In the folder there is a file called `divvy_network.json` that contains data about individual Divvy stations. The following function calculates the average capacity of bikes across all Divvy stations.


In [38]:
def calc_average_station_capacity(filename):
    '''
    Compute the mean of the 'capacity' values stored in a station JSON file.

    The file is expected to hold an object with a 'nodes' list whose entries
    each carry a 'capacity' value.

    input:
        filename - str, name of divvy station file data
    output:
        average_capacity - float, average capacity across all divvy stations
    '''
    import json
    with open(filename, 'r') as source:
        network = json.load(source)
        # One capacity per station entry, in file order.
        capacities = [station['capacity'] for station in network['nodes']]

    import numpy
    return numpy.mean(capacities)


In [39]:
# Station data file, given relative to the current working directory.
filename = 'divvy_network.json'
# Last expression of the cell: the computed average is shown as the cell output.
calc_average_station_capacity(filename)

16.806020066889634

## Example 5: Write average and standard deviation station capacities to JSON file

The following function calculates the average capacity across stations and the standard deviation in capacity, stores both values in a dictionary, and writes it to a JSON file. The resulting JSON looks like:
```
{
 "mean": NUMBER,
 "std": NUMBER
}
```

In [40]:
def write_station_capacity_attributes(filename, save_filename):
    '''
    Summarise station capacities and store the summary in a JSON file.

    Reads the 'capacity' of every entry in the input file's 'nodes' list,
    computes their mean and standard deviation (numpy.mean / numpy.std with
    default arguments) and writes {'mean': ..., 'std': ...} to save_filename.

    input:
        filename: str, divvy file with data
        save_filename: str, name of json file to save
    return:
        save_filename: str, name of json file that you saved data in
    '''
    import json
    with open(filename, 'r') as source:
        network = json.load(source)
        # One capacity per station entry, in file order.
        capacities = [station['capacity'] for station in network['nodes']]

    import numpy
    stats = {
        'mean': numpy.mean(capacities),
        'std': numpy.std(capacities),
    }

    # The with-block closes the output file on exit.
    with open(save_filename, 'w') as target:
        json.dump(stats, target)
    return save_filename



In [41]:
# Input station data and the output file for the summary statistics,
# both given relative to the current working directory.
filename = 'divvy_network.json'
save_filename = 'divvy_capacity_stats.json'
# Last expression of the cell: the returned output file name is shown as output.
write_station_capacity_attributes(filename, save_filename)


'divvy_capacity_stats.json'