## Importing the necessary packages and libraries

In [2]:
pip install squarify


The following command must be run outside of the IPython shell:

    $ pip install squarify

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


In [1]:
import os
import urllib.request
import csv
import pandas as pd 
import numpy as np
import time
from datetime import datetime
import hashlib
import requests
import json
import math
import matplotlib.pyplot as plt
import squarify  #pip install squarify; in manin terminal

ModuleNotFoundError: No module named 'squarify'

### Task 1: Choosing an API

For this project, we have chosen the Marvel comics API that is provided by **www.marvel.com**.

This API and all its documentation is available on https://developer.marvel.com.

It provides information spanning **70+** years on the vast array of Marvel characters, which we will use for this project. The API allows 3000 calls per day and requires a public and a private key for access. The keys are issued to users who open a Marvel developer account, which we have already done.

In [3]:
# Marvel API credentials.
# Prefer supplying them through the MARVEL_PUBLIC_KEY / MARVEL_PRIVATE_KEY
# environment variables so that secrets do not have to live in the notebook.
# NOTE(review): the literal fallbacks are the keys that were already embedded
# in this notebook; they are kept only so existing runs keep working and
# should be rotated and removed.
public_key = os.environ.get("MARVEL_PUBLIC_KEY", "2b2960219996458e528056a83e42cd75")
private_key = os.environ.get("MARVEL_PRIVATE_KEY", "8dfe66157bc3873de9ad027a60efba8188aeaba7")

### Task 2: Getting the raw data

The Marvel API exposes information that can be fetched by characters, comics, creators, events, series and stories. We have taken data from all the types and saved it into different CSV files. This will then be useful for data manipulation and visualization.

In this section we will write the functions necessary to compile the data from these different types and store it into different files.

In [None]:
# Marvel API endpoints. HTTPS is used so that the API key and the request hash
# sent as query parameters are not transmitted in clear text.
CHARACTER_URL = 'https://gateway.marvel.com/v1/public/characters'
COMIC_URL = 'https://gateway.marvel.com/v1/public/comics'
CREATOR_URL = 'https://gateway.marvel.com/v1/public/creators'

#Timestamp,hash and encodings
def get_link_params():
    """Build the per-request authentication parameters for the Marvel API.

    Reads the module-level ``private_key`` and ``public_key``.

    Returns:
        dict: ``{'ts': <timestamp>, 'hash': <digest>}`` where the timestamp is
        the current Unix time in whole seconds (as a string) and the digest is
        the hexadecimal MD5 of ``ts + private_key + public_key``.
    """
    timestamp = str(int(time.time()))
    # The digest input is the UTF-8 encoding of the three parts concatenated.
    payload = (timestamp + private_key + public_key).encode('utf-8')
    digest = hashlib.md5(payload).hexdigest()
    return {'ts': timestamp, 'hash': str(digest)}

# Seconds to wait for the Marvel gateway before giving up on a single request.
_REQUEST_TIMEOUT_S = 60


def _signed_get(url, params):
    """Send one authenticated GET request to the Marvel API and return its JSON body.

    Args:
        url: Endpoint URL (e.g. ``CHARACTER_URL``).
        params: Query parameters specific to this call (``limit``, ``offset``).

    Returns:
        dict: The decoded JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # Use the shared public_key (rather than a repeated literal) so the apikey
    # always matches the key that get_link_params() hashes.
    query = {'apikey': public_key}
    query.update(params)
    # A fresh ts/hash pair is generated for every request.
    query.update(get_link_params())
    response = requests.get(url, params=query, timeout=_REQUEST_TIMEOUT_S)
    response.raise_for_status()
    return response.json()


def _download_all_pages(url, csv_path, page_size):
    """Download every record of a paged Marvel endpoint into a CSV file.

    The header row is taken from the keys of the first record. Nested values
    (dicts/lists) are written using their ``str()`` form, exactly as
    ``csv.writer`` would write them.

    The file is opened once, in write mode, and closed when the download
    finishes or fails. Re-running therefore replaces the file instead of
    appending a second copy of the data (and a second header row) to it.

    Args:
        url: Endpoint URL to page through.
        csv_path: Path of the CSV file to (over)write.
        page_size: Number of records requested per call; must be >= 1.

    Returns:
        int: Number of records written.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
        requests.HTTPError: If any request fails.
    """
    if page_size < 1:
        raise ValueError('page_size must be at least 1')

    # The first page doubles as the source of the total record count, which
    # saves one API call compared with probing and then re-requesting offset 0.
    page = _signed_get(url, {'limit': page_size, 'offset': 0})
    page_count = math.ceil(page['data']['total'] / page_size)

    written = 0
    # newline='' is what the csv module expects; without it extra blank lines
    # appear on platforms that translate line endings.
    with open(csv_path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = None
        for page_index in range(page_count):
            if page_index > 0:
                # offset = how many records to skip
                page = _signed_get(
                    url, {'limit': page_size, 'offset': page_size * page_index}
                )
            for record in page['data']['results']:
                if writer is None:
                    writer = csv.DictWriter(
                        csv_file,
                        fieldnames=list(record.keys()),
                        restval='',
                        extrasaction='ignore',
                    )
                    writer.writeheader()
                writer.writerow(record)
                written += 1
    return written


def paged_requests_characters(page_size = 100):
    """Download all Marvel characters into ``characters.csv``.

    Args:
        page_size: Number of characters requested per API call.
    """
    _download_all_pages(CHARACTER_URL, 'characters.csv', page_size)
    print('Done')


def paged_requests_comics(page_size = 50):
    """Report how many pages a full comics download would need.

    Only a single probing request is made. The loop that wrote ``comics.csv``
    was commented out in the original notebook, so no file is produced here.
    The response code, status, the exact page count and the rounded-up number
    of pages are printed.

    Args:
        page_size: Number of comics that would be requested per API call.
    """
    first_page = _signed_get(COMIC_URL, {'limit': page_size})
    total = first_page['data']['total']
    print(first_page['code'])
    print(first_page['status'])
    page_no = total / page_size
    print(page_no)
    # Round up: a trailing partial page still needs its own request.
    print(math.ceil(page_no))
    print('Done')


def paged_requests_creators(page_size = 100):
    """Download all Marvel creators into ``creators.csv``.

    Args:
        page_size: Number of creators requested per API call.
    """
    _download_all_pages(CREATOR_URL, 'creators.csv', page_size)
    print('Done')
    
    
# Download each dataset only when its CSV is not on disk yet, so re-running the
# notebook does not spend the daily API quota again or write the data twice.
# Delete a CSV file to force a fresh download of that dataset.
for csv_path, fetch in (
    ('characters.csv', paged_requests_characters),
    ('creators.csv', paged_requests_creators),
):
    if os.path.exists(csv_path):
        print(csv_path, 'already exists - skipping download')
    else:
        fetch()
#paged_requests_comics()



In [106]:
# Read the character dump and drop the columns that are not analysed.
characters = (
    pd.read_csv('characters.csv', index_col='id')
    .drop(columns=['description', 'thumbnail', 'resourceURI', 'urls'])
)

# Each of these columns holds the text form of a dict that starts with
# "{'available': N, ..."; keep only the number N.
for column in ('comics', 'series', 'stories', 'events'):
    characters[column] = pd.to_numeric(
        characters[column].str.extract(r"'available':\s*(\d+)", expand=False)
    )

# The timestamps carry UTC offsets, so parse them as timezone-aware UTC values.
# Unparseable entries become NaT instead of aborting the cell.
characters['modified'] = pd.to_datetime(
    characters['modified'], utc=True, errors='coerce'
)

# A timezone-aware "now" is required to subtract timezone-aware timestamps.
current = pd.Timestamp.now(tz='UTC')
print(current)
# From here on 'modified' holds the time elapsed since the last modification.
characters['modified'] = current - characters['modified']
characters['total'] = (
    characters['comics']
    + characters['series']
    + characters['stories']
    + characters['events']
)

# Only characters whose combined count exceeds this value are plotted.
POPULAR_TOTAL_THRESHOLD = 2000
characters_popular = characters[characters['total'] > POPULAR_TOTAL_THRESHOLD]

fig, ax = plt.subplots()
squarify.plot(
    sizes=characters_popular['total'],
    label=characters_popular['name'],
    alpha=.5,
    ax=ax,
)
ax.set_title(
    'Characters with comics + series + stories + events > %d'
    % POPULAR_TOTAL_THRESHOLD
)
ax.axis('off')
plt.show()
# #time.time()
# # characters.shape
# # characters.ndim
# # print(characters.isnull().sum())

Unnamed: 0_level_0,name,modified,comics,series,stories,events
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1011334,3-D Man,2014-04-29T14:18:17-0400,12,3,21,1
1017100,A-Bomb (HAS),2013-09-18T15:54:04-0400,3,1,7,0
1009144,A.I.M.,2013-10-17T14:41:30-0400,49,33,52,0
1010699,Aaron Stack,1969-12-31T19:00:00-0500,14,3,27,0
1009146,Abomination (Emil Blonsky),2012-03-20T12:32:12-0400,53,26,63,1


In [124]:
# Read the creator dump and drop the columns that are not analysed.
creators = (
    pd.read_csv('creators.csv', index_col='id')
    .drop(columns=['firstName', 'middleName', 'lastName', 'suffix',
                   'thumbnail', 'resourceURI', 'urls'])
)

# Each of these columns holds the text form of a dict that starts with
# "{'available': N, ..."; keep only the number N.
for column in ('comics', 'series', 'stories', 'events'):
    creators[column] = pd.to_numeric(
        creators[column].str.extract(r"'available':\s*(\d+)", expand=False)
    )

creators['total'] = (
    creators['comics']
    + creators['series']
    + creators['stories']
    + creators['events']
)

# Discard creators without any credited work. .copy() makes the result an
# independent frame so the column assignments below do not act on a slice of
# the unfiltered data (which triggers SettingWithCopyWarning).
creators = creators[creators['total'] != 0].copy()

# The timestamps carry UTC offsets, so parse them as timezone-aware UTC values.
# Unparseable entries become NaT.
creators['modified'] = pd.to_datetime(
    creators['modified'], utc=True, errors='coerce'
)

# A timezone-aware "now" is required to subtract timezone-aware timestamps.
current = pd.Timestamp.now(tz='UTC')
print(current)
# From here on 'modified' holds the time elapsed since the last modification.
creators['modified'] = current - creators['modified']
creators.head()

#print(creators)

2019-03-22 19:45:50.422711


Unnamed: 0_level_0,fullName,modified,comics,series,stories,events,total
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6606,A.R.K.,4462 days 14:45:50.422711,1,1,1,0,3
1168,All Thumbs Creative,241 days 03:55:30.422711,14,14,25,0,53
4592,Arno,4463 days 14:45:50.422711,4,2,4,0,10
3052,Avon,4462 days 14:45:50.422711,2,2,2,0,6
6535,B.K.,4462 days 14:45:50.422711,1,1,1,0,3
