## Importing Relevant Modules for Data Processing

In [7]:
import pandas as pd
import json
import re

## Importing Data from JSON file and Storing it as a list of Dictionary

In [8]:
data_lst = []
with open('ol_cdump.json') as f:
    dt = f.readlines()
for elm in dt:
    data_lst.append(json.loads(elm))

## Count the rows in the (raw) data set

In [10]:
len(data_lst)

148163

## Understanding Schema

In [20]:
data_lst[0]

{'latest_revision': 2,
 'revision': 2,
 'title': 'The effect of differentiated marking tools and motivational treatment on figural creativity',
 'languages': [{'key': '/languages/eng'}],
 'subjects': ['Creative thinking -- Testing', 'Educational psychology'],
 'publish_country': 'gau',
 'by_statement': 'by Lillian Rose Arnold',
 'type': {'key': '/type/edition'},
 'location': ['NBuC'],
 'other_titles': ['Marking tools', 'Motivational treatment'],
 'publishers': ['University of Georgia'],
 'last_modified': {'type': '/type/datetime',
  'value': '2009-12-15T08:04:07.512219'},
 'key': '/books/OL22783906M',
 'authors': [{'key': '/authors/OL6535896A'}],
 'publish_places': ['Athens'],
 'oclc_number': ['3954579'],
 'pagination': 'xi, 161 leaves',
 'created': {'type': '/type/datetime', 'value': '2008-12-30T07:38:13.854568'},
 'notes': {'type': '/type/text',
  'value': 'Microfilm of typescript. Ann Arbor, Mich. : University Microfilms, 1975. -- 1 reel ; 35 mm\n\nThesis--University of Georgia\n\nB

## Get the book with the most pages

In [13]:
most = 0
ith = 0
for elm in data_lst:
    if 'number_of_pages' in list(elm.keys()):
        if elm['number_of_pages'] > most:
            most = elm['number_of_pages']
            ith = elm
print(ith['title'])

Nihon shokuminchi kenchikuron


## Get the top 5 geners with most book

In [5]:
most = {}
for elm in data_lst:
    if 'genres' in list(elm.keys()):
        for gen in elm['genres']:
            if gen in list(most.keys()):
                most[gen] = most[gen] + 1
            else:
                most[gen] = 1
pd.Series(most).sort_values(ascending = False).head()

Fiction.                4587
Biography.              3191
Juvenile literature.    2319
Exhibitions.            1129
Juvenile fiction.        714
dtype: int64

## Authors who co authored most books

In [6]:
most_auth = []
for elm in data_lst:
    if 'authors' in list(elm.keys()):
        pat = '/authors/[\w]*'
        temp_lst = re.findall(pat, str(elm['authors']))
        for k in temp_lst:
            most_auth.append(k)
df = pd.DataFrame(most_auth,columns = ['Author']).groupby('Author').agg({'Author': 'count'})
df.columns = ['Author Count']
df.sort_values('Author Count', ascending = False).head(5)

Unnamed: 0_level_0,Author Count
Author,Unnamed: 1_level_1
/authors/OL4945084A,885
/authors/OL1224818A,273
/authors/OL171521A,148
/authors/OL4283462A,119
/authors/OL1926829A,111


## Per publish year, get the number of authors that published at least one book

In [21]:
publish_per_year = []
for elm in data_lst:
    if ('authors' in list(elm.keys())) and ('publish_date' in list(elm.keys())):
        year_List = re.findall('[0-9]{4}$', elm['publish_date'])
        if len(year_List) > 0:
            year = year_List[0]
        pat = '/authors/[\w]*'
        temp_lst = re.findall(pat, str(elm['authors']))
        for k in temp_lst:
            publish_per_year.append((k,year))
#May need to fix future Dates
df = pd.DataFrame(publish_per_year,columns = ['Author','Year']).groupby('Year')['Author'].nunique()\
.reset_index().sort_values('Year', ascending = False)
df[(df['Year'] > '1950') & (df['Year'] <='2021')]

Unnamed: 0,Year,Author
447,2013,2
446,2011,1
445,2010,295
444,2009,2672
443,2008,4210
...,...,...
390,1955,705
389,1954,745
388,1953,685
387,1952,691


## Find the number of authors and number of books published per month for years between 1950 and 1970

In [18]:
publish_per_year_month = []
for elm in data_lst:
    if ('authors' in list(elm.keys())) and ('publish_date' in list(elm.keys())):
        year_List = re.findall('[0-9]{4}$', elm['publish_date'])
        year = None
        if len(year_List) > 0:
            year = year_List[0]
        pat = '/authors/[\w]*'
        temp_lst = re.findall(pat, str(elm['authors']))
        for k in temp_lst:
            if year is not None:
                publish_per_year_month.append((k,year,elm['publish_date']))
df = pd.DataFrame(publish_per_year_month, columns = ['Author', 'Year', 'Publish_Date'])
df = df[(df['Year'].apply(lambda x : int(x)) >=  1950) & (df['Year'].apply(lambda x : int(x))<=  1970)]
df[df['Publish_Date'] == 'Septiembre de 1962'] = 'September 1962' 
df['Publish_Date'] = pd.to_datetime(df['Publish_Date'])
df['Month-Year'] = df['Publish_Date'].dt.month.apply(lambda z : str(z))+'-'+ df['Publish_Date'].dt.year.apply(lambda z : str(z))
ans_df = df.groupby('Month-Year').agg({'Author': 'count'})
ans_df['Total_Authors'] = df.groupby('Month-Year')['Author'].nunique()
ans_df.columns = ['Total_books', 'Total_Authors']
ans_df

Unnamed: 0_level_0,Total_books,Total_Authors
Month-Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1-1950,761,716
1-1951,775,741
1-1952,738,690
1-1953,713,682
1-1954,771,740
1-1955,743,700
1-1956,790,757
1-1957,876,832
1-1958,945,900
1-1959,1034,969
