In [1]:
#Importing the packages
#pandas for data handling
#matplotlib for visualization
#numpy for crunching numbers
#sklearn and scipy for statistical methods

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
import sklearn
from scipy import stats
%matplotlib inline


In [12]:
#import data from csv file, r to convert string to raw format

df_train = pd.read_csv(r'C:\data\books.csv')

In [11]:
#checking the columns

df_train.columns

Index(['Identifier', 'Edition Statement', 'Place of Publication',
       'Date of Publication', 'Publisher', 'Title', 'Author', 'Contributors',
       'Corporate Author', 'Corporate Contributors', 'Former owner',
       'Engraver', 'Issuance type', 'Flickr URL', 'Shelfmarks'],
      dtype='object')

In [14]:
#Let's have a short look at the data in the different columns using head()
df_train.head()

Unnamed: 0,Identifier,Edition Statement,Place of Publication,Date of Publication,Publisher,Title,Author,Contributors,Corporate Author,Corporate Contributors,Former owner,Engraver,Issuance type,Flickr URL,Shelfmarks
0,206,,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,"FORBES, Walter.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12641.b.30.
1,216,,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12626.cc.2.
2,218,,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12625.dd.1.
3,472,,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.","Appleyard, Ernest Silvanus.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 10369.bbb.15.
4,480,"A new edition, revised, etc.",London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.","BROOME, John Henry.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 9007.d.28.


In [13]:
#Depending on what we would like to analyse and how we would like to narrow dimensionality, we can get a view of what
#columns provide the most useful data. Looks like Edition statement, Corporate Author, Corporate Contributors, Former
#owner and Engraver are columns that might be less reliable and informative.

#We can also see that the data in the Place of Publication and Title field is somewhat unstructured  

#Checking for missing data

missing = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
table = pd.concat([missing, percent], axis=1, keys=['Missing', 'Percent'])
table.head(30)

Unnamed: 0,Missing,Percent
Engraver,8287,1.0
Corporate Contributors,8287,1.0
Corporate Author,8287,1.0
Former owner,8286,0.999879
Edition Statement,7514,0.906721
Publisher,4195,0.506215
Author,1778,0.214553
Date of Publication,181,0.021841
Shelfmarks,0,0.0
Flickr URL,0,0.0


In [18]:
#Here we can see that for the top three columns, there is no data recorded. Also, for Former owner and Edition statement,
#the available data is very limited. It would not make any sense to try to fill in data in missing fields here. For this
#project, I will drop these columns

df_train.drop(['Engraver', 'Corporate Contributors', 'Corporate Author', 'Former owner', 'Edition Statement'], inplace=True, axis=1)

In [19]:
#Let's check the data again:

df_train.head()

Unnamed: 0,Identifier,Place of Publication,Date of Publication,Publisher,Title,Author,Contributors,Issuance type,Flickr URL,Shelfmarks
0,206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,"FORBES, Walter.",monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12641.b.30.
1,216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12626.cc.2.
2,218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12625.dd.1.
3,472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.","Appleyard, Ernest Silvanus.",monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 10369.bbb.15.
4,480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.","BROOME, John Henry.",monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 9007.d.28.


In [20]:
#In order to be able to do clean calculations on the Date of Publication, we have to enforce a single format. 
#E.g. changing 1879[1878] to either 1879 or 1878

#Let's check the format of the column:

df_train.dtypes

Identifier               int64
Place of Publication    object
Date of Publication     object
Publisher               object
Title                   object
Author                  object
Contributors            object
Issuance type           object
Flickr URL              object
Shelfmarks              object
dtype: object

In [21]:
#Next step, narrow the date data to the first four digits:

extr = df_train['Date of Publication'].str.extract(r'^(\d{4})', expand=False)
df_train['Date of Publication'] = pd.to_numeric(extr)
df_train['Date of Publication'].dtype


dtype('float64')

In [23]:
#previously, the percentage of missing fields where about 2 percent

df_train['Date of Publication'].isnull().sum() / len (df_train)

0.11717147339205986

In [None]:
#Now, unfortunately, it's at 12 percent. The solution would be to go over the dataset, and check for other solutions than
#just picking the four first letters.