## Data Cleaning

In [7]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn
import json
import gzip

In [3]:
%matplotlib inline

In [4]:
DATASET_PATH = '../../../datasets'
!ls $DATASET_PATH

books		  books_meta.json.gz  lexile.json
books_lexile.tar  books_ratings.csv   lexile.pkl


### Cleaning 'Lexile Ratings'

In [17]:
lexile_files = []
for i in range(0, 675):
    file_name = DATASET_PATH+'/books/books_'+str(i)+'.txt'
    lexile_files.append(file_name)

In [31]:
frames  = []
for lex_file in lexile_files:
    df = pd.read_json(lex_file)
    df = df[['title_s','englishLexileLevel_s']]
    frames.append(df)
super_df = pd.concat(frames) 
super_df.rename(index=str, columns={'title_s': 'title', 'englishLexileLevel_s': 'lexile'}, inplace=True)
super_df.shape

(48552, 2)

In [43]:
super_df.to_json(path_or_buf=DATASET_PATH+'/lexile.json', orient='records')

#### Checkpoint 1 <--- execute this after restart

In [5]:
lexile_df = pd.read_json(DATASET_PATH+'/lexile.json')

### Cleaning 'Metadata'

In [8]:
%%time

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

metadata_df = getDF(DATASET_PATH+'/books_meta.json.gz');

CPU times: user 6min 17s, sys: 6.95 s, total: 6min 24s
Wall time: 6min 25s


In [16]:
metadata_df.columns

Index([u'asin', u'salesRank', u'imUrl', u'categories', u'title',
       u'description', u'related', u'price', u'brand'],
      dtype='object')

In [21]:
metadata_df = metadata_df[['asin', 'title']]

In [22]:
metadata_df.to_pickle(path=DATASET_PATH+'/books_meta.pkl')

#### Checkpoint 2 <--- Execute this after restart

In [23]:
%%time

metadata_df = pd.read_pickle(DATASET_PATH+'/books_meta.pkl');

CPU times: user 2.16 s, sys: 232 ms, total: 2.4 s
Wall time: 2.4 s


### Merging Lexile and Metadata

In [35]:
print 'MetaData-> Total Titles: ', metadata_df['title'].size, '\tUnique Titles:', metadata_df['title'].unique().size

MetaData-> Total Titles:  2370585 	Unique Titles: 1860814


In [36]:
uni_meta_titles = metadata_df['title'].unique()

In [37]:
print 'Lexile-> Total Titles: ', lexile_df['title'].size, '\tUnique Titles:',lexile_df['title'].unique().size 

Lexile-> Total Titles:  48552 	Unique Titles: 45900


In [38]:
uni_lexile_titles = lexile_df['title'].unique()

In [42]:
common_titles = set(uni_meta_titles).intersection(set(uni_lexile_titles))

In [43]:
len(common_titles)

9910

In [45]:
common_titles

{u'Lily B. on the Brink of Love',
 u'Snow Bear',
 u'Sugar',
 u'The Roar',
 u'Nate the Great and the Big Sniff',
 u'Thank You, Miss Doover',
 u'Marie Antoinette, Serial Killer',
 u"Hank's Story",
 u'Umbrella',
 u'When Ratboy Lived Next Door',
 u'All the Broken Pieces',
 u'New Shoes, Red Shoes',
 u'My Book of Counting',
 u'Red Fox',
 u'Side Effects',
 u"Satchel Paige: Don't Look Back",
 u'Climbing',
 u'Remembering the Titanic',
 u'Not Without Laughter',
 u'Underdogs',
 u"Don't Know Much About Space",
 u'How Weird Is It?',
 u'Fat Cat: A Danish Folktale',
 u'Pups of the Spirit',
 u'You Make Me Smile',
 u'My Childhood Under Fire: A Sarajevo Diary',
 u'Scruffy',
 u"Maggie's Amerikay",
 u'Wolf Storm',
 u'Roxie and the Hooligans',
 u'Haunted Sister',
 u'Here Today',
 u'Starting School with an Enemy',
 u'Rats Saw God',
 u"Alice Ramsey's Grand Adventure",
 u'Franny Parker',
 u'Merlin',
 u'Bubble Bubble',
 u'Will Grayson, Will Grayson',
 u'Ellen Tebbits',
 u'Friendship According to Humphrey',
 u'