In [1]:
import ast
import os

import numpy as np
import pandas as pd

from datareader import read_csv

In [2]:
# Directory holding the Kaggle CSV files. It can be overridden with the
# BOOK_RECOMMENDER_DATA environment variable so the notebook also runs on
# machines where the default path does not exist.
folder = os.environ.get(
    "BOOK_RECOMMENDER_DATA",
    "/Users/flint/Data/kaggle/books/book-recommender",
)
# File names, relative to `folder`.
metadata_file = "collaborative_book_metadata.csv"
ratings_file = "collaborative_books_df.csv"

def to_float(x):
    """Coerce ``x`` to a ``float`` using the built-in constructor.

    Suitable as a ``converters`` callback for ``pd.read_csv``. Any exception
    raised by ``float`` for an unparsable value propagates unchanged.
    """
    value = float(x)
    return value

def transform(x, f):
    """Apply the callable ``f`` to ``x`` and return the result."""
    result = f(x)
    return result

# Resolve both CSV locations once, so the read calls stay short.
metadata_path = os.path.join(folder, metadata_file)
ratings_path = os.path.join(folder, ratings_file)

# Both files are read the same way: first row is the header, first column is the index.
csv_layout = {'index_col': 0, 'header': 0}

# Force the listed columns through `to_float` while parsing.
metadata = pd.read_csv(metadata_path,
                       converters={'ratings_count': to_float},
                       **csv_layout)
ratings = pd.read_csv(ratings_path,
                      converters={'Actual Rating': to_float},
                      **csv_layout)

In [3]:
metadata.head(2)

Unnamed: 0,book_id,title,image_url,url,num_pages,ratings_count,description,genre,name,book_id_mapping
0,5899779,Pride and Prejudice and Zombies Pride and Prej...,https://images.gr-assets.com/books/1320449653m...,https://www.goodreads.com/book/show/5899779-pr...,320,105537.0,The New York Times Best Seller is now a major ...,"['fantasy, paranormal', 'romance', 'fiction', ...",Jane Austen,808
1,872333,Blue Bloods Blue Bloods 1,https://images.gr-assets.com/books/1322281515m...,https://www.goodreads.com/book/show/872333.Blu...,302,117633.0,"When the Mayflower set sail in 1620, it carrie...","['young-adult', 'fantasy, paranormal', 'romanc...",Melissa de la Cruz,217


In [4]:
ratings.head(2)

Unnamed: 0,title,book_id,user_id_mapping,book_id_mapping,Predicted Rating,Actual Rating
0,I Am the Messenger,19057,1537,299,4.5,5.0
1,I Am the Messenger,19057,23039,299,4.9,3.0


## Data manipulation
- selezionare la colonna genere
- parsing: dalla `stringa` alla `lista`
- scorrimento delle liste e creazione del mapping: book_id <-> singolo genere

**Selezionare la colonna genere**

In [5]:
genre_strings = metadata['genre'].values

In [6]:
genre_strings[:4]

array(["['fantasy, paranormal', 'romance', 'fiction', 'history, historical fiction, biography', 'young-adult', 'mystery, thriller, crime']",
       "['young-adult', 'fantasy, paranormal', 'romance', 'fiction', 'mystery, thriller, crime']",
       "['romance', 'fiction']",
       "['mystery, thriller, crime', 'fiction']"], dtype=object)

**Parsing**

In [7]:
# Each cell holds the textual repr of a Python list of strings, e.g.
# "['romance', 'fiction']". `ast.literal_eval` parses that literal safely
# (no code execution) and, unlike slicing off the brackets and splitting on
# "', '", it maps "[]" to an empty list and copes with items quoted with ".
genre_list = []
for s in genre_strings:
    # Missing cells are read by pandas as NaN (a float), not as a string:
    # treat them as "no genres" instead of failing.
    str_list = ast.literal_eval(s) if isinstance(s, str) else []
    genre_list.append(str_list)

In [8]:
genre_list[:4]

[['fantasy, paranormal',
  'romance',
  'fiction',
  'history, historical fiction, biography',
  'young-adult',
  'mystery, thriller, crime'],
 ['young-adult',
  'fantasy, paranormal',
  'romance',
  'fiction',
  'mystery, thriller, crime'],
 ['romance', 'fiction'],
 ['mystery, thriller, crime', 'fiction']]

**With list comprehension**

In [None]:
# Same parsing as the explicit loop, written as a comprehension.
# `ast.literal_eval` turns the list repr into a real list ("[]" becomes []);
# non-string cells (NaN for missing values) become an empty list.
genre_list = [ast.literal_eval(x) if isinstance(x, str) else [] for x in genre_strings]

**Creazione del mapping book <-> genre**

In [9]:
# One (book_id, single_genre) pair per genre label. Entries such as
# 'fantasy, paranormal' bundle several labels and are split on ', '.
# `books` and `genre_list` both come from `metadata`, row by row, so they
# can be walked in parallel.
books = metadata['book_id'].values
mappings = [
    (book_id, label)
    for book_id, grouped_genres in zip(books, genre_list)
    for grouped in grouped_genres
    for label in grouped.split(', ')
]

In [10]:
mappings[:4]

[(5899779, 'fantasy'),
 (5899779, 'paranormal'),
 (5899779, 'romance'),
 (5899779, 'fiction')]

In [11]:
book_genres = pd.DataFrame(mappings, columns=['book', 'genre'])

In [12]:
book_genres 

Unnamed: 0,book,genre
0,5899779,fantasy
1,5899779,paranormal
2,5899779,romance
3,5899779,fiction
4,5899779,history
...,...,...
548,29069989,young-adult
549,29069989,children
550,29069989,mystery
551,29069989,thriller


In [13]:
# Book metadata without the raw `genre` column.
# errors='ignore' keeps the original tolerance for a frame with no such column.
meta = metadata.drop(columns=['genre'], errors='ignore')

In [15]:
meta.head(2)

Unnamed: 0,book_id,title,image_url,url,num_pages,ratings_count,description,name,book_id_mapping
0,5899779,Pride and Prejudice and Zombies Pride and Prej...,https://images.gr-assets.com/books/1320449653m...,https://www.goodreads.com/book/show/5899779-pr...,320,105537.0,The New York Times Best Seller is now a major ...,Jane Austen,808
1,872333,Blue Bloods Blue Bloods 1,https://images.gr-assets.com/books/1322281515m...,https://www.goodreads.com/book/show/872333.Blu...,302,117633.0,"When the Mayflower set sail in 1620, it carrie...",Melissa de la Cruz,217


In [16]:
ratings.head(2)

Unnamed: 0,title,book_id,user_id_mapping,book_id_mapping,Predicted Rating,Actual Rating
0,I Am the Messenger,19057,1537,299,4.5,5.0
1,I Am the Messenger,19057,23039,299,4.9,3.0


In [17]:
book_genres.head(2)

Unnamed: 0,book,genre
0,5899779,fantasy
1,5899779,paranormal


In [None]:
book_genres[book_genres['book'] == 19057]

## Preparazione per il database 
Vogliamo ottenere uno schema con le seguenti tabelle, assumendo che un libro abbia un solo autore:
```
books(id, title, pages, author, rating_count, description)
rating(book, user, rating)
genre(book, genre)
```

In [48]:
# Project the ratings frame onto the `rating(book, user, rating)` schema.
# NOTE(review): the `rating` column is fed from 'Predicted Rating', while the
# loader applies its float converter to 'Actual Rating' -- confirm which of
# the two is meant to be stored.
rating_db = ratings[['book_id', 'user_id_mapping', 'Predicted Rating']].rename(
    columns={
        'book_id': 'book',
        'user_id_mapping': 'user',
        'Predicted Rating': 'rating',
    }
)
rating_db.head()

Unnamed: 0,book,user,rating
0,19057,1537,4.5
1,19057,23039,4.9
2,19057,39096,3.9
3,19057,14631,4.7
4,19057,32816,4.3


In [49]:
# Keep only the columns of the books table and give them their database names.
# `title` and `description` already carry the right name.
books_db = meta[
    ['book_id', 'title', 'num_pages', 'ratings_count', 'description', 'name']
].rename(
    columns={
        'book_id': 'id',
        'num_pages': 'pages',
        'ratings_count': 'rating_count',
        'name': 'author',
    }
)
books_db.head(2)

Unnamed: 0,id,title,pages,rating_count,description,author
0,5899779,Pride and Prejudice and Zombies Pride and Prej...,320,105537.0,The New York Times Best Seller is now a major ...,Jane Austen
1,872333,Blue Bloods Blue Bloods 1,302,117633.0,"When the Mayflower set sail in 1620, it carrie...",Melissa de la Cruz


## Creazione del database e popolamento delle tabelle
1. inizializzare sqlalchemy engine 
2. connessione col db 
3. scrittura e lettura (tramite pandas)
4. chiusura connessione

In [50]:
from sqlalchemy import create_engine, text

In [51]:
# The SQLite file lives next to the CSVs. Build its location from `folder`
# instead of repeating the absolute path; with the default absolute folder
# this yields the same URL as before ("sqlite:///" + "/Users/...").
db_path = os.path.join(folder, "booksdb.sql")
engine = create_engine("sqlite:///" + db_path)

In [52]:
# Write the three tables inside a single transaction. `engine.begin()`
# commits when the block completes, rolls back if any `to_sql` raises, and
# always releases the connection. A bare connect()/close() pair would leak
# the connection on error.
with engine.begin() as conn:
    book_genres.to_sql('genre', con=conn, if_exists='replace', index=False)
    rating_db.to_sql('rating', con=conn, if_exists='replace', index=False)
    books_db.to_sql('books', con=conn, if_exists='replace', index=False)

## Leggere da DB

In [45]:
# Join query: author, page count and genre rows for the book(s) whose title
# matches the given string exactly.
# NOTE(review): in the cells visible here only `sql2` is passed to
# `pd.read_sql`; `sql` is defined but not executed.
sql = """
SELECT B.author, B.pages, G.genre
FROM books AS B JOIN genre AS G
ON B.id = G.book
WHERE B.title = 'Freakonomics A Rogue Economist Explores the Hidden Side of Everything Freakonomics 1'
"""
# Number of rows in the `rating` table, returned in a single column `rating_num`.
sql2 = "SELECT COUNT(*) AS rating_num FROM rating"

In [46]:
# Run the query on a short-lived connection. The context manager closes it
# even if `read_sql` raises, which the manual connect()/close() pair did not.
with engine.connect() as conn:
    query_answer = pd.read_sql(sql=text(sql2), con=conn)

In [47]:
query_answer

Unnamed: 0,rating_num
0,196296
