In [1]:
import pandas as pd
import sqlite3

### Creating SQLite Database

In [2]:
# Notebook-level connection and cursor on the library database file; the
# insert cells further down reuse both names.
con = sqlite3.connect(database='library_db.db')
curs = con.cursor()

In [3]:
# NOTE(review): this handle is never closed in the cells shown here, and
# execute_commands() opens the script itself from a path string, so this
# open() looks redundant — confirm no later cell reads from `sql_file`
# before removing it.
sql_file = open('sqlite.sql', "r")

In [4]:
def execute_commands(sql_file, sep, database):
    '''
    Execute every command stored in a .sql file against a SQLite database.

    The file content is split on ``sep`` and each resulting chunk is sent to
    SQLite as one statement. The statements are committed together once all
    of them have run; if any step fails, pending changes are rolled back and
    a message is printed instead of raising.

    Parameters
    ----------
    sql_file : str
        Path of the .sql script to run.
    sep : str
        Separator placed between two commands in the script
        (for example a blank line).
    database : str
        Path of the SQLite database file to connect to.

    Returns
    -------
    None
        The outcome is reported through ``print``.
    '''
    con = None
    try:
        con = sqlite3.connect(f'{database}')
        curs = con.cursor()
        # The context manager closes the script even when reading fails.
        with open(f'{sql_file}', "r") as script:
            content_sql = script.read().split(f'{sep}')
        for command in content_sql:
            curs.execute(command)
        # Without an explicit commit, data-modifying statements are discarded
        # as soon as this local connection is closed.
        con.commit()
        print('All commands were succesfully executed.')
    except Exception as error:
        # Best-effort by design: report instead of raising. Catching Exception
        # (rather than a bare except) lets KeyboardInterrupt/SystemExit through.
        if con is not None:
            con.rollback()
        print('There is something wrong. \nCheck parameters then try again.')
        print(f'Details: {error!r}')
    finally:
        if con is not None:
            con.close()

In [5]:
# Run the statements in sqlite.sql (split on blank lines) against
# library_db.db.
execute_commands(
    database='library_db.db',
    sql_file='sqlite.sql',
    sep='\n\n',
)

All commands were succesfully executed.


### Dataset

In [47]:
# Rows with the wrong number of fields are skipped with a warning rather than
# aborting the load. `error_bad_lines` was deprecated in pandas 1.3 and removed
# in 2.0; `on_bad_lines='warn'` is the replacement with the same skip-and-warn
# behaviour (requires pandas >= 1.3).
df = pd.read_csv('books.csv', on_bad_lines='warn')

b'Skipping line 3350: expected 12 fields, saw 13\nSkipping line 4704: expected 12 fields, saw 13\nSkipping line 5879: expected 12 fields, saw 13\nSkipping line 8981: expected 12 fields, saw 13\n'


In [50]:
df.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

### Function - Applying First Normal Form

In [135]:
def apply_fist_nf(df:pd.DataFrame, feature:str, sep:str) -> pd.DataFrame:
    '''
    Split multi-valued cells of one column into a frame of unique atomic values.

    A cell such as 'author_1/author_2' (with sep='/') contributes the separate
    values 'author_1' and 'author_2', so the returned column satisfies first
    normal form.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column to normalise. Any index is accepted.
    feature : str
        Name of the column whose cells may hold several values.
    sep : str
        Separator between the values inside one cell.

    Returns
    -------
    pd.DataFrame
        A single column named ``feature`` with each distinct value once, in
        order of first appearance, on a fresh 0..n-1 index.
        Missing cells (None/NaN) are skipped.
    '''
    seen = set()
    unique_values = []
    # Iterate over the values themselves, not over df.loc[i], so frames with a
    # filtered or non-integer index work too.
    for cell in df[feature]:
        if pd.isna(cell):
            continue
        for occurence in cell.split(sep):
            # The set gives O(1) membership checks; the list keeps the order.
            if occurence not in seen:
                seen.add(occurence)
                unique_values.append(occurence)
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame({feature: unique_values}, dtype=object)


### Publishers

In [141]:
# One row per distinct publisher value found in `df`, stored directly under
# the column name 'publisher'.
publisher_df = pd.DataFrame({'publisher': df['publisher'].unique()})

In [142]:
publisher_df.head()

Unnamed: 0,publisher
0,Scholastic Inc.
1,Scholastic
2,Nimble Books
3,Gramercy Books
4,Del Rey Books


In [None]:
# NOTE(review): `error_df` is not defined in any cell shown here and this cell
# has no execution count — it looks like a leftover from another notebook.
# Presumably it was meant to insert the `publisher_df` rows; confirm the target
# table and columns against sqlite.sql before running.
# As written: inserts one row per DataFrame record through a parameterized
# statement on the notebook-level cursor, then commits once after the loop.
for index, row in error_df.iterrows():
    curs.execute('INSERT INTO error (datetime, machine_id, error_type_id) VALUES (?,?,?)',\
        [row.datetime, row.machineID, row.errorID])
con.commit()

### Author

In [146]:
# One row per distinct value of the raw `authors` column; cells may still hold
# several names joined by '/' at this point.
author_df = pd.DataFrame({'author': df['authors'].unique()})

In [147]:
author_df.head()

Unnamed: 0,author
0,J.K. Rowling/Mary GrandPré
1,J.K. Rowling
2,W. Frederick Zimmerman
3,Douglas Adams
4,Douglas Adams/Stephen Fry


In [148]:
# Split the '/'-joined author cells so each row holds a single author name.
author_df = apply_fist_nf(
    df=author_df,
    feature='author',
    sep='/',
)

In [149]:
author_df.head()

Unnamed: 0,author
0,J.K. Rowling
1,Mary GrandPré
2,W. Frederick Zimmerman
3,Douglas Adams
4,Stephen Fry


### Book 

#### Treating book_df

In [27]:
# Rows with the wrong number of fields are skipped with a warning rather than
# aborting the load. `error_bad_lines` was deprecated in pandas 1.3 and removed
# in 2.0; `on_bad_lines='warn'` is the replacement with the same skip-and-warn
# behaviour (requires pandas >= 1.3).
book_df = pd.read_csv('books.csv', on_bad_lines='warn')

b'Skipping line 3350: expected 12 fields, saw 13\nSkipping line 4704: expected 12 fields, saw 13\nSkipping line 5879: expected 12 fields, saw 13\nSkipping line 8981: expected 12 fields, saw 13\n'


In [28]:
book_df.shape

(11123, 12)

In [29]:
book_df.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

In [30]:
# Drop the source identifier column and strip the leading spaces from the
# '  num_pages' header. Written as a non-mutating chain with errors='ignore'
# so re-running the cell is harmless instead of raising KeyError on the
# already-dropped 'bookID' column.
book_df = (
    book_df
    .drop(columns='bookID', errors='ignore')
    .rename(columns={'  num_pages': 'num_pages'})
)

In [32]:
book_df.columns

Index(['title', 'authors', 'average_rating', 'isbn', 'isbn13', 'language_code',
       'num_pages', 'ratings_count', 'text_reviews_count', 'publication_date',
       'publisher'],
      dtype='object')

### 