In [126]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import string
from datetime import datetime, timedelta
import psycopg2
from psycopg2 import sql
from sqlalchemy import create_engine

fake = Faker('id_ID')

data = pd.read_csv('data/data.csv')

In [44]:
data.shape

(48152, 6)

# Dataset libraries

| library_id | PK | INT NOT NULL UNIQUE |
|------------|----|---------------------|
| library_name | | VARCHAR(255) NOT NULL UNIQUE |
| library_location | | VARCHAR(45) NOT NULL UNIQUE |

library_id use 3 digit numbers


In [45]:
libraries_data = {
    'library_id' : [234, 233, 543, 985, 657],
    'library_name' : ['Perpustakaan Jaya I', 'Perpustakaan Jaya II', 'Perpustakaan Jaya III', 'Perpustakaan Jaya IV', 'Perpustakaan Jaya V'],
    'library_location' : ['Jakarta Utara', 'Jakarta Pusat', 'Jakarta Selatan', 'Jakarta Timur', 'Jakarta Barat']
}

libraries = pd.DataFrame(
    data = libraries_data
)

libraries.to_csv('data/libraries.csv', index=False)

# Dataset users

| user_id | PK | INT NOT NULL UNIQUE |
|---------|----|---------------------|
| username|    | VARCHAR(45) NOT NULL UNIQUE |
| password|    | VARCHAR(45) NOT NULL |
| email   |    | VARCHAR(45) NOT NULL UNIQUE |
| name    |    | VARCHAR(45) NOT NULL |

user_id use 10 digit numbers

200 users generated

In [46]:
users = pd.DataFrame()
n_users = 5647

In [47]:
def get_ids():
    return ''.join(random.choice(string.digits[1:]) for _ in range(4))

user_id = []
while len(user_id) != n_users:
    id = get_ids()
    if id not in user_id:
        user_id.append(id)
        

In [48]:
username = []
password = []
email = []
name = []

while len(username) != n_users:
    usn = fake.user_name()
    if usn not in username:
        username.append(usn)
        
while len(email) != n_users:
    mail = fake.email()
    if mail not in email:
        email.append(mail)
        
while len(password) != n_users:
    pwd = fake.password()
    password.append(pwd)
        
while len(name) != n_users:
    names = fake.name()
    if names not in name:
        name.append(names)

In [49]:
users['user_id'] = user_id
users['username'] = username
users['password'] = password
users['email'] = email
users['name'] = name

users.to_csv('data/users.csv', index=False)
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647 entries, 0 to 5646
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   5647 non-null   object
 1   username  5647 non-null   object
 2   password  5647 non-null   object
 3   email     5647 non-null   object
 4   name      5647 non-null   object
dtypes: object(5)
memory usage: 220.7+ KB


In [50]:
users = users.drop_duplicates(subset='user_id')
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647 entries, 0 to 5646
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   5647 non-null   object
 1   username  5647 non-null   object
 2   password  5647 non-null   object
 3   email     5647 non-null   object
 4   name      5647 non-null   object
dtypes: object(5)
memory usage: 220.7+ KB


# Dataset books

| book_id | PK | INT NOT NULL UNIQUE |
|---------|----|---------------------|
| title | | VARCHAR(255) NOT NULL |
| author | | VARCHAR(255) NOT NULL |
| publisher | | VARCHAR(255) NOT NULL |
| publish_date | | DATE |
| ISBN | | INT |

book_id use 5 digit numbers

In [51]:
books = pd.DataFrame()

In [52]:
def get_ids():
    return ''.join(random.choice(string.digits[1:]) for n in range(5))

book_id = []
while len(book_id) != 48152:
    id = get_ids()
    if id not in book_id:
        book_id.append(id)

In [53]:
books['book_id'] = book_id
books['title'] = data['title']
books['author'] = data['author']
books['publisher'] = data['publisher']
books['publish_date'] = data['publish_date']
books['ISBN'] = data['ISBN']

books = books.replace('9999999999999', np.nan)

books.to_csv('data/books.csv', index=False)
books.info()
books.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48152 entries, 0 to 48151
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   book_id       48152 non-null  object
 1   title         48152 non-null  object
 2   author        48152 non-null  object
 3   publisher     48152 non-null  object
 4   publish_date  48152 non-null  object
 5   ISBN          45243 non-null  object
dtypes: object(6)
memory usage: 2.2+ MB


Unnamed: 0,book_id,title,author,publisher,publish_date,ISBN
0,97531,The Hunger Games,Suzanne Collins,Scholastic Press,2008-09-14,9780439023481.0
1,54539,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré (Illustrator)",Scholastic Inc.,2004-09-28,9780439358071.0
2,74578,To Kill a Mockingbird,Harper Lee,Harper Perennial Modern Classics,2006-05-23,
3,78539,Pride and Prejudice,"Jane Austen, Anna Quindlen (Introduction)",Modern Library,2000-10-10,
4,71943,Twilight,Stephenie Meyer,"Little, Brown and Company",2006-09-06,9780316015844.0


In [54]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48152 entries, 0 to 48151
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   book_id       48152 non-null  object
 1   title         48152 non-null  object
 2   author        48152 non-null  object
 3   publisher     48152 non-null  object
 4   publish_date  48152 non-null  object
 5   ISBN          45243 non-null  object
dtypes: object(6)
memory usage: 2.2+ MB


# Dataset book_categories

| book_id | PK | INT NOT NULL UNIQUE |
|---------|----|---------------------|
| categories | | ARRAY NOT NULL |


In [55]:
book_categories = pd.DataFrame()

books = pd.read_csv('data/books.csv')
data = pd.read_csv('data/data.csv')

max_length = data['categories'].apply(len).max()
    
print(max_length)

198


In [56]:
book_categories = pd.DataFrame()

book_categories['book_id'] = books['book_id']
book_categories['categories'] = data['categories']

book_categories.to_csv('data/book_categories.csv', index=False)
book_categories.head()

Unnamed: 0,book_id,categories
0,97531,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas..."
1,54539,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',..."
2,74578,"['Classics', 'Fiction', 'Historical Fiction', ..."
3,78539,"['Classics', 'Fiction', 'Romance', 'Historical..."
4,71943,"['Young Adult', 'Fantasy', 'Romance', 'Vampire..."


# Dataset book_availability

| book_availability_id | PK | INT NOT NULL UNIQUE |
|----------------------|----|---------------------|
| library_id | FK | INT NOT NULL |
| book_id | FK | INT NOT NULL |
| copies | | INT |


In [57]:
book_availability = pd.DataFrame()

In [58]:
libraries = pd.read_csv('data/libraries.csv')
books = pd.read_csv('data/books.csv')

In [59]:
def get_ids():
    return ''.join(random.choice(string.digits[1:]) for n in range(9))

book_availability_id = []
while len(book_availability_id) != 50_000:
    id = get_ids()
    if id not in book_availability_id:
        book_availability_id.append(id)

In [60]:
library_id = []
for _ in range(50_000):
    random_lib = random.choice(libraries['library_id'])
    library_id.append(random_lib)
    
book_id = []
for _ in range(50_000):
    random_b = random.choice(books['book_id'])
    book_id.append(random_b)
    
copies = []
for _ in range(50_000):
    random_c = random.choice(range(1,10))
    copies.append(random_c)

In [61]:
book_availability['book_availability_id'] = book_availability_id
book_availability['library_id'] = library_id
book_availability['book_id'] = book_id
book_availability['copies'] = copies

book_availability.to_csv('data/book_availability.csv', index=False)
book_availability.head()

Unnamed: 0,book_availability_id,library_id,book_id,copies
0,673675739,543,39765,9
1,129619664,543,22359,8
2,932726744,543,97795,1
3,444752824,657,88317,8
4,766562751,543,34442,4


# Dataset loans

| loan_id | PK | INT NOT NULL UNIQUE |
|---------|----|---------------------|
| user_id | FK | INT NOT NULL |
| book_id | FK | INT NOT NULL |
| book_availability_id | FK | INT NOT NULL |
| total_copies | | INT NOT NULL CHECK(>=1 AND <=2) |
| loan_date | | TIMESTAMP NOT NULL |
| due_date | | TIMESTAMP NOT NULL |
| return_date | | TIMESTAMP NOT NULL |


In [62]:
books = pd.read_csv('data/books.csv')
book_availability = pd.read_csv('data/book_availability.csv')
users = pd.read_csv('data/users.csv')

books = pd.merge(books, book_availability, on='book_id')
books = books[books['copies'] < 3]

books.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11079 entries, 2 to 49996
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   book_id               11079 non-null  int64 
 1   title                 11079 non-null  object
 2   author                11079 non-null  object
 3   publisher             11079 non-null  object
 4   publish_date          11079 non-null  object
 5   ISBN                  10366 non-null  object
 6   book_availability_id  11079 non-null  int64 
 7   library_id            11079 non-null  int64 
 8   copies                11079 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 865.5+ KB


## January

In [63]:
jan_loan = pd.DataFrame()
jan_loaners = random.randint(50,100)

def generate_fake_timestamp():
    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 1, 31)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp
        
loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']


jan_loan['user_id'] = loan_user_id
jan_loan['book_id'] = loan_book_id

jan_loan = pd.merge(jan_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
jan_loan = jan_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(jan_loan))]

jan_loan['loan_date'] = loan_date
jan_loan['loan_date'] = pd.to_datetime(jan_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

jan_loan = jan_loan.sort_values(by=['loan_date'], ascending=True)
jan_loan = jan_loan.drop_duplicates(subset=['book_id','book_availability_id'], keep='first')
jan_loan['due_date'] = jan_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

jan_loan['return_date'] = jan_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
jan_loan['return_date'] = jan_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

jan_loan = jan_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
jan_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 73 to 88
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               91 non-null     int32         
 1   book_id               91 non-null     int32         
 2   book_availability_id  91 non-null     int32         
 3   total_copies          91 non-null     int32         
 4   loan_date             91 non-null     datetime64[ns]
 5   due_date              91 non-null     datetime64[ns]
 6   return_date           91 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.3+ KB


In [64]:
jan_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
73,6999,16232,284787677,1,2023-01-01 09:14:07,2023-01-15 09:14:07,2023-01-13 18:10:08
22,2719,25268,277377247,1,2023-01-01 21:05:13,2023-01-15 21:05:13,2023-01-14 22:18:31
89,4748,98926,897722137,1,2023-01-02 16:46:50,2023-01-16 16:46:50,2023-01-11 19:12:38
76,3844,64575,134435244,2,2023-01-02 16:48:50,2023-01-16 16:48:50,2023-01-16 00:44:48
19,2499,66538,755748586,1,2023-01-02 17:42:29,2023-01-16 17:42:29,2023-01-05 04:09:58


## February

In [65]:
feb_loan = pd.DataFrame()
feb_loaners = random.randint(50,100)

def generate_fake_timestamp():
    start_date = datetime(2023, 2, 1)
    end_date = datetime(2023, 2, 28)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp
        
loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

feb_loan['user_id'] = loan_user_id
feb_loan['book_id'] = loan_book_id

feb_loan = pd.merge(feb_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
feb_loan = feb_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(feb_loan))]

feb_loan['loan_date'] = loan_date
feb_loan['loan_date'] = pd.to_datetime(feb_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

feb_loan = feb_loan.sort_values(by=['loan_date'], ascending=True)
feb_loan = feb_loan.drop_duplicates(subset=['book_id', 'book_availability_id'], keep='first')
feb_loan['due_date'] = feb_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

feb_loan['return_date'] = feb_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
feb_loan['return_date'] = feb_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

feb_loan = feb_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
feb_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93 entries, 26 to 51
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               93 non-null     int32         
 1   book_id               93 non-null     int32         
 2   book_availability_id  93 non-null     int32         
 3   total_copies          93 non-null     int32         
 4   loan_date             93 non-null     datetime64[ns]
 5   due_date              93 non-null     datetime64[ns]
 6   return_date           93 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.4+ KB


In [66]:
feb_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
26,4378,63364,519614751,1,2023-02-01 07:58:23,2023-02-15 07:58:23,2023-02-13 10:48:26
64,2716,52627,755634141,2,2023-02-01 08:00:09,2023-02-15 08:00:09,2023-02-08 10:27:50
35,4126,17972,732781188,1,2023-02-01 11:18:34,2023-02-15 11:18:34,2023-02-14 22:18:39
86,1478,97896,972312153,2,2023-02-01 13:44:57,2023-02-15 13:44:57,2023-02-05 17:37:09
19,7461,85543,497428773,2,2023-02-01 14:50:01,2023-02-15 14:50:01,2023-02-14 13:34:20


## March

In [67]:
mar_loan = pd.DataFrame()
mar_loaners = random.randint(50,100)

def generate_fake_timestamp():
    start_date = datetime(2023, 3, 1)
    end_date = datetime(2023, 3, 31)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp
        
loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

mar_loan['user_id'] = loan_user_id
mar_loan['book_id'] = loan_book_id

mar_loan = pd.merge(mar_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
mar_loan = mar_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(mar_loan))]

mar_loan['loan_date'] = loan_date
mar_loan['loan_date'] = pd.to_datetime(mar_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

mar_loan = mar_loan.sort_values(by=['loan_date'], ascending=True)
mar_loan = mar_loan.drop_duplicates(subset=['book_id','book_availability_id'], keep='first')
mar_loan['due_date'] = mar_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

mar_loan['return_date'] = mar_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
mar_loan['return_date'] = mar_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

mar_loan = mar_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
mar_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 73 to 26
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               91 non-null     int32         
 1   book_id               91 non-null     int32         
 2   book_availability_id  91 non-null     int32         
 3   total_copies          91 non-null     int32         
 4   loan_date             91 non-null     datetime64[ns]
 5   due_date              91 non-null     datetime64[ns]
 6   return_date           91 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.3+ KB


In [68]:
mar_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
73,4239,77281,637357317,2,2023-03-01 07:15:02,2023-03-15 07:15:02,2023-03-10 14:30:05
37,2493,98946,175847879,1,2023-03-01 08:07:02,2023-03-15 08:07:02,2023-03-07 11:41:29
72,8744,27315,565924248,1,2023-03-01 11:45:58,2023-03-15 11:45:58,2023-03-04 18:34:09
25,2919,13862,534793462,2,2023-03-01 12:49:05,2023-03-15 12:49:05,2023-03-12 03:51:00
52,4419,14278,571479397,1,2023-03-02 05:18:19,2023-03-16 05:18:19,2023-03-13 10:32:11


## April

In [69]:
apr_loan = pd.DataFrame()
apr_loaners = random.randint(50,100)

def generate_fake_timestamp():
    start_date = datetime(2023, 4, 1)
    end_date = datetime(2023, 4, 30)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp
        
loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

apr_loan['user_id'] = loan_user_id
apr_loan['book_id'] = loan_book_id

apr_loan = pd.merge(apr_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
apr_loan = apr_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(apr_loan))]

apr_loan['loan_date'] = loan_date
apr_loan['loan_date'] = pd.to_datetime(apr_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

apr_loan = apr_loan.sort_values(by=['loan_date'], ascending=True)
apr_loan = apr_loan.drop_duplicates(subset=['book_id', 'book_availability_id'], keep='first')
apr_loan['due_date'] = apr_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

apr_loan['return_date'] = apr_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
apr_loan['return_date'] = apr_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

apr_loan = apr_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
apr_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 4 to 83
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               91 non-null     int32         
 1   book_id               91 non-null     int32         
 2   book_availability_id  91 non-null     int32         
 3   total_copies          91 non-null     int32         
 4   loan_date             91 non-null     datetime64[ns]
 5   due_date              91 non-null     datetime64[ns]
 6   return_date           91 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.3+ KB


In [70]:
apr_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
4,3936,86448,584488222,2,2023-04-01 04:09:52,2023-04-15 04:09:52,2023-04-13 22:23:16
47,2157,65541,195578735,1,2023-04-01 07:20:49,2023-04-15 07:20:49,2023-04-08 01:36:47
52,7444,29929,726957125,1,2023-04-02 01:09:52,2023-04-16 01:09:52,2023-04-04 07:30:29
87,7412,25995,667746764,2,2023-04-02 06:05:26,2023-04-16 06:05:26,2023-04-09 09:44:54
70,4957,29659,737844488,2,2023-04-02 07:45:28,2023-04-16 07:45:28,2023-04-16 07:17:51


## May

In [71]:
may_loan = pd.DataFrame()
may_loaners = random.randint(50,100)
        
def generate_fake_timestamp():
    start_date = datetime(2023, 5, 1)
    end_date = datetime(2023, 5, 31)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp

loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

may_loan['user_id'] = loan_user_id
may_loan['book_id'] = loan_book_id

may_loan = pd.merge(may_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
may_loan = may_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(may_loan))]

may_loan['loan_date'] = loan_date
may_loan['loan_date'] = pd.to_datetime(may_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

may_loan = may_loan.sort_values(by=['loan_date'], ascending=True)
may_loan = may_loan.drop_duplicates(subset=['book_id','book_availability_id'], keep='first')
may_loan['due_date'] = may_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

may_loan['return_date'] = may_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
may_loan['return_date'] = may_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

may_loan = may_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
may_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 64 to 57
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               91 non-null     int32         
 1   book_id               91 non-null     int32         
 2   book_availability_id  91 non-null     int32         
 3   total_copies          91 non-null     int32         
 4   loan_date             91 non-null     datetime64[ns]
 5   due_date              91 non-null     datetime64[ns]
 6   return_date           91 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.3+ KB


In [72]:
may_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
64,8644,87279,754371786,2,2023-05-01 02:47:34,2023-05-15 02:47:34,2023-05-02 14:35:11
18,8952,14582,176933422,2,2023-05-01 12:21:05,2023-05-15 12:21:05,2023-05-14 00:05:09
39,7379,23978,468732731,1,2023-05-01 13:29:18,2023-05-15 13:29:18,2023-05-13 00:22:30
19,5268,94593,238665433,2,2023-05-01 18:12:05,2023-05-15 18:12:05,2023-05-13 15:12:41
15,1671,18288,828444462,1,2023-05-01 23:13:12,2023-05-15 23:13:12,2023-05-08 08:40:54


## June

In [73]:
jun_loan = pd.DataFrame()
jun_loaners = random.randint(50,100)
        
def generate_fake_timestamp():
    start_date = datetime(2023, 6, 1)
    end_date = datetime(2023, 6, 30)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp

loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

jun_loan['user_id'] = loan_user_id
jun_loan['book_id'] = loan_book_id

jun_loan = pd.merge(jun_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
jun_loan = jun_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(jun_loan))]

jun_loan['loan_date'] = loan_date
jun_loan['loan_date'] = pd.to_datetime(jun_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

jun_loan = jun_loan.sort_values(by=['loan_date'], ascending=True)
jun_loan = jun_loan.drop_duplicates(subset=['book_id','book_availability_id'], keep='first')
jun_loan['due_date'] = jun_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

jun_loan['return_date'] = jun_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
jun_loan['return_date'] = jun_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

jun_loan = jun_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
jun_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 94 entries, 62 to 80
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               94 non-null     int32         
 1   book_id               94 non-null     int32         
 2   book_availability_id  94 non-null     int32         
 3   total_copies          94 non-null     int32         
 4   loan_date             94 non-null     datetime64[ns]
 5   due_date              94 non-null     datetime64[ns]
 6   return_date           94 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.4+ KB


In [74]:
jun_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
62,9645,77281,459594237,1,2023-06-01 05:34:26,2023-06-15 05:34:26,2023-06-05 03:25:21
35,4529,18744,236373627,1,2023-06-01 08:02:57,2023-06-15 08:02:57,2023-06-14 09:50:16
48,1394,71548,993682987,1,2023-06-01 14:57:44,2023-06-15 14:57:44,2023-06-06 02:16:08
15,5416,24774,791967452,2,2023-06-01 20:11:24,2023-06-15 20:11:24,2023-06-09 04:54:07
6,6392,42995,966859147,1,2023-06-02 03:47:15,2023-06-16 03:47:15,2023-06-12 17:29:12


## July

In [75]:
jul_loan = pd.DataFrame()
jul_loaners = random.randint(50,100)

def generate_fake_timestamp():
    start_date = datetime(2023, 7, 1)
    end_date = datetime(2023, 7, 31)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp
        
loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

jul_loan['user_id'] = loan_user_id
jul_loan['book_id'] = loan_book_id

jul_loan = pd.merge(jul_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
jul_loan = jul_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(jul_loan))]

jul_loan['loan_date'] = loan_date
jul_loan['loan_date'] = pd.to_datetime(jul_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

jul_loan = jul_loan.sort_values(by=['loan_date'], ascending=True)
jul_loan = jul_loan.drop_duplicates(subset=['book_id','book_availability_id'], keep='first')
jul_loan['due_date'] = jul_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

jul_loan['return_date'] = jul_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
jul_loan['return_date'] = jul_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

jul_loan = jul_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
jul_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 92 entries, 63 to 24
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               92 non-null     int32         
 1   book_id               92 non-null     int32         
 2   book_availability_id  92 non-null     int32         
 3   total_copies          92 non-null     int32         
 4   loan_date             92 non-null     datetime64[ns]
 5   due_date              92 non-null     datetime64[ns]
 6   return_date           92 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.3+ KB


In [76]:
jul_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
63,1694,65383,636757197,2,2023-07-01 19:49:11,2023-07-15 19:49:11,2023-07-03 16:52:06
29,8277,73968,973761964,2,2023-07-03 05:06:10,2023-07-17 05:06:10,2023-07-12 01:55:56
26,3362,77547,858784251,1,2023-07-03 06:16:34,2023-07-17 06:16:34,2023-07-10 09:46:55
49,4885,71131,551628112,1,2023-07-04 00:20:52,2023-07-18 00:20:52,2023-07-17 12:53:58
52,7246,72513,516867531,1,2023-07-04 11:59:07,2023-07-18 11:59:07,2023-07-17 15:35:17


## August

In [77]:
aug_loan = pd.DataFrame()
aug_loaners = random.randint(50,100)

def generate_fake_timestamp():
    start_date = datetime(2023, 8, 1)
    end_date = datetime(2023, 8, 31)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp
        
loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

aug_loan['user_id'] = loan_user_id
aug_loan['book_id'] = loan_book_id

aug_loan = pd.merge(aug_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
aug_loan = aug_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(aug_loan))]

aug_loan['loan_date'] = loan_date
aug_loan['loan_date'] = pd.to_datetime(aug_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

aug_loan = aug_loan.sort_values(by=['loan_date'], ascending=True)
aug_loan = aug_loan.drop_duplicates(subset=['book_id','book_availability_id'], keep='first')
aug_loan['due_date'] = aug_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

aug_loan['return_date'] = aug_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
aug_loan['return_date'] = aug_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

aug_loan = aug_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
aug_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, 27 to 80
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               89 non-null     int32         
 1   book_id               89 non-null     int32         
 2   book_availability_id  89 non-null     int32         
 3   total_copies          89 non-null     int32         
 4   loan_date             89 non-null     datetime64[ns]
 5   due_date              89 non-null     datetime64[ns]
 6   return_date           89 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.2+ KB


In [78]:
aug_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
27,6851,44568,136231175,1,2023-08-01 01:00:29,2023-08-15 01:00:29,2023-08-08 16:10:18
50,4143,97134,781456463,1,2023-08-01 14:54:20,2023-08-15 14:54:20,2023-08-05 03:45:15
21,8581,51179,872675938,1,2023-08-02 10:22:31,2023-08-16 10:22:31,2023-08-07 05:36:07
64,5625,57234,394744514,2,2023-08-02 16:20:15,2023-08-16 16:20:15,2023-08-14 11:54:14
0,3197,81177,813595951,2,2023-08-03 06:18:42,2023-08-17 06:18:42,2023-08-10 01:10:36


## September

In [79]:
sep_loan = pd.DataFrame()
sep_loaners = random.randint(50,100)
        
def generate_fake_timestamp():
    start_date = datetime(2023, 9, 1)
    end_date = datetime(2023, 9, 30)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp

loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

sep_loan['user_id'] = loan_user_id
sep_loan['book_id'] = loan_book_id

sep_loan = pd.merge(sep_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
sep_loan = sep_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(sep_loan))]

sep_loan['loan_date'] = loan_date
sep_loan['loan_date'] = pd.to_datetime(sep_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

sep_loan = sep_loan.sort_values(by=['loan_date'], ascending=True)
sep_loan = sep_loan.drop_duplicates(subset=['book_id','book_availability_id'], keep='first')
sep_loan['due_date'] = sep_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

sep_loan['return_date'] = sep_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
sep_loan['return_date'] = sep_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

sep_loan = sep_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
sep_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 101 entries, 30 to 61
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               101 non-null    int32         
 1   book_id               101 non-null    int32         
 2   book_availability_id  101 non-null    int32         
 3   total_copies          101 non-null    int32         
 4   loan_date             101 non-null    datetime64[ns]
 5   due_date              101 non-null    datetime64[ns]
 6   return_date           101 non-null    object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.7+ KB


In [80]:
sep_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
30,3826,35773,491676974,2,2023-09-01 17:54:58,2023-09-15 17:54:58,2023-09-14 20:56:55
62,4298,77378,689786581,2,2023-09-01 20:06:19,2023-09-15 20:06:19,2023-09-06 09:54:22
26,7212,93144,497119379,2,2023-09-01 22:24:31,2023-09-15 22:24:31,2023-09-08 05:31:57
69,2451,85685,232619829,1,2023-09-02 02:45:57,2023-09-16 02:45:57,2023-09-04 23:38:05
46,3122,55139,757245761,1,2023-09-02 16:39:38,2023-09-16 16:39:38,2023-09-15 16:14:37


## October

In [81]:
oct_loan = pd.DataFrame()
oct_loaners = random.randint(50,100)
        
def generate_fake_timestamp():
    start_date = datetime(2023, 10, 1)
    end_date = datetime(2023, 10, 31)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp
        

loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

oct_loan['user_id'] = loan_user_id
oct_loan['book_id'] = loan_book_id

oct_loan = pd.merge(oct_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
oct_loan = oct_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(oct_loan))]

oct_loan['loan_date'] = loan_date
oct_loan['loan_date'] = pd.to_datetime(oct_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

oct_loan = oct_loan.sort_values(by=['loan_date'], ascending=True)
oct_loan = oct_loan.drop_duplicates(subset=['book_id','book_availability_id'], keep='first')
oct_loan['due_date'] = oct_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

oct_loan['return_date'] = oct_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
oct_loan['return_date'] = oct_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

oct_loan = oct_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
oct_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 90 entries, 63 to 89
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               90 non-null     int32         
 1   book_id               90 non-null     int32         
 2   book_availability_id  90 non-null     int32         
 3   total_copies          90 non-null     int32         
 4   loan_date             90 non-null     datetime64[ns]
 5   due_date              90 non-null     datetime64[ns]
 6   return_date           90 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.2+ KB


In [82]:
oct_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
63,5176,94534,937474972,2,2023-10-01 00:46:26,2023-10-15 00:46:26,2023-10-07 11:25:22
69,4399,11295,655888817,1,2023-10-01 07:06:44,2023-10-15 07:06:44,2023-10-11 12:35:23
0,5466,22221,761141255,1,2023-10-01 15:37:02,2023-10-15 15:37:02,2023-10-15 06:09:24
66,5991,14764,877744862,1,2023-10-01 19:59:02,2023-10-15 19:59:02,2023-10-03 14:28:14
1,8989,18875,716255249,2,2023-10-01 20:24:38,2023-10-15 20:24:38,2023-10-05 00:02:59


## November

In [83]:
nov_loan = pd.DataFrame()
nov_loaners = random.randint(50,100)
        
def generate_fake_timestamp():
    start_date = datetime(2023, 11, 1)
    end_date = datetime(2023, 11, 30)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp
        

loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

nov_loan['user_id'] = loan_user_id
nov_loan['book_id'] = loan_book_id

nov_loan = pd.merge(nov_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
nov_loan = nov_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(nov_loan))]

nov_loan['loan_date'] = loan_date
nov_loan['loan_date'] = pd.to_datetime(nov_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

nov_loan = nov_loan.sort_values(by=['loan_date'], ascending=True)
nov_loan = nov_loan.drop_duplicates(subset=['book_id','book_availability_id'], keep='first')
nov_loan['due_date'] = nov_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

nov_loan['return_date'] = nov_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
nov_loan['return_date'] = nov_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

nov_loan = nov_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
nov_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 26 to 19
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               91 non-null     int32         
 1   book_id               91 non-null     int32         
 2   book_availability_id  91 non-null     int32         
 3   total_copies          91 non-null     int32         
 4   loan_date             91 non-null     datetime64[ns]
 5   due_date              91 non-null     datetime64[ns]
 6   return_date           91 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.3+ KB


In [84]:
nov_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
26,8257,63755,397153242,2,2023-11-01 00:58:18,2023-11-15 00:58:18,2023-11-14 07:30:57
34,8333,88617,593794718,1,2023-11-02 04:11:38,2023-11-16 04:11:38,2023-11-08 14:37:11
72,3588,13687,945428136,1,2023-11-02 10:59:27,2023-11-16 10:59:27,2023-11-11 22:57:07
55,7731,11847,582794552,2,2023-11-04 02:09:07,2023-11-18 02:09:07,2023-11-10 16:16:18
0,3654,67115,631281638,2,2023-11-04 06:04:50,2023-11-18 06:04:50,2023-11-07 03:46:27


## December

In [85]:
dec_loan = pd.DataFrame()
dec_loaners = random.randint(50,100)
        
def generate_fake_timestamp():
    start_date = datetime(2023, 12, 1)
    end_date = datetime(2023, 12, 31)
    random_days = random.randint(0, (end_date - start_date).days)
    random_time = timedelta(days=random_days)
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    fake_timestamp = start_date + random_time + timedelta(hours=random_hour, minutes=random_minute, seconds=random_second)
    return fake_timestamp
        

loan_user_id = users['user_id'].sample(jan_loaners).reset_index()['user_id']
loan_book_id = books['book_id'].sample(jan_loaners).reset_index()['book_id']

dec_loan['user_id'] = loan_user_id
dec_loan['book_id'] = loan_book_id

dec_loan = pd.merge(dec_loan, books, on='book_id', how='left')[['user_id','book_id','book_availability_id','copies']]
dec_loan = dec_loan.rename(columns={"copies":"total_copies"})

loan_date = [generate_fake_timestamp() for _ in range(len(dec_loan))]

dec_loan['loan_date'] = loan_date
dec_loan['loan_date'] = pd.to_datetime(dec_loan['loan_date'], format="%Y-%m-%d %H:%M:%S")

dec_loan = dec_loan.sort_values(by=['loan_date'], ascending=True)
dec_loan = dec_loan.drop_duplicates(subset=['book_id','book_availability_id'], keep='first')
dec_loan['due_date'] = dec_loan['loan_date'] + pd.DateOffset(weeks=2)

def random_datetime(start, end):
    delta = end - start
    fraction = random.random()
    return start + fraction * delta

dec_loan['return_date'] = dec_loan['loan_date'].apply(lambda x: random_datetime(x + timedelta(days=1), x + timedelta(weeks=2)))
dec_loan['return_date'] = dec_loan['return_date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

dec_loan = dec_loan.astype({"user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
dec_loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 90 entries, 43 to 41
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               90 non-null     int32         
 1   book_id               90 non-null     int32         
 2   book_availability_id  90 non-null     int32         
 3   total_copies          90 non-null     int32         
 4   loan_date             90 non-null     datetime64[ns]
 5   due_date              90 non-null     datetime64[ns]
 6   return_date           90 non-null     object        
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 4.2+ KB


In [86]:
dec_loan.head()

Unnamed: 0,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
43,2122,78948,556229357,1,2023-12-03 01:00:14,2023-12-17 01:00:14,2023-12-09 06:02:35
83,2922,91836,545177473,2,2023-12-03 06:29:35,2023-12-17 06:29:35,2023-12-12 11:49:02
71,2978,42933,448927445,2,2023-12-03 06:43:49,2023-12-17 06:43:49,2023-12-06 16:31:00
89,3454,87831,615584543,1,2023-12-03 18:06:08,2023-12-17 18:06:08,2023-12-10 17:40:08
75,8639,15543,628356723,2,2023-12-03 20:32:56,2023-12-17 20:32:56,2023-12-10 05:38:31


In [87]:
loans = pd.concat([jan_loan, feb_loan, mar_loan, apr_loan, may_loan, jun_loan,
                   jul_loan, aug_loan, sep_loan, oct_loan, nov_loan, dec_loan], axis=0)

loans = loans.reset_index().drop(columns=['index'])

def get_ids():
    return ''.join(random.choice(string.digits[1:]) for n in range(7))

loan_id = []
while len(loan_id) != len(loans):
    id = get_ids()
    if id not in loan_id:
        loan_id.append(id)
        
loans['loan_id'] = loan_id
loans = pd.concat([loans['loan_id'], loans.drop('loan_id', axis=1)], axis=1)

print(loans.shape)
loans = loans.astype({"loan_id": int, "user_id": int, "book_id": int, "book_availability_id": int, "total_copies": int})
loans.to_csv('data/loans.csv', index=False)
loans.info()

(1104, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104 entries, 0 to 1103
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   loan_id               1104 non-null   int32         
 1   user_id               1104 non-null   int32         
 2   book_id               1104 non-null   int32         
 3   book_availability_id  1104 non-null   int32         
 4   total_copies          1104 non-null   int32         
 5   loan_date             1104 non-null   datetime64[ns]
 6   due_date              1104 non-null   datetime64[ns]
 7   return_date           1104 non-null   object        
dtypes: datetime64[ns](2), int32(5), object(1)
memory usage: 47.6+ KB


In [88]:
loans.head()

Unnamed: 0,loan_id,user_id,book_id,book_availability_id,total_copies,loan_date,due_date,return_date
0,3691978,6999,16232,284787677,1,2023-01-01 09:14:07,2023-01-15 09:14:07,2023-01-13 18:10:08
1,2992892,2719,25268,277377247,1,2023-01-01 21:05:13,2023-01-15 21:05:13,2023-01-14 22:18:31
2,3223739,4748,98926,897722137,1,2023-01-02 16:46:50,2023-01-16 16:46:50,2023-01-11 19:12:38
3,7574296,3844,64575,134435244,2,2023-01-02 16:48:50,2023-01-16 16:48:50,2023-01-16 00:44:48
4,6724996,2499,66538,755748586,1,2023-01-02 17:42:29,2023-01-16 17:42:29,2023-01-05 04:09:58


# Dataset holds

| hold_id | PK | INT NOT NULL UNIQUE |
|---------|----|---------------------|
| user_id | FK | INT NOT NULL |
| book_id | FK | INT NOT NULL |
| book_availability_id | FK | INT NOT NULL |
| total_copies | | INT NOT NULL CHECK(>=1 AND <=2) |
| hold_date | | TIMESTAMP NOT NULL |
| exp_date | | TIMESTAMP NOT NULL |

In [89]:
loans = pd.read_csv('data/loans.csv')
users = pd.read_csv('data/users.csv')

date_columns = ['loan_date', 'due_date', 'return_date']

loans[date_columns] = loans[date_columns].apply(pd.to_datetime)

loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104 entries, 0 to 1103
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   loan_id               1104 non-null   int64         
 1   user_id               1104 non-null   int64         
 2   book_id               1104 non-null   int64         
 3   book_availability_id  1104 non-null   int64         
 4   total_copies          1104 non-null   int64         
 5   loan_date             1104 non-null   datetime64[ns]
 6   due_date              1104 non-null   datetime64[ns]
 7   return_date           1104 non-null   datetime64[ns]
dtypes: datetime64[ns](3), int64(5)
memory usage: 69.1 KB


## January

In [90]:
holds_jan = loans[loans['loan_date'].dt.month == 1].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_jan) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_jan) < n_holders + 1:
    holds_jan = pd.concat([holds_jan, holds_jan.sample(n=1)], axis=0)

holds_jan = holds_jan.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_jan['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_jan['hold_date'] = holds_jan.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_jan['exp_date'] = holds_jan['hold_date'] + pd.DateOffset(weeks=2)
holds_jan = holds_jan.drop(columns=['user_id','loan_date','return_date'])
holds_jan = holds_jan.reset_index().drop(columns=['index'])
holds_jan.insert(0, 'user_id', user_id['user_id'])
holds_jan = holds_jan.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_jan.info()
display(holds_jan.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               172 non-null    int32         
 1   book_id               172 non-null    int32         
 2   book_availability_id  172 non-null    int32         
 3   total_copies          172 non-null    int32         
 4   hold_date             172 non-null    datetime64[ns]
 5   exp_date              172 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 5.5 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,7786,16232,284787677,1,2023-01-06 12:09:01,2023-01-20 12:09:01
1,8984,25268,277377247,1,2023-01-07 11:22:29,2023-01-21 11:22:29
2,5375,25268,277377247,1,2023-01-03 08:47:16,2023-01-17 08:47:16
3,4293,25268,277377247,1,2023-01-03 09:42:38,2023-01-17 09:42:38
4,1552,98926,897722137,1,2023-01-03 05:41:21,2023-01-17 05:41:21


## February

In [91]:
holds_feb = loans[loans['loan_date'].dt.month == 2].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_feb) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_feb) < n_holders + 1:
    holds_feb = pd.concat([holds_feb, holds_feb.sample(n=1)], axis=0)

holds_feb = holds_feb.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_feb['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_feb['hold_date'] = holds_feb.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_feb['exp_date'] = holds_feb['hold_date'] + pd.DateOffset(weeks=2)
holds_feb = holds_feb.drop(columns=['user_id','loan_date','return_date'])
holds_feb = holds_feb.reset_index().drop(columns=['index'])
holds_feb.insert(0, 'user_id', user_id['user_id'])
holds_feb = holds_feb.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_feb.info()
display(holds_feb.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               93 non-null     int32         
 1   book_id               93 non-null     int32         
 2   book_availability_id  93 non-null     int32         
 3   total_copies          93 non-null     int32         
 4   hold_date             93 non-null     datetime64[ns]
 5   exp_date              93 non-null     datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 3.0 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,7896,63364,519614751,1,2023-02-12 16:14:20,2023-02-26 16:14:20
1,9163,52627,755634141,2,2023-02-04 11:48:33,2023-02-18 11:48:33
2,5871,17972,732781188,1,2023-02-08 03:01:57,2023-02-22 03:01:57
3,9975,97896,972312153,2,2023-02-05 15:31:11,2023-02-19 15:31:11
4,2196,85543,497428773,2,2023-02-02 21:23:08,2023-02-16 21:23:08


## March

In [92]:
holds_mar = loans[loans['loan_date'].dt.month == 3].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_mar) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_mar) < n_holders + 1:
    holds_mar = pd.concat([holds_mar, holds_mar.sample(n=1)], axis=0)

holds_mar = holds_mar.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_mar['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_mar['hold_date'] = holds_mar.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_mar['exp_date'] = holds_mar['hold_date'] + pd.DateOffset(weeks=2)
holds_mar = holds_mar.drop(columns=['user_id','loan_date','return_date'])
holds_mar = holds_mar.reset_index().drop(columns=['index'])
holds_mar.insert(0, 'user_id', user_id['user_id'])
holds_mar = holds_mar.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_mar.info()
display(holds_mar.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               142 non-null    int32         
 1   book_id               142 non-null    int32         
 2   book_availability_id  142 non-null    int32         
 3   total_copies          142 non-null    int32         
 4   hold_date             142 non-null    datetime64[ns]
 5   exp_date              142 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 4.6 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,5462,77281,637357317,2,2023-03-09 06:25:34,2023-03-23 06:25:34
1,3837,98946,175847879,1,2023-03-05 23:00:58,2023-03-19 23:00:58
2,5257,27315,565924248,1,2023-03-01 12:12:21,2023-03-15 12:12:21
3,4747,13862,534793462,2,2023-03-05 08:28:14,2023-03-19 08:28:14
4,2845,13862,534793462,2,2023-03-11 13:16:24,2023-03-25 13:16:24


## April

In [93]:
holds_apr = loans[loans['loan_date'].dt.month == 4].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_apr) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_apr) < n_holders + 1:
    holds_apr = pd.concat([holds_apr, holds_apr.sample(n=1)], axis=0)

holds_apr = holds_apr.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_apr['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_apr['hold_date'] = holds_apr.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_apr['exp_date'] = holds_apr['hold_date'] + pd.DateOffset(weeks=2)
holds_apr = holds_apr.drop(columns=['user_id','loan_date','return_date'])
holds_apr = holds_apr.reset_index().drop(columns=['index'])
holds_apr.insert(0, 'user_id', user_id['user_id'])
holds_apr = holds_apr.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_apr.info()
display(holds_apr.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               180 non-null    int32         
 1   book_id               180 non-null    int32         
 2   book_availability_id  180 non-null    int32         
 3   total_copies          180 non-null    int32         
 4   hold_date             180 non-null    datetime64[ns]
 5   exp_date              180 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 5.8 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,5842,86448,584488222,2,2023-04-08 22:21:30,2023-04-22 22:21:30
1,5268,65541,195578735,1,2023-04-05 14:10:17,2023-04-19 14:10:17
2,9243,65541,195578735,1,2023-04-03 15:32:47,2023-04-17 15:32:47
3,3828,29929,726957125,1,2023-04-03 01:21:01,2023-04-17 01:21:01
4,4514,25995,667746764,2,2023-04-09 16:44:42,2023-04-23 16:44:42


## May

In [94]:
holds_may = loans[loans['loan_date'].dt.month == 5].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_may) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_may) < n_holders + 1:
    holds_may = pd.concat([holds_may, holds_may.sample(n=1)], axis=0)

holds_may = holds_may.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_may['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_may['hold_date'] = holds_may.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_may['exp_date'] = holds_may['hold_date'] + pd.DateOffset(weeks=2)
holds_may = holds_may.drop(columns=['user_id','loan_date','return_date'])
holds_may = holds_may.reset_index().drop(columns=['index'])
holds_may.insert(0, 'user_id', user_id['user_id'])
holds_may = holds_may.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_may.info()
display(holds_may.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               152 non-null    int32         
 1   book_id               152 non-null    int32         
 2   book_availability_id  152 non-null    int32         
 3   total_copies          152 non-null    int32         
 4   hold_date             152 non-null    datetime64[ns]
 5   exp_date              152 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 4.9 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,1544,87279,754371786,2,2023-05-03 01:26:07,2023-05-17 01:26:07
1,8578,87279,754371786,2,2023-05-01 19:57:38,2023-05-15 19:57:38
2,4874,14582,176933422,2,2023-05-06 10:04:58,2023-05-20 10:04:58
3,8893,23978,468732731,1,2023-05-11 04:32:33,2023-05-25 04:32:33
4,3678,94593,238665433,2,2023-05-11 21:41:08,2023-05-25 21:41:08


## June

In [95]:
holds_jun = loans[loans['loan_date'].dt.month == 6].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_jun) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_jun) < n_holders + 1:
    holds_jun = pd.concat([holds_jun, holds_jun.sample(n=1)], axis=0)

holds_jun = holds_jun.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_jun['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_jun['hold_date'] = holds_jun.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_jun['exp_date'] = holds_jun['hold_date'] + pd.DateOffset(weeks=2)
holds_jun = holds_jun.drop(columns=['user_id','loan_date','return_date'])
holds_jun = holds_jun.reset_index().drop(columns=['index'])
holds_jun.insert(0, 'user_id', user_id['user_id'])
holds_jun = holds_jun.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_jun.info()
display(holds_jun.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               94 non-null     int32         
 1   book_id               94 non-null     int32         
 2   book_availability_id  94 non-null     int32         
 3   total_copies          94 non-null     int32         
 4   hold_date             94 non-null     datetime64[ns]
 5   exp_date              94 non-null     datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 3.1 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,9641,77281,459594237,1,2023-06-01 06:55:08,2023-06-15 06:55:08
1,7899,18744,236373627,1,2023-06-11 18:05:35,2023-06-25 18:05:35
2,5142,71548,993682987,1,2023-06-02 09:33:13,2023-06-16 09:33:13
3,8455,24774,791967452,2,2023-06-03 23:38:33,2023-06-17 23:38:33
4,4485,42995,966859147,1,2023-06-04 16:02:08,2023-06-18 16:02:08


## July

In [96]:
holds_jul = loans[loans['loan_date'].dt.month == 7].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_jul) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_jul) < n_holders + 1:
    holds_jul = pd.concat([holds_jul, holds_jul.sample(n=1)], axis=0)

holds_jul = holds_jul.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_jul['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_jul['hold_date'] = holds_jul.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_jul['exp_date'] = holds_jul['hold_date'] + pd.DateOffset(weeks=2)
holds_jul = holds_jul.drop(columns=['user_id','loan_date','return_date'])
holds_jul = holds_jul.reset_index().drop(columns=['index'])
holds_jul.insert(0, 'user_id', user_id['user_id'])
holds_jul = holds_jul.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_jul.info()
display(holds_jul.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               92 non-null     int32         
 1   book_id               92 non-null     int32         
 2   book_availability_id  92 non-null     int32         
 3   total_copies          92 non-null     int32         
 4   hold_date             92 non-null     datetime64[ns]
 5   exp_date              92 non-null     datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 3.0 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,7681,65383,636757197,2,2023-07-02 04:04:37,2023-07-16 04:04:37
1,4998,73968,973761964,2,2023-07-09 18:42:07,2023-07-23 18:42:07
2,7376,77547,858784251,1,2023-07-06 23:45:02,2023-07-20 23:45:02
3,2649,71131,551628112,1,2023-07-15 00:40:01,2023-07-29 00:40:01
4,2482,72513,516867531,1,2023-07-10 22:45:55,2023-07-24 22:45:55


## August

In [97]:
holds_aug = loans[loans['loan_date'].dt.month == 8].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_aug) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_aug) < n_holders + 1:
    holds_aug = pd.concat([holds_aug, holds_aug.sample(n=1)], axis=0)

holds_aug = holds_aug.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_aug['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_aug['hold_date'] = holds_aug.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_aug['exp_date'] = holds_aug['hold_date'] + pd.DateOffset(weeks=2)
holds_aug = holds_aug.drop(columns=['user_id','loan_date','return_date'])
holds_aug = holds_aug.reset_index().drop(columns=['index'])
holds_aug.insert(0, 'user_id', user_id['user_id'])
holds_aug = holds_aug.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_aug.info()
display(holds_aug.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127 entries, 0 to 126
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               127 non-null    int32         
 1   book_id               127 non-null    int32         
 2   book_availability_id  127 non-null    int32         
 3   total_copies          127 non-null    int32         
 4   hold_date             127 non-null    datetime64[ns]
 5   exp_date              127 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 4.1 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,1158,44568,136231175,1,2023-08-06 11:19:52,2023-08-20 11:19:52
1,9977,97134,781456463,1,2023-08-01 23:34:48,2023-08-15 23:34:48
2,8491,97134,781456463,1,2023-08-04 18:20:30,2023-08-18 18:20:30
3,6173,97134,781456463,1,2023-08-04 15:24:43,2023-08-18 15:24:43
4,8855,51179,872675938,1,2023-08-03 14:02:51,2023-08-17 14:02:51


## September

In [98]:
holds_sep = loans[loans['loan_date'].dt.month == 9].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_sep) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_sep) < n_holders + 1:
    holds_sep = pd.concat([holds_sep, holds_sep.sample(n=1)], axis=0)

holds_sep = holds_sep.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_sep['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_sep['hold_date'] = holds_sep.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_sep['exp_date'] = holds_sep['hold_date'] + pd.DateOffset(weeks=2)
holds_sep = holds_sep.drop(columns=['user_id','loan_date','return_date'])
holds_sep = holds_sep.reset_index().drop(columns=['index'])
holds_sep.insert(0, 'user_id', user_id['user_id'])
holds_sep = holds_sep.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_sep.info()
display(holds_sep.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               246 non-null    int32         
 1   book_id               246 non-null    int32         
 2   book_availability_id  246 non-null    int32         
 3   total_copies          246 non-null    int32         
 4   hold_date             246 non-null    datetime64[ns]
 5   exp_date              246 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 7.8 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,9441,35773,491676974,2,2023-09-09 17:36:48,2023-09-23 17:36:48
1,6839,77378,689786581,2,2023-09-05 18:03:46,2023-09-19 18:03:46
2,2145,93144,497119379,2,2023-09-06 22:29:40,2023-09-20 22:29:40
3,8288,93144,497119379,2,2023-09-07 08:32:30,2023-09-21 08:32:30
4,7753,93144,497119379,2,2023-09-05 19:38:48,2023-09-19 19:38:48


## October

In [99]:
holds_oct = loans[loans['loan_date'].dt.month == 10].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_oct) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_oct) < n_holders + 1:
    holds_oct = pd.concat([holds_oct, holds_oct.sample(n=1)], axis=0)

holds_oct = holds_oct.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_oct['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_oct['hold_date'] = holds_oct.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_oct['exp_date'] = holds_oct['hold_date'] + pd.DateOffset(weeks=2)
holds_oct = holds_oct.drop(columns=['user_id','loan_date','return_date'])
holds_oct = holds_oct.reset_index().drop(columns=['index'])
holds_oct.insert(0, 'user_id', user_id['user_id'])
holds_oct = holds_oct.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_oct.info()
display(holds_oct.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               134 non-null    int32         
 1   book_id               134 non-null    int32         
 2   book_availability_id  134 non-null    int32         
 3   total_copies          134 non-null    int32         
 4   hold_date             134 non-null    datetime64[ns]
 5   exp_date              134 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 4.3 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,2144,94534,937474972,2,2023-10-04 09:17:49,2023-10-18 09:17:49
1,3498,11295,655888817,1,2023-10-12 06:32:12,2023-10-26 06:32:12
2,3655,11295,655888817,1,2023-10-04 14:31:57,2023-10-18 14:31:57
3,1928,11295,655888817,1,2023-10-03 03:11:15,2023-10-17 03:11:15
4,6911,11295,655888817,1,2023-10-07 09:06:54,2023-10-21 09:06:54


## November

In [100]:
holds_nov = loans[loans['loan_date'].dt.month == 11].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_nov) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_nov) < n_holders + 1:
    holds_nov = pd.concat([holds_nov, holds_nov.sample(n=1)], axis=0)

holds_nov = holds_nov.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_nov['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_nov['hold_date'] = holds_nov.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_nov['exp_date'] = holds_nov['hold_date'] + pd.DateOffset(weeks=2)
holds_nov = holds_nov.drop(columns=['user_id','loan_date','return_date'])
holds_nov = holds_nov.reset_index().drop(columns=['index'])
holds_nov.insert(0, 'user_id', user_id['user_id'])
holds_nov = holds_nov.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_nov.info()
display(holds_nov.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               91 non-null     int32         
 1   book_id               91 non-null     int32         
 2   book_availability_id  91 non-null     int32         
 3   total_copies          91 non-null     int32         
 4   hold_date             91 non-null     datetime64[ns]
 5   exp_date              91 non-null     datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 3.0 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,5655,63755,397153242,2,2023-11-09 14:39:35,2023-11-23 14:39:35
1,7341,88617,593794718,1,2023-11-02 14:34:37,2023-11-16 14:34:37
2,8717,13687,945428136,1,2023-11-11 03:39:57,2023-11-25 03:39:57
3,3353,11847,582794552,2,2023-11-10 07:11:28,2023-11-24 07:11:28
4,4133,67115,631281638,2,2023-11-05 22:23:18,2023-11-19 22:23:18


## December

In [101]:
holds_dec = loans[loans['loan_date'].dt.month == 12].drop(columns=['due_date', 'loan_id'])

numbers = int(len(holds_dec) * 2.5)
n_holders = random.randint(1, numbers)

while len(holds_dec) < n_holders + 1:
    holds_dec = pd.concat([holds_dec, holds_dec.sample(n=1)], axis=0)

holds_dec = holds_dec.sort_values(by=['loan_date'], ascending=True)

user_id = users[~users['user_id'].isin(holds_dec['user_id'])]['user_id'].reset_index()
user_id = user_id['user_id'].sample(numbers * 2).reset_index().drop(columns='index')

holds_dec['hold_date'] = holds_dec.apply(
    lambda x: x['loan_date'] + timedelta(
        days=random.randint(0, (x['return_date'] - x['loan_date']).days), 
        hours=random.randint(0, 24),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
        ), axis=1
    )

holds_dec['exp_date'] = holds_dec['hold_date'] + pd.DateOffset(weeks=2)
holds_dec = holds_dec.drop(columns=['user_id','loan_date','return_date'])
holds_dec = holds_dec.reset_index().drop(columns=['index'])
holds_dec.insert(0, 'user_id', user_id['user_id'])
holds_dec = holds_dec.astype({'user_id': int, 'book_id': int, 'book_availability_id': int, 'total_copies': int})
holds_dec.info()
display(holds_dec.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               145 non-null    int32         
 1   book_id               145 non-null    int32         
 2   book_availability_id  145 non-null    int32         
 3   total_copies          145 non-null    int32         
 4   hold_date             145 non-null    datetime64[ns]
 5   exp_date              145 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int32(4)
memory usage: 4.7 KB


Unnamed: 0,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,4729,78948,556229357,1,2023-12-08 20:46:14,2023-12-22 20:46:14
1,8496,78948,556229357,1,2023-12-05 17:01:47,2023-12-19 17:01:47
2,3639,91836,545177473,2,2023-12-11 22:15:15,2023-12-25 22:15:15
3,6415,42933,448927445,2,2023-12-04 17:27:01,2023-12-18 17:27:01
4,7994,42933,448927445,2,2023-12-03 17:27:06,2023-12-17 17:27:06


In [102]:
holds = pd.concat([holds_jan, holds_feb, holds_mar,
                   holds_apr, holds_may, holds_jun,
                   holds_jul, holds_aug, holds_sep,
                   holds_oct, holds_nov, holds_dec], axis=0)

holds = holds.reset_index().drop(columns=['index'])

def get_ids():
    return ''.join(random.choice(string.digits[1:]) for n in range(6))

hold_id = []
while len(hold_id) != len(holds):
    id = get_ids()
    if id not in hold_id:
        hold_id.append(id)

holds.insert(0, 'hold_id', hold_id)
holds.to_csv('data/holds.csv', index=False)
holds.info()
display(holds.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   hold_id               1668 non-null   object        
 1   user_id               1668 non-null   int32         
 2   book_id               1668 non-null   int32         
 3   book_availability_id  1668 non-null   int32         
 4   total_copies          1668 non-null   int32         
 5   hold_date             1668 non-null   datetime64[ns]
 6   exp_date              1668 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int32(4), object(1)
memory usage: 65.3+ KB


Unnamed: 0,hold_id,user_id,book_id,book_availability_id,total_copies,hold_date,exp_date
0,263443,7786,16232,284787677,1,2023-01-06 12:09:01,2023-01-20 12:09:01
1,744493,8984,25268,277377247,1,2023-01-07 11:22:29,2023-01-21 11:22:29
2,345547,5375,25268,277377247,1,2023-01-03 08:47:16,2023-01-17 08:47:16
3,762732,4293,25268,277377247,1,2023-01-03 09:42:38,2023-01-17 09:42:38
4,718465,1552,98926,897722137,1,2023-01-03 05:41:21,2023-01-17 05:41:21


# Dataset rating

| rating_id | PK | INT NOT NULL UNIQUE |
|---------|----|---------------------|
| user_id | FK | INT NOT NULL |
| book_id | FK | INT NOT NULL |
| book_availability_id | FK | INT NOT NULL |
| rating_date | | TIMESTAMP NOT NULL |
| rating | | FLOAT |

In [103]:
loans = pd.read_csv('data/loans.csv', parse_dates=['loan_date']).drop(columns=['loan_id','due_date','return_date'])

rating = loans

rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104 entries, 0 to 1103
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               1104 non-null   int64         
 1   book_id               1104 non-null   int64         
 2   book_availability_id  1104 non-null   int64         
 3   total_copies          1104 non-null   int64         
 4   loan_date             1104 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(4)
memory usage: 43.2 KB


In [104]:
def get_ids():
    return ''.join(random.choice(string.digits[1:]) for n in range(8))

rating_id = []
while len(rating_id) != len(rating):
    id = get_ids()
    if id not in rating_id:
        rating_id.append(id)
        
rating.insert(0, 'rating_id', rating_id)

rating['rating_date'] = rating['loan_date'] + pd.DateOffset(hour=random.randint(0, 23), minute=random.randint(0, 59))

rating_n = []
while len(rating_n) != len(rating):
    value = random.randint(6, 10)
    rating_n.append(value)
    
rating['rating'] = rating_n
rating = rating.drop(columns='loan_date')

rating.to_csv('data/rating.csv', index=False)
rating.head()

  rating['rating_date'] = rating['loan_date'] + pd.DateOffset(hour=random.randint(0, 23), minute=random.randint(0, 59))


Unnamed: 0,rating_id,user_id,book_id,book_availability_id,total_copies,rating_date,rating
0,18999276,6999,16232,284787677,1,2023-01-01 20:00:07,7
1,84195914,2719,25268,277377247,1,2023-01-01 20:00:13,6
2,18936724,4748,98926,897722137,1,2023-01-02 20:00:50,8
3,77122994,3844,64575,134435244,2,2023-01-02 20:00:50,6
4,47933521,2499,66538,755748586,1,2023-01-02 20:00:29,6


In [105]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104 entries, 0 to 1103
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   rating_id             1104 non-null   object        
 1   user_id               1104 non-null   int64         
 2   book_id               1104 non-null   int64         
 3   book_availability_id  1104 non-null   int64         
 4   total_copies          1104 non-null   int64         
 5   rating_date           1104 non-null   datetime64[ns]
 6   rating                1104 non-null   int64         
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 60.5+ KB


# Export

In [114]:
conn = psycopg2.connect(
    host='localhost',
    port='5432',
    user='postgres',
    password='ditepisungai',
    database='jaya-libraries-management-database'
)

In [107]:
# curr.close()
# conn.close()

## libraries

In [112]:
conn = psycopg2.connect(
    host='localhost',
    port='5432',
    user='postgres',
    password='ditepisungai',
    database='jaya-libraries-management-database'
)

csv_file_path = 'data/libraries.csv'

table_name = 'libraries'

curr = conn.cursor()

create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
    library_id INT NOT NULL UNIQUE PRIMARY KEY,
    library_name VARCHAR(255) NOT NULL UNIQUE,
    library_location VARCHAR(255) NOT NULL UNIQUE
    );
"""

curr.execute(create_table_query)

with open(csv_file_path, 'r') as f:
    next(f)
    curr.copy_from(f, table_name, sep=',')
    
conn.commit()

curr.close()
conn.close()

## books

In [113]:
import csv

conn = psycopg2.connect(
    host='localhost',
    port='5432',
    user='postgres',
    password='ditepisungai',
    database='jaya-libraries-management-database'
)

csv_file_path = 'data/books.csv'

table_name = 'books'

curr = conn.cursor()

create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
    book_id INT NOT NULL UNIQUE PRIMARY KEY,
    title VARCHAR(500) NOT NULL,
    author VARCHAR(500) NOT NULL,
    publisher VARCHAR(500) NOT NULL,
    publish_date DATE,
    ISBN VARCHAR(500)
);
"""

curr.execute(create_table_query)

with open(csv_file_path, 'r', encoding='utf-8') as f:
    csv_reader = csv.reader(f)
    next(csv_reader)
    for row in csv_reader:
        insert_query = f"""
        INSERT INTO {table_name} (book_id, title, author, publisher, publish_date, ISBN)
        VALUES (%s, %s, %s, %s, %s, %s)
        """
        curr.execute(insert_query, row)
        
conn.commit()

curr.close()
conn.close()

## users

In [115]:
conn = psycopg2.connect(
    host='localhost',
    port='5432',
    user='postgres',
    password='ditepisungai',
    database='jaya-libraries-management-database'
)

csv_file_path = 'data/users.csv'

table_name = 'users'

curr = conn.cursor()

create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
    user_id INT NOT NULL UNIQUE PRIMARY KEY,
    username VARCHAR(255) NOT NULL UNIQUE,
    password VARCHAR(255) NOT NULL,
    email VARCHAR(255) NOT NULL UNIQUE,
    name VARCHAR(255) NOT NULL
    );
"""

curr.execute(create_table_query)

with open(csv_file_path, 'r') as f:
    csv_reader = csv.reader(f)
    next(csv_reader)
    for row in csv_reader:
        insert_query = f"""
        INSERT INTO {table_name} (user_id, username, password, email, name)
        VALUES (%s, %s, %s, %s, %s)
        """
        curr.execute(insert_query, row)
        
conn.commit()

curr.close()
conn.close()

## book_categories

In [116]:
conn = psycopg2.connect(
    host='localhost',
    port='5432',
    user='postgres',
    password='ditepisungai',
    database='jaya-libraries-management-database'
)

import ast 
import csv

csv_file_path = 'data/book_categories.csv'

table_name = 'book_categories'

curr = conn.cursor()

create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
    book_id INT NOT NULL UNIQUE PRIMARY KEY,
    categories TEXT[],
    CONSTRAINT fk_book_id FOREIGN KEY (book_id) REFERENCES books(book_id)
    );
"""

curr.execute(create_table_query)

with open(csv_file_path, 'r') as f:
    csv_reader = csv.reader(f)
    next(csv_reader)
    for row in csv_reader:
        categories_list = ast.literal_eval(row[1])
        insert_query = f"""
        INSERT INTO {table_name} (book_id, categories)
        VALUES (%s, %s)
        """
        curr.execute(insert_query, (int(row[0]), categories_list))
        
conn.commit()

curr.close()
conn.close()

## book_availability

In [118]:
conn = psycopg2.connect(
    host='localhost',
    port='5432',
    user='postgres',
    password='ditepisungai',
    database='jaya-libraries-management-database'
)

csv_file_path = 'data/book_availability.csv'

table_name = 'book_availability'

curr = conn.cursor()

create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
    book_availability_id INT NOT NULL UNIQUE PRIMARY KEY,
    library_id INT NOT NULL,
    book_id INT NOT NULL,
    copies INT DEFAULT 0,
    CONSTRAINT fk_library_id FOREIGN KEY (library_id) REFERENCES libraries(library_id),
    CONSTRAINT fk_book_id FOREIGN KEY (book_id) REFERENCES books(book_id)
    );
"""

curr.execute(create_table_query)

with open(csv_file_path, 'r') as f:
    csv_reader = csv.reader(f)
    next(csv_reader)
    for row in csv_reader:
        insert_query = f"""
        INSERT INTO {table_name} (book_availability_id, library_id, book_id, copies)
        VALUES (%s, %s, %s, %s)
        """
        curr.execute(insert_query, row)
        
conn.commit()

curr.close()
conn.close()

## loans

In [119]:
conn = psycopg2.connect(
    host='localhost',
    port='5432',
    user='postgres',
    password='ditepisungai',
    database='jaya-libraries-management-database'
)

csv_file_path = 'data/loans.csv'

table_name = 'loans'

curr = conn.cursor()

create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
    loan_id INT NOT NULL UNIQUE PRIMARY KEY,
    user_id INT NOT NULL,
    book_id INT NOT NULL,
    book_availability_id INT NOT NULL,
    total_copies INT NOT NULL CHECK(total_copies >= 1 AND total_copies <= 2),
    loan_date TIMESTAMP NOT NULL,
    due_date TIMESTAMP NOT NULL,
    return_date TIMESTAMP NOT NULL,
    CONSTRAINT fk_user_id FOREIGN KEY (user_id) REFERENCES users(user_id),
    CONSTRAINT fk_book_id FOREIGN KEY (book_id) REFERENCES books(book_id),
    CONSTRAINT fk_book_availability_id FOREIGN KEY (book_availability_id) REFERENCES book_availability(book_availability_id)
    );
"""

curr.execute(create_table_query)

with open(csv_file_path, 'r') as f:
    csv_reader = csv.reader(f)
    next(csv_reader)
    for row in csv_reader:
        insert_query = f"""
        INSERT INTO {table_name} (loan_id, user_id, book_id, book_availability_id, total_copies, loan_date, due_date, return_date)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """
        curr.execute(insert_query, row)
        
conn.commit()

curr.close()
conn.close()

## holds

In [120]:
conn = psycopg2.connect(
    host='localhost',
    port='5432',
    user='postgres',
    password='ditepisungai',
    database='jaya-libraries-management-database'
)

csv_file_path = 'data/holds.csv'

table_name = 'holds'

curr = conn.cursor()

create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
    hold_id INT NOT NULL UNIQUE PRIMARY KEY,
    user_id INT NOT NULL,
    book_id INT NOT NULL,
    book_availability_id INT NOT NULL,
    total_copies INT NOT NULL CHECK(total_copies >= 1 AND total_copies <= 2),
    hold_date TIMESTAMP NOT NULL,
    exp_date TIMESTAMP NOT NULL,
    CONSTRAINT fk_user_id FOREIGN KEY (user_id) REFERENCES users(user_id),
    CONSTRAINT fk_book_id FOREIGN KEY (book_id) REFERENCES books(book_id),
    CONSTRAINT fk_book_availability_id FOREIGN KEY (book_availability_id) REFERENCES book_availability(book_availability_id)
    );
"""

curr.execute(create_table_query)

with open(csv_file_path, 'r') as f:
    csv_reader = csv.reader(f)
    next(csv_reader)
    for row in csv_reader:
        insert_query = f"""
        INSERT INTO {table_name} (hold_id, user_id, book_id, book_availability_id, total_copies, hold_date, exp_date)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        """
        curr.execute(insert_query, row)
        
conn.commit()

curr.close()
conn.close()

## rating

In [135]:
conn = psycopg2.connect(
    host='localhost',
    port='5432',
    user='postgres',
    password='ditepisungai',
    database='jaya-libraries-management-database'
)

csv_file_path = 'data/rating.csv'

table_name = 'rating'

rating = pd.read_csv('data/rating.csv', parse_dates=['rating_date'])

curr = conn.cursor()

create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
    rating_id INT NOT NULL UNIQUE PRIMARY KEY,
    user_id INT NOT NULL,
    book_id INT NOT NULL,
    book_availability_id INT NOT NULL,
    rating_date TIMESTAMP NOT NULL,
    rating INT DEFAULT 0,
    CONSTRAINT fk_user_id FOREIGN KEY (user_id) REFERENCES users(user_id),
    CONSTRAINT fk_book_id FOREIGN KEY (book_id) REFERENCES books(book_id),
    CONSTRAINT fk_book_availability_id FOREIGN KEY (book_availability_id) REFERENCES book_availability(book_availability_id)
    );
"""

curr.execute(create_table_query)


for index, row in rating.iterrows():
    insert_query = f"""
    INSERT INTO {table_name} (rating_id, user_id, book_id, book_availability_id, rating_date, rating)
    VALUES (%s, %s, %s, %s, %s, %s)
    """
    curr.execute(insert_query, (
        int(row['rating_id']),
        int(row['user_id']),
        int(row['book_id']),
        int(row['book_availability_id']),
        row['rating_date'],
        int(row['rating']),
    ))
        
conn.commit()

curr.close()
conn.close()