In [1]:
from string import ascii_letters

import pandas as pd
import numpy as np

from utils.paths import BOOKS_PATH, AUTHORS_PATH

In [2]:
books_df = pd.read_csv(BOOKS_PATH)
authors_df = pd.read_csv(AUTHORS_PATH)

  books_df = pd.read_csv(BOOKS_PATH)


In [3]:
data_df = books_df.join(authors_df.set_index('author_url'), on='author_href', how='inner')

In [4]:
# check if cycles always end with <cycle name> (<string> <number>)
data_df['Cykl'].dropna().str.extract(r'(\d)+').fillna(0).astype(int)[0].unique()

array([1, 2, 3, 4, 5, 6, 7, 0, 8, 9])

In [5]:
data_df['part_of_cycle'] = data_df['Cykl'].str.extract(r'(\d)+').fillna(0).astype(int)[0]

In [6]:
# drop records without number of pages
data_df = data_df.dropna(subset=['Liczba stron'])

In [7]:
numeric_columns = data_df.select_dtypes(include='number').columns

In [8]:
float_columns = 'author_average_rating'
integer_columns = numeric_columns.drop(float_columns)

In [9]:
# integer cols where there is nans
data_df.loc[:, integer_columns] = data_df.loc[:, integer_columns].fillna(0).astype(int)

# imputacja tytulow

In [10]:
data_df.columns

Index(['author', 'author_href', 'description', 'number_of_user_opinions',
       'number_of_user_ratings', 'number_of_discussions', 'Kategoria',
       'Format', 'Cykl', 'Tytuł oryginału',
       ...
       'author_href27', 'author_name', 'author_average_rating',
       'author_number_of_people_read', 'author_number_of_people_wants_to_read',
       'author_date_of_birth', 'author_number_of_fans',
       'author_number_of_books_written', 'author_number_of_awards',
       'part_of_cycle'],
      dtype='object', length=103)

In [11]:
missing_title = data_df['Tytuł oryginału'].isna()

In [12]:
data_df.loc[missing_title, 'title'] = data_df.loc[missing_title, 'url'].str.split('/').str[-1].str.split('-').str.join(' ').str.title()

In [13]:
data_df['part_of_cycle']

0        1
1        2
2        3
3        4
4        0
        ..
15493    0
15494    0
15495    1
15496    0
15497    2
Name: part_of_cycle, Length: 15186, dtype: int64

# cleaning

## author quantitative features:
author_average_rating
author_number_of_people_read
author_number_of_people_wants_to_read
author_date_of_birth ?
author_number_of_fans
author_number_of_books_written
author_number_of_awards
## book quantitative features
description_length (engineered)
number_of_user_opinions
number_of_user_ratings
number_of_discussions
number_of_people_read
Data wydania
Liczba stron
description
## book qualitative features
Kategoria
part_of_cycle (engineered)
Język
number_of_people_has
number_of_people_favorite
number_of_people_wants_to_read
number_of_people_wants_as_gift
number_of_people_currently_read

In [14]:
author_quant_features = [
    "author_average_rating",
    "author_number_of_people_read",
    "author_number_of_people_wants_to_read",
    "author_date_of_birth",  # Możliwe, że wymaga dodatkowego przetworzenia
    "author_number_of_fans",
    "author_number_of_books_written",
    "author_number_of_awards"
]

# Cechy ilościowe związane z książką
book_quant_features = [
    "number_of_user_opinions",
    "number_of_user_ratings",
    "number_of_discussions",
    "number_of_people_read",
    "Data wydania",  # to be transformed
    "Liczba stron",
    "Cykl",
] + [f'rating_{i}' for i in range(1, 11)]

# Cechy jakościowe związane z książką
book_qual_features = [
    "Kategoria",
    "part_of_cycle",  # engineered
    "Język",
    "number_of_people_has",
    "number_of_people_favorite",
    "number_of_people_wants_to_read",
    "number_of_people_wants_as_gift",
    "number_of_people_currently_read",
    "description",
    "Format"
]

In [15]:
all_features = author_quant_features + book_quant_features + book_qual_features

In [16]:
data_df[all_features].columns

Index(['author_average_rating', 'author_number_of_people_read',
       'author_number_of_people_wants_to_read', 'author_date_of_birth',
       'author_number_of_fans', 'author_number_of_books_written',
       'author_number_of_awards', 'number_of_user_opinions',
       'number_of_user_ratings', 'number_of_discussions',
       'number_of_people_read', 'Data wydania', 'Liczba stron', 'Cykl',
       'rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5', 'rating_6',
       'rating_7', 'rating_8', 'rating_9', 'rating_10', 'Kategoria',
       'part_of_cycle', 'Język', 'number_of_people_has',
       'number_of_people_favorite', 'number_of_people_wants_to_read',
       'number_of_people_wants_as_gift', 'number_of_people_currently_read',
       'description', 'Format'],
      dtype='object')

In [17]:
data_df.loc[:, 'author_date_of_birth'] = pd.to_datetime(data_df['author_date_of_birth'], format='mixed', errors='coerce')

In [18]:
data_df.loc[:, 'author_date_of_birth']

0        1965-07-31 00:00:00
1        1965-07-31 00:00:00
2        1965-07-31 00:00:00
3        1965-07-31 00:00:00
4        1900-06-29 00:00:00
                ...         
15493                    NaT
15494    1905-06-21 00:00:00
15495    1952-10-24 00:00:00
15496    1963-01-01 00:00:00
15497                    NaT
Name: author_date_of_birth, Length: 15186, dtype: object

In [19]:
rename_dict = {
    "Data wydania": "publication_date",
    "Liczba stron": "number_of_pages",
    "Cykl": "series",
    "Kategoria": "category",
    "Język": "language",
    "Format": "format"
}

In [20]:
renamed_cols = [rename_dict.get(col, col) for col in data_df[all_features].columns]

In [21]:
renamed_cols

['author_average_rating',
 'author_number_of_people_read',
 'author_number_of_people_wants_to_read',
 'author_date_of_birth',
 'author_number_of_fans',
 'author_number_of_books_written',
 'author_number_of_awards',
 'number_of_user_opinions',
 'number_of_user_ratings',
 'number_of_discussions',
 'number_of_people_read',
 'publication_date',
 'number_of_pages',
 'series',
 'rating_1',
 'rating_2',
 'rating_3',
 'rating_4',
 'rating_5',
 'rating_6',
 'rating_7',
 'rating_8',
 'rating_9',
 'rating_10',
 'category',
 'part_of_cycle',
 'language',
 'number_of_people_has',
 'number_of_people_favorite',
 'number_of_people_wants_to_read',
 'number_of_people_wants_as_gift',
 'number_of_people_currently_read',
 'description',
 'format']

In [22]:
# data_df = data_df.rename(columns=rename_dict)
data_df = data_df.rename(columns=rename_dict)

In [23]:
data_df['title'] = data_df['url'].str.split('/').str[-1].str.split('-').str.join(' ').str.title()

In [24]:
data_df.columns

Index(['author', 'author_href', 'description', 'number_of_user_opinions',
       'number_of_user_ratings', 'number_of_discussions', 'category', 'format',
       'series', 'Tytuł oryginału',
       ...
       'author_name', 'author_average_rating', 'author_number_of_people_read',
       'author_number_of_people_wants_to_read', 'author_date_of_birth',
       'author_number_of_fans', 'author_number_of_books_written',
       'author_number_of_awards', 'part_of_cycle', 'title'],
      dtype='object', length=104)

In [25]:
data_df['description_length'] = data_df['description'].str.len()
data_df['number_of_pages'] = data_df['number_of_pages'].astype(int)


rating_features = [f'rating_{i}' for i in range(1, 11)]
scaled_features = [f'scaled_{element}' for element in rating_features]
normalized_features = [f'normalized_{element}' for element in rating_features]

# to be predicted
threshold = 2000
data_df['bestseller'] = (data_df['number_of_people_has'] >= threshold).astype(int)



# Assuming data and rating_features are already defined
total_ratings = data_df[rating_features].sum(axis=1)
rating_weights = np.arange(1, 11)

# mean rating
data_df['mean_rating'] = (data_df[rating_features] * rating_weights).sum(axis=1) / total_ratings

# standard deviation
mean_rating_array = data_df['mean_rating'].to_numpy()
data_df['std_rating'] = np.sqrt(((data_df[rating_features] * (rating_weights - mean_rating_array[:, None]) ** 2).sum(axis=1)) / total_ratings)

# skewness
data_df['skewness'] = ((data_df[rating_features] * (rating_weights - mean_rating_array[:, None]) ** 3).sum(axis=1)) / (total_ratings * data_df['std_rating'] ** 3)

# kurtosis
data_df['kurtosis'] = ((data_df[rating_features] * (rating_weights - mean_rating_array[:, None]) ** 4).sum(axis=1)) / (total_ratings * data_df['std_rating'] ** 4) - 3

In [42]:
data_df = data_df.loc[~data_df[['mean_rating', 'std_rating', 'skewness', 'kurtosis']].isna().any(axis=1)]

In [44]:
columns_to_save = renamed_cols + ['title', 'description_length', 'bestseller', 'mean_rating', 'std_rating', 'skewness', 'kurtosis']

In [45]:
data_df[columns_to_save].to_csv('data/cleaned_data.csv')