In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Raw inputs: the book metadata table and the per-review ratings table.
BOOKS_CSV = '/kaggle/input/amazon-books-reviews/books_data.csv'
RATINGS_CSV = '/kaggle/input/amazon-books-reviews/Books_rating.csv'

book_data = pd.read_csv(BOOKS_CSV)
book_rating = pd.read_csv(RATINGS_CSV)

### Keep only the columns we need

In [None]:
# Restrict both tables to the listed columns.
book_columns = ['Title', 'authors', 'publishedDate']
rating_columns = [
    'Id', 'Title', 'User_id',
    'review/helpfulness', 'review/score', 'review/time',
    'review/summary', 'review/text',
]

book_data = book_data[book_columns]
book_rating = book_rating[rating_columns]

### Get the percentage of null values in each column

In [None]:
# Share of missing values per column of book_data, expressed as a percentage.
book_data.isna().mean().mul(100)

# We drop the rows with a missing Title since we can't use books without titles. We drop the rows with missing authors because later on we need to detect authors
# who might be using fake reviews. We drop the rows with a missing publishedDate since the publication date of each book is needed in the study.

In [None]:
# Remove every row of book_data that has a missing value in any column.
book_data = book_data.dropna(axis=0, how='any')

In [None]:
# Share of missing values per column of book_rating, expressed as a percentage.
# No rows are dropped in this cell: the missing values (including null User_id
# entries, which we cannot use) are removed later with dropna() on the merged table.
book_rating.isna().mean().mul(100)

### Get the shape of the resulting datasets

In [None]:
def get_shape(data):
    """
    Describe the dimensions of a DataFrame in a human-readable sentence.

    Parameters:
    data (pandas.DataFrame): The table whose dimensions are reported.

    Returns:
    str: A sentence stating the number of rows and columns of `data`.
    """
    dimensions = data.shape
    n_rows = dimensions[0]
    n_cols = dimensions[1]
    return f'We have {n_rows} rows and {n_cols} columns in the given dataset.'

In [None]:
# Report the dimensions of the ratings table, then of the book metadata table.
for frame in (book_rating, book_data):
    print(get_shape(frame))

### Join the 2 datasets to get only the books that have been rated and their published data. Thus, we inner join them.

In [None]:
# Inner join on Title: keeps only the rows whose Title appears in both tables.
final_data = book_rating.merge(book_data, how='inner', on='Title')

### Get the shape of the final data

In [None]:
# Report the dimensions of the merged dataset.
merged_shape = get_shape(final_data)
print(merged_shape)

### Get the number of books inside the final dataset

In [None]:
# Number of distinct values in the Title column of the merged dataset.
final_data['Title'].nunique()

### Preview the first rows to check the columns and their values

In [None]:
# Display the first rows of the merged dataset.
final_data.head()

### Check the review/time and the published dates columns in order to put them in the same format

In [None]:
# Character length of every publishedDate entry, then a tally of how many
# rows have each length.
length = final_data['publishedDate'].map(len)
value_counts = Counter(length)

In [None]:
# Display how many rows have each publishedDate string length.
value_counts

In [None]:
# Display the first values of the review/time column.
final_data['review/time'].head()

In [None]:
### The review/time column holds Unix epoch timestamps, so we need to convert them to 'YYYY-MM-DD' dates

def epoch_to_date(epoch_time):
    """
    Converts a Unix epoch timestamp to a date in the format 'YYYY-MM-DD'.

    The timestamp is interpreted in UTC, so the resulting date does not
    depend on the time zone of the machine running the notebook.

    Parameters:
    epoch_time (int or float): Seconds since 1970-01-01 00:00:00 UTC.

    Returns:
    str: A string representing the UTC date in the format 'YYYY-MM-DD'.
    """
    # Passing tz explicitly avoids the local-time conversion that
    # fromtimestamp() performs by default, which can shift the date by a day.
    utc_moment = datetime.datetime.fromtimestamp(epoch_time, tz=datetime.timezone.utc)
    return utc_moment.strftime('%Y-%m-%d')


# Turn the epoch values in 'review/time' into 'YYYY-MM-DD' strings with
# epoch_to_date, then parse those strings as datetimes (unparseable -> NaT).
review_dates = final_data['review/time'].map(epoch_to_date)
final_data['review/time'] = pd.to_datetime(review_dates, errors='coerce')

In [None]:
### Tally the string lengths found in the publishedDate column
length = final_data['publishedDate'].map(len)
value_counts = Counter(length)

#### Many rows have a publishedDate of length 4 (year only). Those rows will be deleted: we compare the review time with the publication date in months, so we need to know the month of publication.

In [None]:
# Display how many rows have each publishedDate string length.
value_counts

#### Checking the rows that have a length of 7, we see that they contain the year and the month. To put them in the same format as review/time, we append '-01' (the first day of the month) to the date. The rows with any other length (except 10) will be dropped, as they represent a negligible share of the data.

In [None]:
# Filter the DataFrame to rows whose publishedDate has exactly 7 characters
test_data = final_data[final_data['publishedDate'].apply(len) == 7]

# Show up to 3 of those rows. The fixed seed keeps the sample identical across
# runs, and min() avoids a ValueError when fewer than 3 rows match.
sample_data = test_data.sample(n=min(3, len(test_data)), random_state=42)
sample_data

In [None]:
# Keep only the rows whose publishedDate has 7 ('YYYY-MM') or 10 ('YYYY-MM-DD')
# characters. The explicit .copy() makes updated_data an independent frame, so
# the in-place edit below never targets a slice of final_data.
date_lengths = final_data['publishedDate'].str.len()
updated_data = final_data[date_lengths.isin([7, 10])].copy()

# Add '-01' to the dates with length of 7 so every date has the same 10-character layout
year_month_only = updated_data['publishedDate'].str.len() == 7
updated_data.loc[year_month_only, 'publishedDate'] += '-01'

In [None]:
# Tally the publishedDate lengths again, this time on the filtered data.
length = updated_data['publishedDate'].map(len)
value_counts = Counter(length)
value_counts

#### Checking the date range of the publishedDate column

In [None]:
# Print the smallest and the largest publishedDate value.
published = updated_data['publishedDate']
print(published.min())
print(published.max())

In [None]:
# Display the dtype of every column.
updated_data.dtypes

In [None]:
# Convert publishedDate to datetime; invalid dates become NaT. assign() builds
# a new frame instead of writing into a filtered one. No separate NaT -> None
# step is needed: NaT already counts as missing for dropna().
updated_data = updated_data.assign(
    publishedDate=pd.to_datetime(updated_data['publishedDate'], errors='coerce')
)

# Drop every row that has a missing value in any column (including the
# publishedDate values that could not be parsed).
updated_data = updated_data.dropna(how='any')

#### Getting the range of the publishedDate

In [None]:
# Print the earliest and the latest publishedDate after the conversion.
published = updated_data['publishedDate']
print(published.min())
print(published.max())

In [None]:
# Display the (rows, columns) tuple of the cleaned data.
updated_data.shape

#### Selecting the books that have been published between 2005 and 2013

In [None]:
# Boolean mask of the rows published from 2005-01-01 through 2013-12-31
# (between() includes both bounds by default), then keep only those rows.
filtered = updated_data['publishedDate'].between('2005-01-01', '2013-12-31')
updated_data = updated_data.loc[filtered]

In [None]:
# Print the earliest and the latest publishedDate left after the filter.
published = updated_data['publishedDate']
print(published.min())
print(published.max())

In [None]:
# The Id and User_id columns are dropped here.
updated_data = updated_data.drop(columns=['Id', 'User_id'])

# Cast the text-like columns to str, one column at a time.
text_columns = ['Title', 'review/helpfulness', 'review/summary', 'review/text']
for column in text_columns:
    updated_data[column] = updated_data[column].astype(str)

# Clean up authors: strip leading/trailing square brackets, then remove every
# single-quote character (this also removes apostrophes inside names), and
# cast the result to str.
authors_without_brackets = updated_data['authors'].str.strip("[]")
authors_without_quotes = authors_without_brackets.str.replace("'", "")
updated_data['authors'] = authors_without_quotes.astype(str)


In [None]:
# Display the first rows of the cleaned data.
updated_data.head()

#### We take a subset of the data containing the score and the text in order to label each review as either 1 (fake) or 0 (real)

In [None]:
# Subset used for labelling: the review score and the review text only.
label_columns = ['review/score', 'review/text']
data_for_label = updated_data.loc[:, label_columns]