In [1]:
import pandas as pd

In [2]:
gr_data = pd.read_csv('gr_data_modified.csv')
gr_data.reset_index(drop=False)
gr_data=gr_data.drop(['Unnamed: 0'],axis=1)
display(gr_data.head(3),gr_data.shape)

Unnamed: 0,title,series,author,rating,description,language,isbn,genres,characters,book_format,...,first_publish_date,awards,num_ratings,rating_by_stars,liked_perc,setting,bbe_score,bbe_votes,price,other_collabs
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780440000000.0,"['Adventure', 'Dystopia', 'Romance', 'Young Ad...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",Hardcover,...,,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",2993816,30516,5.09,
1,Harry Potter and the Order of the Phoenix,Harry Potter #5,J.K. Rowling,4.5,There is a door at the end of a silent corrido...,English,9780440000000.0,"['Adventure', 'Classics', 'Audiobook', 'Childr...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",Paperback,...,06/21/03,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,['Hogwarts School of Witchcraft and Wizardry (...,2632233,26923,7.38,"Mary GrandPré , Illustrator)"
2,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,10000000000000.0,"['Classics', 'Novels', 'Historical', 'School',...","['Scout Finch', 'Atticus Finch', 'Jem Finch', ...",Paperback,...,07-11-1960,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,"['2363896', '1333153', '573280', '149952', '80...",95.0,"['Maycomb, Alabama (United States)']",2269402,23328,,


(52478, 24)

#### List of unique genres

In [5]:
# Define an empty list to store the genres
genres_list = []

# Loop through all rows of genres
for i in range(len(gr_data)):
    # Get the genres for the current row
    genres_row = gr_data['genres'][i]

    # Remove the square brackets and single quotes
    genres_row = genres_row.replace('[','').replace(']','').replace("'",'')

    # Split the row into individual genres
    genres = genres_row.split(',')

    # Loop through each genre in the row
    for genre in genres:
        # Clean up the genre by removing any extra whitespace and converting to title case
        genre = genre.strip().title()
        # Check if the genre is not empty and not already in the genres_list
        if genre and genre not in genres_list:
            # If it contains non-ASCII characters, skip it
            if any(ord(c) > 127 for c in genre):
                continue
            # If it's not in the genres_list, add it
            else:
                genres_list.append(genre)

# Print the final genres list and its length
print("Length of uniques genres after cleaning: from 980 to",len(genres_list))
print("")
#print(genres_list)

Length of uniques genres after cleaning: from 980 to 639



### New genre dataframe

In [6]:
# New dataframe with columns from genres_list
genres_data = pd.DataFrame(columns=genres_list)
genres_data

Unnamed: 0,Adventure,Dystopia,Romance,Young Adult,Action,Fantasy,Fiction,Science Fiction,Classics,Audiobook,...,Battle Of Britain,Aircraft,Go,Goth,Civil War History,Racing,Led Zeppelin,Naturopathy,10Th Century,Spanish History


In [7]:
# Adding a column for books with no genre described = []
genres_data['Unknown'] = ''

In [8]:
genres_data.shape

(0, 640)

### Filling genre dataframe

ast is a built-in Python module that provides a way to work with abstract syntax trees (ASTs) in Python code. In this specific case, ast.literal_eval() is used to safely evaluate a string containing a Python literal structure, such as a list or dictionary, and return the corresponding Python object. This is used to convert a string representation of a list of genres into an actual list that can be iterated over in the subsequent code.

In [9]:
import ast

# Iterate over each value in gr_data['genres']
for i in range(len(gr_data['genres'])):
    
    # Retrieve the string value of the genres for the current row
    genre_list_str = gr_data['genres'][i]
    
    # Use ast.literal_eval() to convert the string value to a list of genres
    genre_list = ast.literal_eval(genre_list_str)

    # Check if the list of genres is empty
    if len(genre_list) == 0:
        
        # If the list is empty, set the value of 'Unknown' column to 1
        genres_data['Unknown'] = 1
        
    else:
        
        # If the list is not empty, iterate over each genre in the list
        for genre in genre_list:
            
            # Check if the genre is already a column in genres_data
            if genre in genres_data.columns:
                
                # If the genre is already a column, add 1 to the corresponding row for the current book
                genres_data.loc[i, genre] = 1
                
            else:
                
                # If the genre is not already a column, create a new column for the genre with all values set to 0
                genres_data[genre] = 0
                
                # Then, add 1 to the corresponding row for the current book
                genres_data.loc[i, genre] = 1
    
    # Print the current row number being processed
    print(f"Processed row {i+1}", end="\r")


Processed row 52478

In [10]:
genres_data.shape

(47855, 681)

In [20]:
genres_data.head()

Unnamed: 0,Adventure,Dystopia,Romance,Young Adult,Action,Fantasy,Fiction,Science Fiction,Classics,Audiobook,...,Sword and Planet,2nd Grade,40k,1st Grade,NSFW,Mills and Boon,Gender and Sexuality,11th Century,漫画,10th Century
0,1.0,1.0,1.0,1.0,1.0,1.0,1,1.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,,,1.0,,1.0,1,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,,,1.0,,,1,,1.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,,1.0,,,,1,,1.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,,1.0,1.0,,1.0,1,1.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Saving new dataframe of unique genres to csv
genres_data.to_csv('genres_df_empty.csv')

In [16]:
genres_df2 = pd.read_csv('genres_df_empty.csv')

In [18]:
display(genres_df2.head(), genres_df2.shape)

Unnamed: 0.1,Unnamed: 0,Adventure,Dystopia,Romance,Young Adult,Action,Fantasy,Fiction,Science Fiction,Classics,...,Sword and Planet,2nd Grade,40k,1st Grade,NSFW,Mills and Boon,Gender and Sexuality,11th Century,漫画,10th Century
0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,,,1.0,,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,,,,1.0,,,1.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,,,1.0,,,,1.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,,,1.0,1.0,,1.0,1.0,1.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(47855, 682)

In [13]:
gr_data['genres'][0]

"['Adventure', 'Dystopia', 'Romance', 'Young Adult', 'Action', 'Fantasy', 'Fiction', 'Science Fiction']"

In [14]:
gr_data['genres'][1]

"['Adventure', 'Classics', 'Audiobook', 'Children', 'School', 'Young Adult', 'Fantasy', 'Magic', 'Fiction', 'Science Fiction']"

In [23]:
genres_df2

AttributeError: 'DataFrame' object has no attribute 'value'

In [19]:
genres_df2['Unknown']

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
47850    NaN
47851    NaN
47852    NaN
47853    NaN
47854    NaN
Name: Unknown, Length: 47855, dtype: float64

In [None]:
print("There's a difference of", len(gr_data)-len(genres_df), "rows between dataframes.")

In [None]:
import ast

# Iterate over each value in gr_data['genres']
for i in range(len(gr_data['genres'])):
    genre_list_str = gr_data['genres'][i]
    genre_list = ast.literal_eval(genre_list_str)

    # Add 1 to corresponding columns in new_columns dataframe
    for genre in genre_list:
        if genre in genres_data.columns:
            genres_data.loc[i, genre] = 1
        else:
            genres_data[genre] = 0
            genres_data.loc[i, genre] = 1
    
    # Print the current row number and replace the previous line
    print(f"Processed row {i+1}", end="\r")


In [None]:
genres_data.shape

In [None]:
# Saving new dataframe of unique genres to csv
genres_data.to_csv('genres_df.csv')

#### Importing new dataframe

In [None]:
genres_df = pd.read_csv('genres_df.csv')

In [None]:
display(genres_df.tail(), genres_df.shape)

In [None]:
display(gr_data.tail(), gr_data.shape)

In [None]:
print("There's a difference of", len(gr_data)-len(genres_df), "rows between dataframes.")

In [None]:
# Checking rows not included: description is [] empty

# Get a list of index values from gr_data that are not present in genres_df
diff_rows_index = gr_data[~gr_data.index.isin(genres_df['index_gr_data'])]

# Rows from gr_data that are not present in genres_df
diff_rows_index


In [None]:
genres_df = genres_df.rename(columns={'Unnamed: 0': 'index_gr_data'})

In [None]:
genres_df.head()

In [None]:
# Dealing with the presence of NaN values
genres_df = genres_df.fillna(0)

In [None]:
genres_df.dtypes

In [None]:
# Check if there is any column with just '0' values in it.
if not (genres_df != 0).all().all():
    zero_columns = genres_df.columns[genres_df.eq(0).all()]
    print("The following columns only hold 0:", zero_columns)
else:
    print("All columns in genres_df have at least one non-zero value.")

In [None]:
# Dropping these columns
genres_df = genres_df.drop(zero_columns, axis=1)

In [None]:
genres_df.shape

In [None]:
genres_df.dtypes

In [None]:
genres_df = genres_df.astype(int)
genres_df.dtypes

In [None]:
genres_df.head()

In [None]:
gr_data['genres'][0]