In [1]:
import pandas as pd

In [2]:
gr_data = pd.read_csv('gr_data_genre_modified.csv')
gr_data = gr_data.drop(['Unnamed: 0'],axis=1)
display(gr_data.head(2),gr_data.shape)

Unnamed: 0,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,...,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,bbeScore,bbeVotes,price
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780440000000.0,"['Fantasy', 'Science Fiction', 'Dystopia', 'Yo...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",Hardcover,...,09/14/08,,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",2993816,30516,5.09
1,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780440000000.0,"['Fantasy', 'Science Fiction', 'Magic', 'Young...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",Paperback,...,09/28/04,06/21/03,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,['Hogwarts School of Witchcraft and Wizardry (...,2632233,26923,7.38


(52478, 23)

#### List of unique genres

In [3]:
# Define an empty list to store the genres
genres_list = []

# Loop through all rows of genres
for i in range(len(gr_data)):
    # Get the genres for the current row
    genres_row = gr_data['genres'][i]

    # Remove the square brackets and single quotes
    genres_row = genres_row.replace('[','').replace(']','').replace("'",'')

    # Split the row into individual genres
    genres = genres_row.split(',')

    # Loop through each genre in the row
    for genre in genres:
        # Clean up the genre by removing any extra whitespace and converting to title case
        genre = genre.strip().title()
        # Check if the genre is not empty and not already in the genres_list
        if genre and genre not in genres_list:
            # If it contains non-ASCII characters, skip it
            if any(ord(c) > 127 for c in genre):
                continue
            # If it's not in the genres_list, add it
            else:
                genres_list.append(genre)

# Print the final genres list and its length
print("Length of uniques genres after cleaning: from 980 to",len(genres_list))
print("")
#print(genres_list)

Length of uniques genres after cleaning: from 980 to 617



### New genre dataframe

In [4]:
# New dataframe with columns named with unique genres of genres_list
genres_data = pd.DataFrame(columns=genres_list)
genres_data

Unnamed: 0,Fantasy,Science Fiction,Dystopia,Young Adult,Fiction,Action,Adventure,Romance,Magic,Children,...,Low Fantasy,Christian Contemporary Fiction,Battle Of Britain,Aircraft,Go,Civil War History,Racing,Led Zeppelin,10Th Century,Spanish History


From the 'goodreads_genre_REDUCTION' dataset, there were some genres with 0 counts that I couldn't remove in the original dataset. Now, I will proceed to drop those columns.

In [5]:
# Set of genres to delete from the lists
to_delete = ['World War Ii', '19Th Century', 'Lgbt', '20Th Century', 'Bdsm', '18Th Century',
             'Beauty And The Beast', '14Th Century', '15Th Century', '16Th Century', 
             'Sword And Sorcery', 'Dungeons And Dragons', '17Th Century', '21St Century',
             '13Th Century', '12Th Century', 'Sao Tome And Principe', 'Food And Drink',
             'History And Politics', 'Sword And Planet', 'Art And Photography', '2Nd Grade',
             '40K', '1St Grade', 'Nsfw', 'Mills And Boon', 'Gender And Sexuality', 
             '11Th Century', '10Th Century']

In [6]:
# Drop the columns from genres_data
genres_data = genres_data.drop(to_delete, axis=1)

In [None]:
# Looking for rows where genre value is an empty list
#empty_genre_rows = gr_data[gr_data['genres'] == '[]']
#display(empty_genre_rows.head(3), len(empty_genre_rows))

In [None]:
# Filling those empty lists with 'Unknown'
#gr_data['genres'].loc[empty_genre_rows.index] = '[Unknown]' * len(empty_genre_rows)

In [None]:
# Checking modification worked
#empty_genre_check = gr_data[gr_data['genres'] == '[]']
#display(empty_genre_check.head(3), len(empty_genre_check))

In [None]:
#gr_data['genres'][2040]

In [None]:
# Adding a column for books with 'Unknow' genre
#genres_data['Unknown'] = ''

In [7]:
genres_data.columns

Index(['Fantasy', 'Science Fiction', 'Dystopia', 'Young Adult', 'Fiction',
       'Action', 'Adventure', 'Romance', 'Magic', 'Children',
       ...
       'Herbs', 'Low Fantasy', 'Christian Contemporary Fiction',
       'Battle Of Britain', 'Aircraft', 'Go', 'Civil War History', 'Racing',
       'Led Zeppelin', 'Spanish History'],
      dtype='object', length=588)

In [8]:
genres_data.shape

(0, 588)

### Filling genre dataframe

ast is a built-in Python module that provides a way to work with abstract syntax trees (ASTs) in Python code. In this specific case, ast.literal_eval() is used to safely evaluate a string containing a Python literal structure, such as a list or dictionary, and return the corresponding Python object. This is used to convert a string representation of a list of genres into an actual list that can be iterated over in the subsequent code.

In [9]:
import ast

# Iterate over each value in gr_data['genres']
for i in range(len(gr_data['genres'])):
    
    # Retrieve the string value of the genres for the current row
    genre_list_str = gr_data['genres'][i]
    
    # Use ast.literal_eval() to convert the string value to a list of genres
    genre_list = ast.literal_eval(genre_list_str)

    # Check if the list of genres is empty
    if len(genre_list) == 0:
        
        # If the list is empty, set the value of 'Unknown' column to 1
        genres_data['Unknown'] = 1
        
    else:
        
        # If the list is not empty, iterate over each genre in the list
        for genre in genre_list:
            
            # Check if the genre is already a column in genres_data
            if genre in genres_data.columns:
                
                # If the genre is already a column, add 1 to the corresponding row for the current book
                genres_data.loc[i, genre] = 1
                
            else:
                
                # If the genre is not already a column, create a new column for the genre with all values set to 0
                genres_data[genre] = 0
                
                # Then, add 1 to the corresponding row for the current book
                genres_data.loc[i, genre] = 1
    
    # Print the current row number being processed
    print(f"Processed row {i+1}", end="\r")


Processed row 52478

In [10]:
genres_data.shape

(47855, 638)

In [11]:
genres_data.head()

Unnamed: 0,Fantasy,Science Fiction,Dystopia,Young Adult,Fiction,Action,Adventure,Romance,Magic,Children,...,1st Grade,TV and Series,NSFW,Mills and Boon,Maps and Cartography,Gender and Sexuality,11th Century,漫画,Veganism and Vegetarianism,10th Century
0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,,1.0,1,,1.0,,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,,,1.0,1,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,,,,1,,,1.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,,1.0,1,,,1.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Saving new dataframe of unique genres to csv
genres_data.to_csv('genres_df_transposed.csv')

### Loading new genre dataframe

In [None]:
genres_df2 = pd.read_csv('genres_df_transposed.csv')

In [None]:
display(genres_df2.head(), genres_df2.shape)

In [None]:
gr_data['genres'][0]

In [None]:
gr_data['genres'][1]

In [None]:
genres_df2

In [None]:
genres_df2['Unknown']

In [None]:
print("There's a difference of", len(gr_data)-len(genres_df), "rows between dataframes.")

In [None]:
genres_data.shape

In [None]:
# Saving new dataframe of unique genres to csv
genres_data.to_csv('genres_df.csv')

#### Importing new dataframe

In [None]:
genres_df = pd.read_csv('genres_df.csv')

In [None]:
display(genres_df.tail(), genres_df.shape)

In [None]:
display(gr_data.tail(), gr_data.shape)

In [None]:
print("There's a difference of", len(gr_data)-len(genres_df), "rows between dataframes.")

In [None]:
# Checking rows not included: description is [] empty

# Get a list of index values from gr_data that are not present in genres_df
diff_rows_index = gr_data[~gr_data.index.isin(genres_df['index_gr_data'])]

# Rows from gr_data that are not present in genres_df
diff_rows_index


In [None]:
genres_df = genres_df.rename(columns={'Unnamed: 0': 'index_gr_data'})

In [None]:
genres_df.head()

In [None]:
# Dealing with the presence of NaN values
genres_df = genres_df.fillna(0)

In [None]:
genres_df.dtypes

In [None]:
# Check if there is any column with just '0' values in it.
if not (genres_df != 0).all().all():
    zero_columns = genres_df.columns[genres_df.eq(0).all()]
    print("The following columns only hold 0:", zero_columns)
else:
    print("All columns in genres_df have at least one non-zero value.")

In [None]:
# Dropping these columns
genres_df = genres_df.drop(zero_columns, axis=1)

In [None]:
genres_df.shape

In [None]:
genres_df.dtypes

In [None]:
genres_df = genres_df.astype(int)
genres_df.dtypes

In [None]:
genres_df.head()

In [None]:
gr_data['genres'][0]

In [None]:
import ast

# Iterate over each value in gr_data['genres']
for i in range(len(gr_data['genres'])):
    genre_list_str = gr_data['genres'][i]
    genre_list = ast.literal_eval(genre_list_str)

    # Add 1 to corresponding columns in new_columns dataframe
    for genre in genre_list:
        if genre in genres_data.columns:
            genres_data.loc[i, genre] = 1
        else:
            genres_data[genre] = 0
            genres_data.loc[i, genre] = 1
    
    # Print the current row number and replace the previous line
    print(f"Processed row {i+1}", end="\r")
