__Add column with genre information to original exported CSV.__

@Andreas Lüschow

12.11.2020

### Imports

In [None]:
from IPython.display import display

import json
import math
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd

pd.options.display.max_columns = None

### Constants

In [None]:
# Alternative input path kept for reference (not active):
# INPUT_CSV = "./data/genre_concat_final/genres_concat.csv"
INPUT_CSV = "./data/input/all_genre_data.csv"  # source table, read below as tab-separated text
OUTPUT_CSV = "./data/all_data.csv"  # destination of the table with the added genre columns
GENRE_FILE = "./data/input/genres.txt"  # known genre terms, read one per line
FIELD_FILE = "./data/input/fields.txt"  # names of the columns (Pica+ fields) to scan, one per line
GENRE_MAPPING_FILE = "./data/input/genre_mapping.txt"  # JSON object: genre -> main genre

### Load data

In [None]:
# Read the tab-separated export; the column named 'Unnamed: 0' becomes the row index.
# low_memory=False lets pandas infer column types from the whole file at once
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): a later cell addresses a row by the string label '1014723302' (a PPN),
# so the index is presumably expected to hold PPNs as strings — confirm.
src_df = pd.read_csv(INPUT_CSV, sep="\t", index_col='Unnamed: 0', low_memory=False)
src_df.shape  # shown as cell output: (number of rows, number of columns)

In [None]:
# Work on a copy so that the freshly loaded src_df stays untouched by the steps below.
df = src_df.copy()

In [None]:
# Load the known genre terms: one term per line, surrounding whitespace removed.
# The file is opened in a context manager so the handle is closed right after
# reading, and it is decoded explicitly as UTF-8 (the same encoding used for the
# genre mapping file) instead of relying on the platform default encoding.
with open(GENRE_FILE, "r", encoding="utf-8") as genre_file:
    genres = [line.strip() for line in genre_file]
assert len(genres) == 1319  # sanity check: expected number of lines in the genre file
print(genres[:20])  # preview of the first 20 terms

In [None]:
# Load the names of the columns (Pica+ fields) to scan for genre terms, one per line.
# A context manager closes the handle after reading; UTF-8 is requested explicitly
# (the same encoding used for the genre mapping file) rather than the platform default.
with open(FIELD_FILE, "r", encoding="utf-8") as field_file:
    fields = [line.strip() for line in field_file]
print(fields)

In [None]:
# Parse the JSON genre mapping (genre -> main genre) straight from the file handle.
with open(GENRE_MAPPING_FILE, mode="r", encoding="utf-8") as mapping_file:
    genre_mapping_dict = json.load(mapping_file)

### Get genre for each data row

In [None]:
# Accumulators filled by the extraction loop in the next cell:
#   all_keywords        - per Pica+ field, the list of genre keywords (one entry per row)
#   nr_genres_in_fields - per Pica+ field, the number of genres found in each row
#                         (one row can have multiple genres assigned)
all_keywords, nr_genres_in_fields = [], []

In [None]:
def _first_genre_and_count(phrase, known_genres):
    """Return ``(first_genre, number_of_genres)`` for one split cell.

    ``phrase`` is one element of ``Series.str.split(";")``: a list of strings
    for a text cell, or a non-list value (NaN) for an empty cell.
    ``known_genres`` is the collection of valid genre terms.

    Exactly one result is returned for every input, so the per-field keyword
    list always stays aligned with the rows of the DataFrame.
    """
    if isinstance(phrase, list):
        # A cell may hold several terms; keep the first match, count all matches.
        matches = [
            word
            for word in (str(raw_word).strip() for raw_word in phrase)
            if word in known_genres
        ]
        return (matches[0] if matches else None), len(matches)

    clean_phrase = str(phrase).strip()
    if clean_phrase != "nan" and clean_phrase in known_genres:
        return clean_phrase, 1  # seems to be a single word
    return None, 0  # empty cell or no genre in this field


genre_lookup = set(genres)  # O(1) membership tests inside the row loop

for f in fields:
    # Only the column access is guarded: KeyError means the field is missing,
    # AttributeError means the column holds no strings (e.g. it is entirely
    # empty), so `.str` is unavailable. Such fields are skipped.
    try:
        split_cells = df[f].str.split(";")  # cell may contain multiple phrases separated by ";"
    except (KeyError, AttributeError):
        print(f"Feld {f} nicht vorhanden")  # useful for debugging
        continue

    keywords = []  # collects keywords for single Pica+ field
    nr_genres_in_rows = []  # nr of genres found in each row of this field
    for phrase in split_cells:
        keyword, nr_genres_in_this_row = _first_genre_and_count(phrase, genre_lookup)
        keywords.append(keyword)
        nr_genres_in_rows.append(nr_genres_in_this_row)

    nr_genres_in_fields.append(nr_genres_in_rows)  # nr of genres in this field
    all_keywords.append(keywords)  # append keywords for single fields to collection list

In [None]:
# For every field, tally the rows in which more than one genre was found.
all_multiple_count = [
    sum(1 for genre_count in field_counts if genre_count > 1)
    for field_counts in nr_genres_in_fields
]
all_multiple_count

In [None]:
# Add up the per-field genre counts row by row, then count the rows whose total
# exceeds one, i.e. rows that have more than one genre assigned.
row_totals = [sum(counts) for counts in zip(*nr_genres_in_fields)]
row_count = len(row_totals)
not_unique_count = sum(1 for total in row_totals if total > 1)
unique_share = round((1 - (not_unique_count / row_count)) * 100, 2)
print(f"Not unique: {not_unique_count} out of {row_count} rows total.\n" +
      f"Thus, {unique_share} % are unique.")

In [None]:
# Create a single genre for each row, based on the genres found in the single fields.
# zip(*all_keywords) regroups the per-field keyword lists into one tuple per data
# row; the first string in that tuple becomes the row's genre (only one genre per
# data row is considered). None marks a row without any genre (must be an error!).
keyw_list = [
    next((elem for elem in group if isinstance(elem, str)), None)
    for group in zip(*all_keywords)
]

# Look at single entries (used for debugging if no genre was found for a data row):
# print up to three neighbours before the row, the row's position, and up to three
# neighbours after it. Neighbour positions are bounds-checked so that a hit near
# the start does not wrap around to the end of the list and a hit near the end
# does not raise IndexError.
for index, k in enumerate(keyw_list):
    if not k:
        for position in range(index - 3, index + 4):
            if position == index:
                print(index)
            elif 0 <= position < len(keyw_list):
                print(keyw_list[position])

In [None]:
# create 'genre' column in DataFrame (keyw_list holds one entry per data row)
df["genre"] = keyw_list
df["genre"].isna().sum()  # number of rows without a genre, shown as cell output (check if every data row has genre)

In [None]:
# There is one single row (!) in the source data where the genre is in the data
# but is not recognized by the matching step, for an unknown reason ... add this
# manually (PPN: 1014723302).
# Setting through `.at` appends a brand-new row when the label is not part of the
# index (e.g. if the index was parsed as integers), so verify the label first
# instead of silently corrupting the table.
if '1014723302' not in df.index:
    raise KeyError("PPN '1014723302' not found in the DataFrame index")
df.at['1014723302', 'genre'] = "Brief"

In [None]:
# Inspect the genre mapping for duplicate or wrong values: gather every non-empty
# main genre and count how often each one is used as a mapping target.
main_genres_list = [v for v in genre_mapping_dict.values() if len(v) > 0]
main_genres_dict = {}
for main_genre in main_genres_list:
    main_genres_dict[main_genre] = main_genres_dict.get(main_genre, 0) + 1

# To list the main genres by frequency:
# sorted(main_genres_dict.items(), key=lambda x: x[1], reverse=True)
print(set(main_genres_list), len(main_genres_dict))

In [None]:
# Create additional column with main genres (i.e., "Historischer Roman" --> "Roman").
# A genre keeps its own value when the mapping has no entry for it or maps it to
# an empty string; rows without a genre (None) stay None.
mapped_keyw_list = []
for genre in keyw_list:
    main_genre = genre_mapping_dict.get(genre, "")
    mapped_keyw_list.append(main_genre if len(main_genre) > 0 else genre)
df["genre_main"] = mapped_keyw_list

# Manual value for the one row whose genre is not recognized automatically
# (PPN: 1014723302). Setting through `.at` appends a brand-new row when the label
# is not part of the index, so verify the label first.
if '1014723302' not in df.index:
    raise KeyError("PPN '1014723302' not found in the DataFrame index")
df.at['1014723302', 'genre_main'] = "Brief"
df["genre_main"].isna().sum()  # check if every data row has a main genre

In [None]:
# show the number of empty (NaN) cells per column, sorted ascending (just for information)
df.isna().sum().sort_values()

In [None]:
# Check that the number of non-empty values in each genre column equals the
# DataFrame's row count, i.e. every row carries a genre and a main genre.
assert sum(1 for value in df["genre"] if value) == len(df)
assert sum(1 for value in df["genre_main"] if value) == len(df)

### Save DataFrame that now has a genre column

In [None]:
# Write the DataFrame, now including the 'genre' and 'genre_main' columns, as tab-separated text.
df.to_csv(OUTPUT_CSV, sep="\t")