In [1]:
from google.colab import drive
import pandas as pd
import hashlib, os, binascii

In [68]:
# Mount Google Drive and read the raw book catalogue from it.
# The file is ';'-separated and Latin-1 encoded; malformed lines are skipped.
drive.mount('/content/drive')
df = pd.read_csv(
    'drive/MyDrive/books.csv',
    sep=';',
    on_bad_lines='skip',
    encoding='latin-1',
)

# Discard the columns that are not needed for this analysis
df = df.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L', 'Publisher'])
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv('drive/MyDrive/books.csv',  sep=';', on_bad_lines='skip', encoding='latin-1')


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication
0,195153448,Classical Mythology,Mark P. O. Morford,2002
1,2005018,Clara Callan,Richard Bruce Wright,2001
2,60973129,Decision in Normandy,Carlo D'Este,1991
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999


In [3]:
# Count the missing values in each column
df.isna().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
dtype: int64

In [69]:
# Replace the missing author name with 'Unknown'.
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on df['Book-Author'] acts on an intermediate
# Series, a chained in-place pattern that pandas deprecates and that no
# longer updates df when Copy-on-Write is enabled.
df['Book-Author'] = df['Book-Author'].fillna('Unknown')
df.isnull().sum()  # No null values remain

ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
dtype: int64

In [5]:
# Inspect the distinct year values (ints and strings are mixed)
df.loc[:, 'Year-Of-Publication'].unique()

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984, 0, 1968, 1961, 1958, 1974,
       1976, 1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960,
       1966, 1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954,
       1950, 1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011,
       1925, 1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 2030,
       1911, 1904, 1949, 1932, 1928, 1929, 1927, 1931, 1914, 2050, 1934,
       1910, 1933, 1902, 1924, 1921, 1900, 2038, 2026, 1944, 1917, 1901,
       2010, 1908, 1906, 1935, 1806, 2021, '2000', '1995', '1999', '2004',
       '2003', '1990', '1994', '1986', '1989', '2002', '1981', '1993',
       '1983', '1982', '1976', '1991', '1977', '1998', '1992', '1996',
       '0', '1997', '2001', '1974', '1968', '1987', '1984', '1988',
       '1963', '1956', '1970', '1985', '1978', '1973', '1980'

In [57]:
# Show the rows whose values are shifted one column to the left: a publisher
# name sits in 'Year-Of-Publication' and the year sits in 'Book-Author'.
df[df['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard'])]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,Gallimard
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc


In [70]:
# Locate the rows whose values are shifted: a publisher name ended up in
# 'Year-Of-Publication' and the real year ended up in 'Book-Author'.
shifted_rows = df['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard'])

# Fix the year by moving it back from 'Book-Author' (only on the shifted rows,
# so the value is not hard-coded per publisher).
df.loc[shifted_rows, 'Year-Of-Publication'] = df.loc[shifted_rows, 'Book-Author']

# Set the author to 'Unknown' on those rows only. A column-wide replace of
# '2000'/'2003' would also overwrite any author that legitimately has that text.
df.loc[shifted_rows, 'Book-Author'] = 'Unknown'

# Convert every year to int (the column mixes ints and numeric strings)
df['Year-Of-Publication'] = df['Year-Of-Publication'].astype('int64')

In [8]:
# Distinct year values, in order of first appearance
pd.unique(df['Year-Of-Publication'])

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984,    0, 1968, 1961, 1958, 1974,
       1976, 1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960,
       1966, 1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954,
       1950, 1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011,
       1925, 1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 2030,
       1911, 1904, 1949, 1932, 1928, 1929, 1927, 1931, 1914, 2050, 1934,
       1910, 1933, 1902, 1924, 1921, 1900, 2038, 2026, 1944, 1917, 1901,
       2010, 1908, 1906, 1935, 1806, 2021, 2012, 2006, 1909, 2008, 1378,
       1919, 1922, 1897, 2024, 1376, 2037])

In [71]:
# Build a deterministic key (hash) for a given string.
# This key is used as the author's 'id'.

def generate_hash(name):
    """Return the SHA-256 hex digest of ``name``.

    The same input string always yields the same digest.

    Parameters
    ----------
    name : str
        Text to hash. It is encoded as Latin-1 before hashing, so a character
        outside that charset raises ``UnicodeEncodeError``.

    Returns
    -------
    str
        The digest as a 64-character hexadecimal string.
    """
    # hashlib operates on bytes, so the text is encoded before hashing.
    return hashlib.sha256(name.encode('latin1')).hexdigest()

In [72]:
# Generate one id per row from the author name.
# The column is iterated directly instead of using df.iterrows(), which builds
# a Series object for every row just to read a single field.
ids = [generate_hash(author) for author in df['Book-Author']]

# Add the new 'ID-Author' column
df['ID-Author'] = ids
df

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,ID-Author
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,748f7875338d4e51009a5398b6058476d69d292a4c4acb...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,a42ff6c71dfe7af9c1d14d7f4922a2274ea77cffe12119...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,feacc24e12371e419c54adf6993c682046367b0eb61fcc...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,c065669c53ea1ece426fa1737d27dd4f9e20d8326ead64...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,48225b1dd70c861d87e26e6ccfcd2f3646e608f417f96f...
...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,21d7ce2411416fc10f0dcbb5423ae03d7ba457873de64d...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,be312f718a9b12c7e5930c2902aa66f0e4d540e3b5e691...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,517c861461b9d5627684de1dab8d64c441998634da87ec...
271358,0192126040,Republic (World's Classics),Plato,1996,1554fc76c25040f83ac969a355aee85aed80f408f9469e...


In [81]:
# Save the first 100 rows to a new CSV file
# (to_csv also writes the index as the first, unnamed column by default)
df.iloc[:100].to_csv('drive/MyDrive/books_100_cleaned.csv')

In [None]:
# Save the first 10,000 rows to a new CSV file
# (to_csv also writes the index as the first, unnamed column by default)
df.iloc[:10000].to_csv('drive/MyDrive/books_10k_cleaned.csv')