# Python para Lingüistas

Notebook 6: Formato CSV

Alejandro Ariza

University of Barcelona 2022

#### Formato CSV

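CSV ("comma-separated values", valores separados por comas) es un formato de fichero de texto que se utiliza mucho en programación: cada línea es una fila y los campos de la fila se separan con comas.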

Para procesar un fichero .csv necesitaréis importar la librería "csv".

In [1]:
import csv

In [2]:
# Read the csv file as if it were an ordinary text file.
file_name = "movie_plots.csv"
raw_corpus = []
num_lines = 0

with open(file_name, "r", encoding="utf-8") as inp:
    # enumerate(..., start=1) keeps num_lines equal to the number of lines read so far.
    for num_lines, file_line in enumerate(inp, start=1):
        raw_corpus.append(file_line)
        # Only the first five physical lines are needed for the demonstration.
        if num_lines >= 5:
            break

# It works, but the result is hard to read.
# The last line printed shows that reading the csv as plain text splits one entry in two.
print(raw_corpus, "\n", sep="\n")
print(raw_corpus[3])
print(raw_corpus[4])


['Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot\n', '1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers,"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man\'s bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation\'s face before a group of policemen appear and order everybody to leave.[1]"\n', '1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon,"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon\'s sm

In [3]:
# Read the csv file with the csv library: every row becomes a list of fields.
file_name = "movie_plots.csv"
num_lines = 0
list_corpus = []

# newline="" leaves line-ending handling to the csv module; the csv
# documentation asks for it so that line breaks embedded in quoted fields
# are interpreted correctly.
with open(file_name, "r", encoding="utf-8", newline="") as inp:
    reader = csv.reader(inp)
    for file_row in reader:
        list_corpus.append(file_row)
        num_lines += 1
        # Only the first five rows (header included) are needed here.
        if num_lines >= 5:
            break

# Better than reading raw text: easier to read, and entries/rows/lines are not split.
print(list_corpus)
print("\n")
print(list_corpus[3])
print(list_corpus[4])

[['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot'], ['1901', 'Kansas Saloon Smashers', 'American', 'Unknown', '', 'unknown', 'https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers', "A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"], ['1901', 'Love by the Light of the Moon', 'American', 'Unknown', '', 'unknown', 'https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon', "The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and

In [4]:
# Read the csv file with csv.DictReader: every row becomes a dictionary
# whose keys are the column names taken from the first row of the file.
file_name = "movie_plots.csv"
num_lines = 0
dict_corpus = []

# newline="" leaves line-ending handling to the csv module; the csv
# documentation asks for it so that line breaks embedded in quoted fields
# are interpreted correctly.
with open(file_name, "r", encoding="utf-8", newline="") as inp:
    dict_reader = csv.DictReader(inp)
    for file_row in dict_reader:
        dict_corpus.append(file_row)
        num_lines += 1
        # Only the first five data rows are needed here.
        if num_lines >= 5:
            break

# Better than the list version: now we know what every field means.
print(dict_corpus)
print("\n")
print(dict_corpus[3])
print(dict_corpus[4])


# Specific information about each film can be accessed by field name.
print("\n El título de la tercera entrada es:")
print(dict_corpus[2]["Title"])

[{'Release Year': '1901', 'Title': 'Kansas Saloon Smashers', 'Origin/Ethnicity': 'American', 'Director': 'Unknown', 'Cast': '', 'Genre': 'unknown', 'Wiki Page': 'https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers', 'Plot': "A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"}, {'Release Year': '1901', 'Title': 'Love by the Light of the Moon', 'Origin/Ethnicity': 'American', 'Director': 'Unknown', 'Cast': '', 'Genre': 'unknown', 'Wiki Page': 'https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon', 'Plot': "The moon, painted with a smilin

In [17]:
# Task 1: build the dictionaries by hand
#
# This task practises general programming skills and knowledge of the csv format.
# The goal is to build manually the dictionaries that the csv library creates,
# using only csv.reader().
#
# csv.reader() reads the file row by row and returns each row as a list.
# The task is to turn each of those lists into a dictionary:
# first, process the first row of the file to get the names of all columns/fields;
# then, for every following row, read the values and assign them to the
# matching fields of the dictionary.

file_name = "movie_plots.csv"
num_lines = 0  # reset only; this cell does not limit how many rows it reads
list_corpus = []

# newline="" leaves line-ending handling to the csv module; the csv
# documentation asks for it so that line breaks embedded in quoted fields
# are interpreted correctly.
with open(file_name, "r", encoding="utf-8", newline="") as inp:
    reader = csv.reader(inp)
    for i, line in enumerate(reader):
        if i == 0:
            # The first row holds the column names.
            header = line
        else:
            # Pair every column name with the value at the same position.
            # zip() stops at the shorter sequence, so a row with more values
            # than the header does not raise IndexError the way indexing
            # header[j] for every j in range(len(line)) would.
            example = dict(zip(header, line))
            list_corpus.append(example)

print(list_corpus[:5])

[{'Release Year': '1901', 'Title': 'Kansas Saloon Smashers', 'Origin/Ethnicity': 'American', 'Director': 'Unknown', 'Cast': '', 'Genre': 'unknown', 'Wiki Page': 'https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers', 'Plot': "A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"}, {'Release Year': '1901', 'Title': 'Love by the Light of the Moon', 'Origin/Ethnicity': 'American', 'Director': 'Unknown', 'Cast': '', 'Genre': 'unknown', 'Wiki Page': 'https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon', 'Plot': "The moon, painted with a smilin

In [20]:
# Visualize what enumerate(iterable) produces: (index, value) pairs,
# with the index counting from 0.
a = [45, 89, 3, 2, 1]
list(enumerate(a, start=0))

[(0, 45), (1, 89), (2, 3), (3, 2), (4, 1)]

In [21]:
# Alternative to enumerate(): a for-loop plus a manually updated counter.
result = []
contador = 0

for element in a:
    # Store the current position together with the element, then advance the counter.
    result += [(contador, element)]
    contador = contador + 1

print(result)

[(0, 45), (1, 89), (2, 3), (3, 2), (4, 1)]


In [22]:
# Check whether a number is present in a list with the `in` operator.
# The expression evaluates to True or False.
3 in a

True

In [26]:
# Check whether a number is present in a list with a for-loop.
# Every visited element is printed; once the number is found, True is printed
# and `break` ends the loop so the remaining elements are not visited.
for el in a:
    print(el)
    if el != 3:
        continue
    print(True)
    break

45
89
3
True


In [None]:
# A function definition can give an OPTIONAL parameter a default value.
def func_name(param1, param2, max_df=0.9):
    """Placeholder showing an optional parameter.

    ``param1`` and ``param2`` are required; ``max_df`` may be omitted, in
    which case it takes the default value 0.9. The function does nothing
    and returns None.
    """
    return None