# CSVs

In [None]:
ls sources/

[sources/pycon-2013-talks.csv](sources/pycon-2013-talks.csv)

We could use Python's built-in [CSV module](https://pymotw.com/3/csv/).

But, because we're learning [Pandas](https://pandas.pydata.org/) today anyway, let's use that.

In [None]:
import pandas as pd

In [None]:
pd.read_csv('sources/pycon-2013-talks.csv')

Wait a minute.  That top line is wrong: the file has no header row, so pandas used the first talk as the column names.

In [None]:
pd.read_csv?

In [None]:
header = ['type', 'title', 'fullname', 'firstname', 'lastname']

In [None]:
# Deliberate wrong turn: `header=` expects row number(s), not column names,
# so pandas rejects this call; `names=` is the keyword we actually want.
pd.read_csv('sources/pycon-2013-talks.csv', header=header)

In [None]:
# The CSV has no header row of its own, so supply the column names explicitly.
pd.read_csv('sources/pycon-2013-talks.csv', names=header)

The result is really hard to work with unless you assign it to a name.

In [None]:
# Load the talks from the same `sources/` directory listed at the top of the
# notebook, labelling the columns ourselves because the file has no header row.
talks = pd.read_csv('sources/pycon-2013-talks.csv', names=header)

In [None]:
talks

In [None]:
talks[0]

In [None]:
type(talks)

## Aside

If we want a plain-old Python list of dictionaries...

In [None]:
talks.to_dict('records')

... but we actually want to keep a Pandas DataFrame.

In [None]:
talks.iloc[0]

In [None]:
talks.iloc[0].title

In [None]:
talks.iloc[0].title.upper()

There's much more that's interesting/different about DataFrames, but more on that later.

# Plain-python evaluation

## Looping

In [None]:
for row in talks.itertuples():
    print(row)
    print('\n')

In [None]:
for row in talks.itertuples():
    print(row.title)

In [None]:
# Flag every title once, then tally the flags instead of keeping two counters.
title_mentions_python = ['python' in talk.title.lower() for talk in talks.itertuples()]
mentions_python = sum(title_mentions_python)
does_not_mention_python = len(title_mentions_python) - mentions_python
print(f'{mentions_python} titles mention Python; {does_not_mention_python} do not')

## Title length statistics

In [None]:
lengths = [len(row.title) for row in talks.itertuples()]
lengths

In [None]:
average_len = sum(lengths) / len(lengths)

In [None]:
average_len

In [None]:
print(f'Average length {average_len}, min {min(lengths)}, max {max(lengths)}')

# Applying Pandas

In [None]:
talks.columns

In [None]:
talks.title

In [None]:
talks.title.apply(len)

In [None]:
talks.title.apply(len).max()

In [None]:
pd_lengths = talks.title.apply(len)

In [None]:
print(f'Average length {pd_lengths.mean()}, min {pd_lengths.min()}, max {pd_lengths.max()}')

In [None]:
# Variance of the title lengths (the square of the standard deviation).
pd_lengths.var()

In [None]:
print(f'Standard deviation {pd_lengths.std()}, variance {pd_lengths.var()}')

In [None]:
talks.title.value_counts()

In [None]:
titles = talks.title.value_counts()

# Filter

In [None]:
titles[titles > 1]

In [None]:
## Pass your own functions

In [None]:
def ratio_vowels(phrase):
    """Return the fraction of a phrase's letters that are vowels.

    Only alphabetic characters count as letters; digits, spaces and
    punctuation are ignored.  Matching is case-insensitive, so ``'A'`` and
    ``'a'`` both count as vowels.

    Args:
        phrase: The string to examine.

    Returns:
        ``number of vowels / number of letters`` as a float.

    Raises:
        ZeroDivisionError: If ``phrase`` contains no alphabetic characters.
    """
    vowel_set = {'a', 'e', 'i', 'o', 'u'}
    letters = [char for char in phrase if char.isalpha()]
    # Lower-case each letter before the lookup so capitalised vowels are counted.
    vowels = [letter for letter in letters if letter.lower() in vowel_set]
    return len(vowels) / len(letters)
    
    
    

In [None]:
talks.title.apply(ratio_vowels)

In [None]:
talks['title_vowels'] = talks.title.apply(ratio_vowels)

In [None]:
talks['lastname_vowels'] = talks.lastname.apply(ratio_vowels)

In [None]:
5 / 0

In [None]:
import numpy as np

In [None]:
np.divide(5, 0)

In [None]:
def ratio_vowels(phrase):
    """Return the fraction of a phrase's letters that are vowels.

    Only alphabetic characters count as letters; digits, spaces and
    punctuation are ignored.  Matching is case-insensitive, so ``'A'`` and
    ``'a'`` both count as vowels.

    Args:
        phrase: The string to examine.

    Returns:
        ``number of vowels / number of letters`` as a NumPy float.  When the
        phrase has no letters the result is ``nan`` (with NumPy's default
        error settings a RuntimeWarning is emitted instead of an exception).
    """
    vowel_set = {'a', 'e', 'i', 'o', 'u'}
    letters = [char for char in phrase if char.isalpha()]
    # Lower-case each letter before the lookup so capitalised vowels are counted.
    vowels = [letter for letter in letters if letter.lower() in vowel_set]
    # np.divide turns 0 / 0 into nan rather than raising ZeroDivisionError.
    return np.divide(len(vowels), len(letters))
    

In [None]:
talks['lastname_vowels'] = talks.lastname.apply(ratio_vowels)

In [None]:
talks[['lastname', 'lastname_vowels', 'title_vowels']]

In [None]:
# Correlate only the numeric columns: `talks` also holds text columns
# (type, title, names), which recent pandas versions refuse to correlate.
talks.select_dtypes(include='number').corr()

# Plot with [Altair](https://altair-viz.github.io/)

In [None]:
import altair

In [None]:
altair.Chart(talks).mark_point().encode(
    x='lastname_vowels', y='title_vowels')

# Function on DataFrame

In [None]:
def _first_to_last_length_ratio(talk):
    "Length of a talk's first name divided by the length of its last name."
    first_len = len(talk['firstname'])
    last_len = len(talk['lastname'])
    return np.divide(first_len, last_len)


# axis=1 hands the helper one row (one talk) at a time.
talks['first_last_ratio'] = talks.apply(_first_to_last_length_ratio, axis=1)


In [None]:
talks[['firstname', 'lastname', 'first_last_ratio']]

In [None]:
talks[talks.first_last_ratio == talks.first_last_ratio.max()]

In [None]:
talks.loc[talks.first_last_ratio.idxmax()]