# Introduction to Pandas

Pandas documentation: http://pandas.pydata.org/pandas-docs/stable/ (homepage: https://pandas.pydata.org/)

Notes from Chapter 3 of https://jakevdp.github.io/PythonDataScienceHandbook/

In [None]:
import numpy as np
import pandas as pd
np.__version__, pd.__version__

In [None]:
# show
def show(data, show_data=0):
    """Print a short structural summary of a pandas object.

    Prints the object's type, index and shape; for a DataFrame (or any
    DataFrame subclass) the columns are printed as well.

    Parameters
    ----------
    data : object exposing ``index``, ``shape`` and ``values``
        In this notebook: a pandas Series or DataFrame.
    show_data : truthy/falsy, default 0
        When truthy, ``data.values`` is printed too.
    """
    print("  Type: {:}".format(type(data)))
    print(" Index: {:}".format(data.index))
    # isinstance (not an exact type comparison) so that DataFrame
    # subclasses also get their columns reported.
    if isinstance(data, pd.DataFrame):
        print("Columns: {:}".format(data.columns))
    print(" Shape: {:}".format(data.shape))
    if show_data:
        print("  Data: {:}".format(data.values))

## Pandas Series Objects

In [None]:
# Create a Pandas Series object - uses the default (implicit) index
d1 = pd.Series([0.25, 0.5, 0.75, 1.0])
show(d1, 1)

In [None]:
d1

In [None]:
# So, what exactly is a Pandas Series?
# See the data types for the Series and its components
type(d1), type(d1.values), type(d1.index)

In [None]:
# What is this pandas.core.series?
dir(pd.core.series)

In [None]:
# pandas.core indexes?
dir(pd.core.indexes)

In [None]:
# pandas.core.indexes.range
dir(pd.core.indexes.range)

Note that we did not explicitly define an index -- the index was created implicitly when we created the Series object.

### Series as Generalized NumPy Arrays

In [None]:
# Series as generalized NumPy arrays
# Create the Series with the specified (explicit) index values
# The "generalization" here is that we are defining a custom index.
d2 = pd.Series([0.25, 0.5, 0.75, 1.0],
               index=['a', 'b', 'c', 'd'])
show(d2, 1)

In [None]:
d2

In [None]:
# Positional access: this is the 4th element, at (implicit) position 3.
# .iloc is used because d2 has a string index; a plain integer key such as
# d2[3] being treated as a position is deprecated in pandas 2.x.
d2.iloc[3]

In [None]:
# Slicing - Note that element 3 is not included (as we expect)
d2[1:3]

In [None]:
# Using the explicit index, we directly access the values using the defined index (similar to a dictionary)
d2['d']

In [None]:
# Note that element 'c' is included here - More on this below (loc, iloc)
d2['b':'c']

In [None]:
# Note that the indices don't have to be numerical or "alphabetic" in the sequence-sense
pets = pd.Series(['dog', 'cat', 'fish', 'hamster'],
               index=['best', 'worst', 'useless', 'why'])
show(pets, 1)
pets

In [None]:
# access single object
pets['useless'], type(pets['useless'])

In [None]:
# slice
pets['useless':]

In [None]:
type(pets['useless':])

In [None]:
pets['best':'useless']

### Series as Generalized Python Dictionaries

In [None]:
# Creating a Series from a dictionary - keys -> Index; values -> Values.  
# Note that Pandas used to sort the Series by the dictionary keys.
# As of version 0.23 it maintains the order specified in the
# dict definition (see the documentation for details).
population_dict = {'California': 38332521,
                   'Texas'     : 26448193,
                   'New York'  : 19651127,
                   'Florida'   : 19552860,
                   'Illinois'  : 12882135}
population = pd.Series(population_dict)
show(population, 1)

In [None]:
population

In [None]:
# Note again that the Pandas object no longer stores the keys in sorted order -- compare with:
sorted(population_dict.keys())

In [None]:
# Looks like a dictionary ....
population['Illinois']

In [None]:
# but with list-like slicing
population['Florida':'Illinois']

In [None]:
# along with "regular" slicing
population[3:5]

In [None]:
# Create a second series with geographical area
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
show(area,1)
area

In [None]:
# So, now we have two series that use the same index for different categories
# of values (population, area)
print("{:}\n\n{:}".format(population, area))
# Hopefully the direction is clear --> let's combine these using meaningful names.

## Pandas DataFrame Objects

Definition - Slides

In [None]:
# DataFrame is the ticket
states = pd.DataFrame({'population': population,'area': area})
show(states,1)

In [None]:
# Or in a user-friendly display
states

In [None]:
# Closer look at the DataFrame components and respective data types
states.index

In [None]:
states.columns

In [None]:
states.values

In [None]:
type(states), type(states.index), type(states.columns), type(states.values)
# Note here that both the index and the column sequence are defined as index objects

In [None]:
# extract one of the two Series using its column name
states['area']

In [None]:
# now the other Series ...
states['population']

In [None]:
# and the respective types
type(states), type(states['area']), type(states['population'])

In [None]:
# let's add another column -- this time of non-numeric data
color_dict = {'California': 'blue', 'Texas': 'red', 'New York': 'blue',
             'Florida': 'purple', 'Illinois': 'purple'}
color = pd.Series(color_dict)
show(color,1)
color

In [None]:
# add our new column to the DataFrame -- assign the Series itself (not
# color.values) so rows are matched by index label rather than by position
states['color'] = color
show(states,1)

In [None]:
# user-friendly form
states

In [None]:
# since NumPy arrays are homogeneous, the values array is now of type 'object'
states.values, type(states.values)

In [None]:
# but the individual series datatypes reflect the series data types.
states['area'].values, states['population'].values, states['color'].values

## Pandas Index Objects

From VP 03.01 - "The index object ... can be thought of as an immutable array or an ordered set (technically as a multi-set, as Index objects can have repeat values)."

In [None]:
ind1 = states.index
ind1, type(ind1)

In [None]:
# Array-type operations
ind1[2:]

In [None]:
ind1[-2:]

In [None]:
ind1[::2]

In [None]:
# immutable -- Index objects do not support item assignment, so this cell
# is expected to raise a TypeError (that is the point of the demo)
ind1[1] = 'Alabama'

In [None]:
# Set operations -- union, intersection, etc.
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

In [None]:
# intersection in pandas < 2.0.  NOTE: since pandas 2.0 the & operator on an
# Index is an element-wise logical/bitwise AND, not a set operation -- use
# indA.intersection(indB) for the set intersection.
indA & indB

In [None]:
indA.intersection(indB)

In [None]:
# union in pandas < 2.0.  NOTE: since pandas 2.0 the | operator on an
# Index is an element-wise logical/bitwise OR, not a set operation -- use
# indA.union(indB) for the set union.
indA | indB

In [None]:
indA.union(indB)

In [None]:
# symmetric difference in pandas < 2.0.  NOTE: since pandas 2.0 the ^ operator
# on an Index is an element-wise logical/bitwise XOR, not a set operation --
# use indA.symmetric_difference(indB) for the set symmetric difference.
indA ^ indB

In [None]:
indA.symmetric_difference(indB)

## Indexers - loc and iloc

In [None]:
# Consider this example from VanderPlas
data = pd.Series(['a', 'b', 'c', 'd'], index=[1, 3, 5, 7])
show(data)
data

In [None]:
# Explicit index - data[1] looks up the label 1 (the first element); with
# zero-based positional indexing we would have expected the second element.
data[1]

In [None]:
# Implicit indexing when slicing - now we get the second and third 
# elements.
data[1:3]

This can cause confusion -- when is explicit indexing used and when is implicit indexing used ... Hence, loc and iloc.

In [None]:
# loc - always uses the explicit index 
data.loc[1]

In [None]:
data.loc[1:3]

In [None]:
# iloc - always uses the implicit Python-style index
data.iloc[1]

In [None]:
data.iloc[1:3]

In [None]:
# Note that even with non-integer indexes the slicing behavior is still different
data1 = pd.Series(['a', 'b', 'c', 'd'], index=['h','i','j','k'])
data1

In [None]:
data1[1:3]

In [None]:
data1.iloc[1:3]

In [None]:
data1.loc['i':'k']

## Data selection in a DataFrame

In [None]:
# Recreate our states DataFrame ...
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
color = pd.Series({'California': 'blue', 'Texas': 'red',
                 'New York': 'blue', 'Florida': 'red',
                 'Illinois': 'purple'})
data = pd.DataFrame({'area':area, 'pop':pop, 'color':color})
show(data)
data

In [None]:
# dictionary style indexing -- picking columns
data['area']

In [None]:
# attribute-style access works when the column name is a valid Python
# identifier and is not a reserved word (nor an existing DataFrame attribute)
data.area

In [None]:
# What is the type of the column that is returned?
type(data.area)

In [None]:
# lists of columns -- note that you can specify the order and note
# the double brackets.
data[['pop', 'area']]

In [None]:
# and what is the type of the returned object?
type(data[['pop', 'area']])

In [None]:
# Add a new series using existing series
data['density'] = data['pop'] / data['area']
data
# Note the ufunc (universal function) behavior

In [None]:
# loc and iloc do element selection and slicing (rows)
data.iloc[1,2]

In [None]:
data.iloc[:3, :2]

In [None]:
data.iloc[1:3]

In [None]:
# Oops -- not using numerical index!  loc is label-based and this index holds
# state names, so an integer slice is expected to raise a TypeError here.
data.loc[1:3]

In [None]:
# Notice that this slice includes the "ending row" where the
# iloc-based slicing did not (it used standard Python slicing)
data.loc['Texas':'Florida']

In [None]:
# Masking
data[data.density > 100]
# what does 'data.density > 100' return (it's an expression that
# is evaluated)?

In [None]:
# when in doubt, check it out!
data.density > 100

In [None]:
data[(data.density > 100) & (data.area < 150000)]

In [None]:
# Masking + column selection
data.loc[data.density > 100, ['color', 'density']]

In [None]:
# let's look at the object
type(data.loc[data.density > 100, ['color', 'density']])

In [None]:
# let's create a named slice
high_density = data.loc[data.density > 100, ['color', 'density']]
high_density

In [None]:
# and change something
high_density.iloc[0,0] = "purple"
high_density

In [None]:
# what about the original?
data

Hmmmm ... So the slice is a copy and not a view.  What are the rules here (for Pandas slices)?
https://stackoverflow.com/questions/23296282/what-rules-does-pandas-use-to-generate-a-view-vs-a-copy

## See Data Indexing and Selection Examples notebook for more examples of indexing and data selection.

## Handling Missing Data

In [None]:
data = pd.Series([1, np.nan, 'hello', None])
show(data)
data

In [None]:
data.isnull()

In [None]:
# masking
data[data.notnull()]

In [None]:
# remove the missing data elements
data.dropna()

In [None]:
# dropna() created a new Series object -- the original is unchanged
data