# Introduction to Pandas

Pandas documentation: http://pandas.pydata.org/pandas-docs/stable/ (homepage: https://pandas.pydata.org/)

Notes from Chapter 3 of https://jakevdp.github.io/PythonDataScienceHandbook/

In [None]:
import numpy as np
import pandas as pd
np.__version__, pd.__version__

In [None]:
def show(data, show_data=0):
    """Print a short structural summary of a pandas Series or DataFrame.

    Prints the index, the columns (DataFrames only) and the shape of
    ``data``.  When ``show_data`` is truthy, ``data.values`` is printed too.

    Args:
        data: Object exposing ``index``, ``shape`` and ``values``; in this
            notebook a ``pd.Series`` or a ``pd.DataFrame``.
        show_data: Truthy to also print ``data.values``.  Defaults to 0.
    """
    print(" Index: {:}".format(data.index))
    # isinstance (rather than an exact type comparison against the private
    # pd.core.frame path) so DataFrame subclasses list their columns too.
    if isinstance(data, pd.DataFrame):
        print("Columns: {:}".format(data.columns))
    print(" Shape: {:}".format(data.shape))
    if show_data:
        print(data.values)

## Pandas Series Objects

In [None]:
# Build a Series from a plain list; no index is passed, so pandas
# supplies the default one.
d1 = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
show(d1, show_data=1)
d1

In [None]:
# A Series generalizes a NumPy array: here the index is specified
# explicitly as string labels.
d2 = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=['a', 'b', 'c', 'd'],
)
show(d2, show_data=1)
d2

In [None]:
# Look up a single element of d1 through its index.
d1[3]

In [None]:
# Integer slice of d1: like a Python list slice, the end point is excluded.
d1[1:3]

In [None]:
# Look up a single element of d2 by its string label.
d2['a']

In [None]:
# Slice d2 by label; unlike an integer slice, a label slice includes its
# end point ('d').
d2['b':'d']

In [None]:
# Creating a Series from a dictionary.  Older pandas releases sorted the
# resulting index by dictionary key; pandas >= 0.23 (on Python >= 3.6)
# keeps the dictionary's insertion order instead.
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
show(population, 1)
population

In [None]:
# Looks like a dictionary: a value is fetched by its key (the index label) ....
population['Illinois']

In [None]:
# but with list-like slicing.  A label slice picks everything between the
# two labels in index order, so sort the index first: on an index kept in
# dictionary insertion order 'Illinois' comes after 'Texas' and the slice
# would be empty.
population.sort_index()['Illinois':'Texas']

In [None]:
# along with "regular" slicing by integer range (end point excluded)
population[2:5]

In [None]:
# Area per state, keyed by state name, turned into a Series.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(data=area_dict)
show(area, show_data=1)
area

## Pandas DataFrame Objects

In [None]:
# Combine the two Series into one DataFrame; each dictionary key becomes
# a column name.
states = pd.DataFrame(
    data={
        'population': population,
        'area': area,
    },
)
show(states, show_data=1)
states

In [None]:
# Row labels of the DataFrame.
states.index

In [None]:
# Column labels of the DataFrame.
states.columns

In [None]:
# Dictionary-style access: select the 'area' column.
states['area']

In [None]:
# Compare the type of the whole frame with the type of a single column
# selected from it.
type(states), type(states['area'])

## Pandas Index Objects

In [None]:
# Keep a reference to the DataFrame's row index and display it.
ind1 = states.index
ind1

In [None]:
# An Index slices like a list: everything from position 2 onward.
ind1[2:]

In [None]:
# The last two entries.
ind1[-2:]

In [None]:
# Every second entry, starting with the first.
ind1[::2]

In [None]:
# Two integer indexes that share some entries, used for the set
# operations -- union, intersection, etc.
indA, indB = pd.Index([1, 3, 5, 7, 9]), pd.Index([2, 3, 5, 7, 11])

In [None]:
# Use the explicit method: since pandas 2.0 the `&` operator on Index
# objects is an element-wise logical operation, not a set intersection.
indA.intersection(indB)  # intersection

In [None]:
# Use the explicit method: since pandas 2.0 the `|` operator on Index
# objects is an element-wise logical operation, not a set union.
indA.union(indB)  # union

In [None]:
# Use the explicit method: since pandas 2.0 the `^` operator on Index
# objects is an element-wise logical operation, not a set operation.
indA.symmetric_difference(indB)  # symmetric difference

## Indexers - loc, iloc, ix (note: ix was removed in pandas 1.0)

In [None]:
# Example Series whose explicit integer labels (1, 3, 5) differ from the
# implicit positions (0, 1, 2).
data = pd.Series(
    data=['a', 'b', 'c'],
    index=[1, 3, 5],
)
data

In [None]:
# Explicit index - data[1] looks up the *label* 1, i.e. the first
# element.  With zero-based positional indexing we would have expected
# the second element of the list instead.
data[1]

In [None]:
# Implicit (positional) indexing when slicing - now we get the second and
# third elements, not the entries labelled 1 through 3.
data[1:3]

In [None]:
# loc - always uses the explicit index (here: the entry labelled 1)
data.loc[1]

In [None]:
# Label-based slice: labels 1 through 3, end point included.
data.loc[1:3]

In [None]:
# iloc - always uses the implicit Python-style index (here: position 1,
# the second element)
data.iloc[1]

In [None]:
# Position-based slice: positions 1 and 2, end point excluded.
data.iloc[1:3]

In [None]:
# ix - hybrid label/position indexer.  It was deprecated in pandas 0.20
# and removed in pandas 1.0, so calling it raises AttributeError on
# current releases.  On an integer index such as this one, .ix treated
# integers as labels, which is exactly what .loc does.
data.loc[1]

In [None]:
# .ix was removed in pandas 1.0; on an integer index it sliced by label,
# so .loc is the equivalent (labels 1 through 3, end point included).
data.loc[1:3]

## Data selection in a DataFrame

In [None]:
# Rebuild the per-state area and population Series, then combine them
# into a single two-column DataFrame.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})
data = pd.DataFrame(data={'area': area, 'pop': pop})
data

In [None]:
# Dictionary-style access to the 'area' column.
data['area']

In [None]:
# Attribute-style access to the 'area' column.
data.area

In [None]:
# Add a new column computed from existing ones: the division is applied
# element-wise, giving one density value per row.
data['density'] = data['pop'] / data['area']
data

In [None]:
# Single value at row position 1, column position 2.
data.iloc[1,2]

In [None]:
# First three rows and first two columns, selected by position.
data.iloc[:3, :2]

In [None]:
# Masking: keep the rows whose density exceeds 100 and, of those rows,
# only the 'pop' and 'density' columns.
data.loc[data.density > 100, ['pop', 'density']]

In [None]:
# An integer slice on a DataFrame selects rows (positions 1 and 2), not
# columns.
data[1:3]

In [None]:
# A boolean mask also selects rows: those with density above 100.
data[data.density > 100]

## Handling Missing Data

In [None]:
# A Series mixing ordinary values with two missing-value markers:
# np.nan and None.
data = pd.Series([1, np.nan, 'hello', None])

In [None]:
# Boolean mask that is True wherever a value is missing.
data.isnull()

In [None]:
# Display the Series itself.
data

In [None]:
# Keep only the entries that are not missing, using a boolean mask.
data[data.notnull()]

In [None]:
# Drop the missing entries with the dedicated method; the result is a new
# Series and data itself is not modified.
data.dropna()

## Pandas MultiIndex

In [None]:
# Define the index as a list of (state, year) tuples: every state paired
# with each of the two years.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]
index

In [None]:
# now redefine as a multiindex: the list of tuples is converted into a
# pd.MultiIndex, rebinding the name `index`
index = pd.MultiIndex.from_tuples(index)
index

In [None]:
# One population figure per entry of `index`, listed in index order, then
# wrapped in a Series that uses `index` as its index.
populations = [
    33871648,
    37253956,
    18976457,
    19378102,
    20851820,
    25145561,
]
pop = pd.Series(data=populations, index=index)
pop

In [None]:
# DataFrame with the `pop` Series as the 'total' column plus an 'under18'
# column given as a plain list, matched to the rows by position.
pop_df = pd.DataFrame(
    data={
        'total': pop,
        'under18': [
            9267089,
            9284094,
            4687374,
            4318033,
            5906301,
            6879014,
        ],
    },
)
pop_df

In [None]:
# Select the 'total' column.
pop_df['total']

In [None]:
# First row, selected by position.
pop_df.iloc[0]

In [None]:
# Second row, selected by position.
pop_df.iloc[1]

In [None]:
# Partial indexing on the first MultiIndex level: all rows for California.
pop_df.loc['California']

In [None]:
# Partial indexing on the first MultiIndex level: all rows for Texas.
pop_df.loc['Texas']

In [None]:
# Select on the second index level: the year-2000 entry for every state.
pop[:,2000]

In [None]:
# Boolean mask: keep the entries with a population above 22 million.
pop[pop > 22000000]