In [None]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

# Introduction to Pandas

## From the Pandas documentation:

Here are just a few of the things that pandas does well:

- Easy handling of **missing data** (represented as NaN) in floating point as well as non-floating point data
- Size mutability: columns can be **inserted and deleted** from DataFrame and higher dimensional objects
- Automatic and explicit **data alignment**: objects can be explicitly aligned to a set of labels, or the user can simply ignore the labels and let Series, DataFrame, etc. automatically align the data for you in computations
- Powerful, flexible **group by** functionality to perform split-apply-combine operations on data sets, for both aggregating and transforming data
- Make it **easy to convert** ragged, differently-indexed data in other Python and NumPy data structures into DataFrame objects
- Intelligent **label-based slicing**, **fancy indexing**, and **subsetting** of large data sets
- Intuitive **merging** and **joining** data sets
- Flexible **reshaping** and **pivoting** of data sets
- **Hierarchical labeling** of axes (possible to have multiple labels per tick)
- **Robust IO tools** for loading data from flat files (CSV and delimited), Excel files, databases, and saving / loading data from the ultrafast HDF5 format
- **Time series**-specific functionality: date range generation and frequency conversion, moving window statistics, moving window linear regressions, date shifting and lagging, etc.

### 10 Minutes Intro to Pandas ###
http://pandas.pydata.org/pandas-docs/stable/10min.html

## Set up Pandas default params

In [None]:
# imports a library 'pandas', names it as 'pd'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.display import Image

# enables inline plots, without it plots don't show up in the notebook
%matplotlib inline

In [None]:
# Report the versions of the two core libraries used throughout this notebook
for label, module in (("Pandas", pd), ("Numpy", np)):
    print(label + " version:", module.__version__)

In [None]:
# Notebook-wide display settings for pandas output
for option_name, option_value in [
    ('display.max_columns', None),  # None = no limit on the number of columns shown
    ('display.max_rows', 25),
    ('display.precision', 3),
]:
    pd.set_option(option_name, option_value)

## Data structures

### 1. Series

One Dimensional Array / Vector of Values (think of these as your data columns).  One important aspect of them is that they carry an "index" (which you can think of as a row indicator).

### 2. Dataframes

Think of DataFrame as a Table with Columns.  This is the workhorse of everything you will do with data analysis.  Learning Pandas and its functions can be challenging, but stick with it and ask questions.  Structurally, a DataFrame can be thought of as a collection of Series objects with the same index.

### 3. [Panel Data](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Panel.html)

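Three Dimensional Arrays  (Mentioned for reference, but we will not get much into these).  Note: `Panel` was deprecated in pandas 0.20 and removed in pandas 0.25, so the link above only applies to older versions; in current pandas, use a DataFrame with a MultiIndex instead.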

In [None]:
# So, what is a Pandas Dataframe

In [None]:
pd.Series?

In [None]:
## Build a Series from a plain Python list; np.nan marks a missing value
s = pd.Series([1, 3, 5, np.nan, 6, 8])

# a bare name as the last line of a cell displays the object
s

In [None]:
pd.DataFrame?

In [None]:
## Make a dataframe from a numpy array
# 6 rows x 4 columns of standard-normal draws, labelled 'A'..'D'.
# A seeded generator is used so the table is identical on every
# Restart Kernel -> Run All (same seed as the df.sample() call later on).
df1 = pd.DataFrame(np.random.RandomState(42).randn(6, 4), columns=list('ABCD'))
df1

In [None]:
## Make a dataframe from a dictionary: one key per column.
## Scalars ('A', 'B', 'F') are repeated for every row; the array-like
## values ('C', 'D', 'E') each supply 4 entries.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df2

## Load a data set -- read_*

### "Census Income" dataset

http://archive.ics.uci.edu/ml/  
pandas can load a lot more than CSVs; this tutorial shows how pandas can read Excel, SQL, and even the clipboard (copy and paste):
http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/

In [None]:
# download the data and name the columns
# `cols` lists the 15 labels to use for the columns of the resulting DataFrame;
# they are handed to read_csv through `names=`.
# NOTE(review): this presumes the remote file has no header row of its own — confirm.
# NOTE(review): the file is fetched over the network every time this cell runs.
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'ethnicity',
        'gender', 'capital_gain', 'capital_loss', 'hours_per_week',
        'country_of_origin', 'income']

df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                       names = cols)

**Q: What's happening in the above cell?**  

## Viewing Data

* .info() 
* .head()
* .tail()
* .columns
* .values
* .dtypes

### info

Displays the Columns, Types, Rows and the memory used by the dataframe

In [None]:
# we can see there are no null values
# columns with numerical values are type int64, no need to set data type

df.info()

### Head

Displays the first few rows in the dataframe

In [None]:
help(df.drop)

In [None]:
df.info()

In [None]:
df.columns

In [None]:
# drop() hands back a new frame without the column; `df` itself is left untouched,
# which the column listing below confirms
newdf = df.drop('hours_per_week', axis='columns')

df.columns

In [None]:
newdf.info()

In [None]:
df.info()

In [None]:
# to view the first 5 or specify with ex: .head(10)
df.head(10)

### Tail

Displays the last few rows in the dataframe

In [None]:
df.tail()

### Sample

Displays a Sample of rows in the dataframe

In [None]:
# head and tail are good.  But sometimes we want to randomly sample data
df.sample(5, random_state=42)

### Columns

Returns the column labels of the dataframe (as a pandas `Index`, which behaves much like a list)

In [None]:
# view all columns of the dataframe
df.columns

### Column Types

Returns the type of each column

In [None]:
df.dtypes

## Rename columns

In [None]:
df.columns

In [None]:
# replace a column name: 'country_of_origin' becomes 'native_country'
# inplace = True changes df itself instead of returning a renamed copy
df.rename(columns = {'country_of_origin' : 'native_country'}, inplace = True)
df.head()

## Descriptives 

* .describe()
* .value_counts()
* .mean()
* .unique()

### Describe

Displays summary statistics for each numerical column

In [None]:
df.describe()

### value_counts

Counts the number of occurrences of each categorical value in the column

In [None]:
df['education']

In [None]:
type(df.education)

In [None]:
df.education.value_counts()

## Also works for numeric columns - treating the individual values as factors

In [None]:
type(df.education.value_counts())

In [None]:
# Horizontal bar chart of the education counts.
# `kind` must be passed by keyword: recent pandas versions raise a TypeError
# when Series.plot is called with positional arguments.
df.education.value_counts().plot(kind='barh')

In [None]:
# Attribute access (df.hours_per_week) and bracket access (df['hours_per_week'])
# refer to the same column, so both lines compute the same mean.
# Only the last expression of a cell is displayed.
df.hours_per_week.mean()

# Can also do:
df['hours_per_week'].mean()

## Comprehension Question:

What do you think we will get if we ask for the `type` of `df.hours_per_week` ?


In [None]:
df[['age', 'capital_gain', 'capital_loss', 'hours_per_week']].describe()

### Unique

Returns the unique values for the column

In [None]:
# there's a space before each string in this data
df.education.unique()

In [None]:
# looks like it's in every object column
df.workclass.unique()

In [None]:
df["education"] = df.education.str.strip()

In [None]:
# Hurray We removed the leading space
df.education.unique()

In [None]:
df.info()

In [None]:
df.education_num.value_counts()

In [None]:
df.gender.unique()

In [None]:
# Remove leading space in values
df["gender"] = df.gender.str.strip()

In [None]:
df.gender.unique()

## Selecting rows and columns 

### .loc 

* Selects rows and columns by Names
* **by label**             `.loc[]`

### .iloc

* Selects rows and columns by Index Position
* **by integer position**  `.iloc[]`


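### .ix

* Redirects to `loc` or `iloc` based on input
* **for both**             `.ix[]`
* **Deprecated:** `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0; in current pandas use `.loc` / `.iloc` instead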

http://pandas.pydata.org/pandas-docs/stable/indexing.html

In [None]:
# select a row
df.iloc[3]

In [None]:
df.iloc[0:3]

## Note: I got 3 rows returned, similar to the indexing that applies to Python lists

In [None]:
# select a range of rows
df.iloc[10:15]

In [None]:
# last 2 rows
df.iloc[-2:]

In [None]:
# selecting every other row in columns 3-5
df.iloc[::2, 2:5].head()

In [None]:
# select a range of rows by label
df.loc[0:3]

## Question:
Why did I get 4 rows above here instead of 3?





Integers vs. labels!

In [None]:
(df.loc[0:2, 'age'])

In [None]:
# `.ix` was removed in pandas 1.0. This is the explicit equivalent of df.ix[0:2, 2:6]:
# rows 0 through 2 by label (inclusive), columns at integer positions 2 through 5
df.loc[0:2, df.columns[2:6]]

## Recall:
`df.ix` is primarily label-based, but "falls back" to integer-based (if the columns or index are not numerical).

## Filtering

In [None]:
(df.age > 50)

In [None]:
# Keep the first 5 rows where age > 50, then display them
# (an assignment on its own produces no cell output)
asd = df[df.age > 50].head(5)
asd

In [None]:
# Filter for only certain Columns
df.loc[df.age > 50, ['age', 'education', 'occupation', 'gender', 'income']]

# What happens if I try to do the same with df.iloc instead of df.loc?
# What about df.ix?

In [None]:
df[df.age > 50].head(4)

## Now Filter on Gender

In [None]:
df.gender=='Male'

In [None]:
# Rows where gender is 'Male', keeping every column (":")
df.loc[df.gender == 'Male', :]

## Now Filter on Gender and Age between 30 and 40

In [None]:
(df.gender == 'Male') & (df.age>=30) & (df.age<=40)





In [None]:
# Build the combined boolean mask once, then use it to select the matching rows.
# Each comparison needs its own parentheses because & binds more tightly than
# ==, >= and <=.
male_30_to_40 = (df.gender == 'Male') & (df.age >= 30) & (df.age <= 40)

df.loc[male_30_to_40, :]

## Find Nulls

In [None]:
# as we saw with df.info() there are no nulls... 
# but if there were this would find the rows where age is null
df[df.age.isnull()]

## Fill nulls

In [None]:
null_df = pd.DataFrame([1,2,4,np.nan], columns = ['column1'])

In [None]:
null_df

In [None]:
# you can also fill nulls with a value or string
null_df.column1.fillna(1000)

In [None]:
# fillna does not do it inplace unless you specify
null_df

In [None]:
# you can also fill null with the median or mean value of the column
null_df.fillna(null_df.column1.median(), inplace=True)
null_df

In [None]:
# null_df no longer contains a NaN after the in-place fill above, so calling
# fillna on it would change nothing. Rebuild the original frame to show that a
# null can also be filled with a string.
null_df_text = pd.DataFrame([1,2,4,np.nan], columns = ['column1']).fillna('random_string')
null_df_text

## Drop nulls

In [None]:
null_df = pd.DataFrame([1,2,4,np.nan], columns = ['column1'])
null_df

In [None]:
null_df.dropna(how = 'any')

In [None]:
# .isnull() and .notnull() do opposite things:
# isnull() flags the missing entries, notnull() flags the present ones
null_df.isnull()

In [None]:
null_df.notnull()

## Groupby

In [None]:
df.groupby('relationship').count()

In [None]:
# How to groupby column and apply a function like sum, count, or mean
# numeric_only=True restricts the mean to the numeric columns; without it,
# pandas >= 2.0 raises a TypeError when it reaches the string (object) columns
df.groupby(['education']).mean(numeric_only=True)

In [None]:
# Mean of two selected numeric columns for every (education, age) combination
(df
 .groupby(['education', 'age'])[['hours_per_week', 'capital_gain']]
 .mean()
)

In [None]:
# Several group keys and several aggregations in one go.
# The keys nest in the order they are listed: income is the outer level,
# native_country the inner one.
(df
 .groupby(['income', 'native_country'])['age']
 .agg(['count', 'mean'])
)

In [None]:
# combine groupby with boolean
# The leading space in ' United-States' is intentional: only `education` and
# `gender` were stripped in the cells above, so native_country values still
# start with a space.
us_only = df.native_country == ' United-States'
df.loc[us_only].groupby(['education'])['hours_per_week'].mean()

## Sorting
* ### sort_index() to sort by index
* ### sort_values() to sort by values

In [None]:
# groupby income and country, take the mean of every numeric column per group,
# then sort the groups by their mean age (ascending, across all groups)
# numeric_only=True is required on pandas >= 2.0, where mean() raises a TypeError
# on the string (object) columns instead of skipping them
df_grouped = df.groupby(['income','native_country']).mean(numeric_only=True).sort_values('age', ascending = True)
df_grouped

# Note: In this example, the groupby, mean, and sort functions are strung together in one line
# in the next example, we will show a different syntax so that you could write them on separate
# lines to make the code a little easier to read

In [None]:
# We want to group people by their income and country
# Then sort them by their income ASC, and then sort by average age within that group DESC
(df
 .groupby(['income','native_country'])
 .mean(numeric_only=True)  # pandas >= 2.0 raises on the string columns without this
 .reset_index()
 .sort_values(['income','age'], ascending=[True,False])
)

# Note: In this example, we sort by the SAME column which we grouped by earlier
# (eg. we first groupby 'income' and then sort by 'income')
# After the groupby, 'income' and 'native_country' form the index of the result rather
# than regular columns; .reset_index() turns them back into columns so that 'income'
# can be passed to sort_values alongside 'age'