# EPA1333 - Computer Engineering for Scientific Computing
## Week 6 - Oct 10, 2017

**Python Data Science Handbook**

*Jake VanderPlas*

In [None]:
# Show the book cover. `Image` is imported from the public `IPython.display`
# module rather than the internal `IPython.core.display` path.
from IPython.display import Image
Image('https://covers.oreillystatic.com/images/0636920034919/lrg.jpg')

### Cheat Sheets

[Mark Graph](http://markthegraph.blogspot.nl/) has created some cheat sheets for Python, Matplotlib and Pandas. You can download them from here: http://bit.ly/python_cs


## Pandas

Built on top of NumPy, Pandas provides high-level data structures and manipulation tools 
for data analysis. 

  * Labeled axes
  * Arithmetic operations and reductions
  * Flexible handling of missing data
  * Time Series

The main data structures are: **Series** and **DataFrame**



### Documentation
  * http://pandas.pydata.org/
  * http://pandas.pydata.org/pandas-docs/stable/10min.html
  
  * Google


In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import random

## Series

A one-dimensional array-like object and an index.

`s = pd.Series( [1, 2, 3, 4], index=['A', 'B', 'C', 'D'])`


#### Creating a Series

In [None]:
# Build a Series from a plain list, supplying the row labels explicitly
labels = list('ABCD')
s = pd.Series([1, 2, 3, 4], index=labels)
s

In [None]:
# With a numpy array.
s = pd.Series( np.random.randn(5) )
s

In [None]:
# From a dict: the keys become the index, the values become the data
cities = dict(zip(['Amsterdam', 'Utrecht', 'The Hague', 'Rotterdam'],
                  [1000, 2000, 3000, 4000]))
s = pd.Series(cities)
s

In [None]:
# Name the index and the series
s.name = 'Code'
s.index.name = 'Cities'
s

### Accessing Series

In [None]:
# Accessing the index
s.index

In [None]:
# Accessing the values
s.values

In [None]:
# Indexing on position with .iloc. Result is a value.
# (Plain s[1] on a label-indexed Series is deprecated: integer keys are
# going to be treated as labels, so positions must be requested explicitly.)
s.iloc[1]

In [None]:
# Slicing on position with .iloc. Result is another Series.
# The end position is EXCLUDED, as with ordinary Python slices.
s.iloc[1:3]

In [None]:
# indexing on label (index)
s['Rotterdam']

In [None]:
# Slicing on label
# Note, when slicing on label the slice is INCLUDING the end.
s[:'Rotterdam']

In [None]:
# Be careful when label is integers too... 
s2 = pd.Series( range(5), index=[3,2,4,0,1])
s2

In [None]:
# This will select the label, not the position
s2[1]

In [None]:
# Slicing will select on position!
s2[1:3]

In [None]:
# Fancy indexing
s[ ['Utrecht', 'Amsterdam'] ]

In [None]:
# Check for values
s.isin([1000, 1500, 2500, 3000, 'Monkey'])

In [None]:
# Check for index 
'Utrecht' in s, 'Maastricht' in s

### Filtering and Functions

In [None]:
# Filtering and NumPy universal functions work as expected
s[ s >= 2500 ]

In [None]:
s * 2.5

In [None]:
np.log( s )

In [None]:
# Count the values in a Series
s2 = pd.Series( np.random.randint(10, size=100) )
s2.value_counts( sort=False )

In [None]:
# Unique values only
s2 = pd.Series( ['a','b','c','a','d','b','g','d','g','g','c','e'])
s2.unique()

### NaN values

NaN values (Not a Number) represent missing values or NA values in the Pandas library.  


In [None]:
s2 = pd.Series( [ 1500, 500, np.nan, 10], index=['Maastricht', 'Groningen', 'Assen', 'Haarlem'])
s2

In [None]:
# Check for NaN values 
pd.isnull(s2) 

#s2.isnull()    # As an object method also works

#pd.notnull(s2)

### Assignment

In [None]:
# Change a value
s2['Haarlem'] = 3500
s2

In [None]:
# Add a new value
s2['Utrecht'] = 222
s2['Amsterdam'] = 333
s2['Rotterdam'] = 444
s2

In [None]:
# Addition is done based on index. NaN if no matching index found.
s + s2

## DataFrame

A DataFrame is a table/spreadsheet with rows and columns that can contain different types of data.

### Creation

In [None]:
# Using a matrix
df = pd.DataFrame( np.random.random( size=(3,4) ), 
                      columns=list("ABCD"), 
                      index=['first', 'second','third'] )
df

In [None]:
# Using a dictionary: every key becomes a column, every value its data
row_labels = ['first', 'second', 'third']
d = {
    'A': range(30, 37, 3),
    'B': np.random.random(3),
    'C': [random.random() for _ in row_labels],
    'D': ['John', 'Mary', 'Jane'],
}
df = pd.DataFrame(d, index=row_labels)
df

In [None]:
# Generic info
df.info()

In [None]:
# Statistics on the numerical data
df.describe()

In [None]:
df.columns

In [None]:
df.index

In [None]:
df.values


### Accessing

In [None]:
# Selecting column, result is Series
df['B']

In [None]:
# Alternative, result is Series
df.B

In [None]:
# Fancy indexing, result is dataframe
df[ ['D','B'] ]

#df[ ['B'] ]

In [None]:
# Selecting a row by position, result is a Series
# (.ix has been removed from pandas; .iloc selects by position)
df.iloc[1]

In [None]:
# Slicing rows by position, result is a DataFrame (end position excluded)
df.iloc[ 1:2 ]

In [None]:
# Selecting a single element by position: [row, column]
df.iloc[1, 2]

In [None]:
# Positional slicing on both axes: rows 1-2, first three columns
df.iloc[1:3, :3 ]

In [None]:
# Fancy indexing, using positions
# Warning: watch out if labels are integers too!
# df.iloc always selects by position, df.loc always selects by label.
# (The old df.ix guessed between the two and has been removed from pandas.)
df.iloc[:2, [3,0,2]]

In [None]:
# Fancy indexing, using labels!
# With .loc a label slice INCLUDES its end label ('second').
df.loc[ :'second', ['D', 'A', 'C'] ]

### Filtering

In [None]:
# Let's print df again
df

In [None]:
# Selecting values. Non-matching values are replaced by NaN.
# Only the numeric columns are compared: ordering the strings in column D
# against a float raises a TypeError.
numeric = df.drop('D', axis=1)
numeric[numeric < 0.5]

In [None]:
# Selecting rows based on a Series of booleans
df[ df.D == 'Mary']

### Assignment

In [None]:
# Changing a value, addressed by row label and column label
# (.loc replaces the removed .ix indexer)
df.loc['first','A'] = 28
df

In [None]:
# Adding a new column
df['E'] = 500
df['F'] = np.sqrt(df['A'])
df['G'] = df['B'] / df['C']
df

In [None]:
# Reindexing a dataframe with reindex
# Change the order of the index / columns and add new ones.
# Labels that do not exist yet ('fourth', 'H') are added and filled with NaN.
df.reindex(index=['third','first','second','fourth'],
           columns=['D','E','A','H','F','B','C','G'])

In [None]:
# Dropping entries. Drop a row
df.drop(['second'], axis=0)

In [None]:
# Drop a column (or 2)
df.drop( ['B','F'], axis=1)

### Arithmetic

NumPy universal functions still work as expected. Aggregation as well.

In [None]:
# Two small integer frames whose row and column labels only partly overlap
df1 = pd.DataFrame(
    np.arange(9).reshape(-1, 3),
    index=["John", "Mary", "Peter"],
    columns=["a", "b", "c"],
)
df2 = pd.DataFrame(
    np.arange(12).reshape(-1, 3),
    index=["Anne", "John", "Mary", "Zach"],
    columns=["a", "c", "d"],
)


In [None]:
df1

In [None]:
df2

In [None]:
# Addition matches both index and column, NaN for all other index, column combinations
df1 + df2

In [None]:
# Addition with fill-values
# A label present in only one frame uses fill_value for the missing operand;
# NaN appears only where the value is missing from both df1 and df2.
df1.add(df2, fill_value=100)

In [None]:
# Universal functions, element wise

np.sqrt(df2)

In [None]:
# Aggregation, sum per column
df2.sum()

In [None]:
# sum per row
df2.sum(axis=1)

In [None]:
# Cumulative sums
df2.cumsum(axis=0)

Other functions that are available include:
  * count - count non-NA values
  * min/max
  * argmin, argmax - location of min/max value (Series)
  * idxmin, idxmax - index values of min/max (DataFrame)
  * sum, mean, var, std
  * cumsum, cumprod, cummin, cummax - cumulative

## Sorting and Rank

Sorting can be done on rows (index) and columns.

  * sort_index
  * sort_values
  

In [None]:
df

In [None]:
# Sort rows
df.sort_index(ascending=False)

In [None]:
# Sort rows, based on the values in a column
df.sort_values( by='B', ascending=True )

In [None]:
# Sort columns
df.sort_index(ascending=False, axis=1)

In [None]:
# Sorting series on index
s = pd.Series( np.random.randint( 10, size=5) )
s

In [None]:
s.sort_index( ascending=False )

In [None]:
# Sorting the values of the Series
s.sort_values()

In [None]:
# Ranking values
# method = average, first, min, max 
s.rank(method='average')

In [None]:
# Ranking in a dataframe
# First drop column D as it contains strings.
df.drop('D', axis=1).rank(method='first', axis=1)

### Handling missing data

Missing data is represented as NaN. There are ways to detect and deal with these
values.
  * isnull() / notnull() - test for NaN
  * dropna() - leave out NaN
  * fillna() - replace NaN with a value

In [None]:
# np.nan is NumPy's missing-value marker; keep the short alias NA.
# (The capitalised `numpy.NaN` alias was removed in NumPy 2.0, so it is no
# longer imported here.)
NA = np.nan

d = pd.Series( [1, NA, 3.5, NA, 7])

d.dropna()

In [None]:
d[d.notnull()]

In [None]:
# dropna on dataframes drops entire rows/columns
# Build a 3x4 frame and punch holes in it. Row and column labels are the
# default integers, so .loc addresses them by label (.ix has been removed).
df = pd.DataFrame( np.random.rand(3,4) )
df.loc[:, 2] = np.nan     # whole column 2
df.loc[0, 2:] = np.nan    # row 0, columns 2 and onwards
df.loc[2, 0] = np.nan     # single cell
df

In [None]:
# Drop entire row/column (axis) / how="all" or "any"
df.dropna(axis=1, how="all")

In [None]:
# Filling in data

df.fillna( 1000 )

In [None]:
# Or fill in with data from the dataframe
# Fill data per row or column (axis),
# forward with ffill() or backward with bfill().
# (fillna(method=...) is deprecated in favour of these dedicated methods.)
df.ffill(axis=1)

In [None]:
# Use a mean value (of each column)
df.fillna(df.mean())

In [None]:
df

## Data Reading / Writing

There are easy ways to read/write dataframes. Read the manual for all the possible
arguments that you can provide to ease parsing the data.

  * read_csv
  * read_excel
  * read_clipboard
  
  * to_csv
  * to_excel
  * to_clipboard

In [None]:
# Show the raw file contents. Reading the file in Python works on every
# platform, unlike the shell commands `!type` (Windows) or `!cat` (Linux/Mac).
with open('csv_example.csv') as f:
    print(f.read())

In [None]:
# Also supports URLs directly.
df = pd.read_csv('csv_example.csv', delimiter=',', skipinitialspace=True)
df

In [None]:
# Handy if you copy from the web
pd.read_clipboard(sep=",")

In [None]:
df.to_csv()

## Data Wrangling (Ch 7)

Combining and merging data sets

  * merge
  * concat
  

In [None]:
# Lookup table: which province does each city belong to?
city_names = [
    'Amsterdam', 'Utrecht', 'Haarlem', 'Maastricht',
    'Rotterdam', 'Groningen', 'Assen', 'The Hague',
]
province_names = ['NH', 'UT', 'NH', 'LI', 'ZH', 'GR', 'DR', 'ZH']

# Pair every city with its province, one row per pair
cities = pd.DataFrame(list(zip(city_names, province_names)),
                      columns=['City', 'Province'])
cities

In [None]:
# A few extra people, written as one (name, city, age) record per row
people_rows = [
    ('Alice', 'Haarlem', 23),
    ('Steven', 'Assen', 33),
    ('Karen', 'Rotterdam', 40),
]
df2 = pd.DataFrame(people_rows, columns=["Name", "City", "Age"])
df2

In [None]:
# Concat will concatenate 2 dataframes
# Watch the index!
people = pd.concat( [df,df2], ignore_index=True, axis=0)
people

Merge will combine 2 dataframes by combining rows from both dataframes if they match
on a certain column value (or set of column values).

What happens to rows in either dataframe if no matching row is found depends on the "how" argument:
  * inner: only rows that match will be in the result (default)
  * outer: rows in either dataframe that don't match are also included in the result (with NaN)
  * left: rows in the left dataframe that don't match are also included in the result
  * right: rows in the right dataframe that don't match are also included in the result


In [None]:
# We can combine people and cities... merge
# on - what columns should match?
# how - what should be done with non-matching entries? (left, right, outer, inner)
pd.merge( people, cities, on="City", how="outer" )


## Plotting (Ch 8)

matplotlib can create plots. However, Series and DataFrames provide shortcuts
to calling matplotlib.

  * Series.plot
  * DataFrame.plot
  
Any arguments are passed through to matplotlib.


In [None]:
s = pd.Series( np.random.rand(10).cumsum() )
s.index.name="X-Axis-label"
s.plot(kind="line", title="Series Plot", label="line", legend=True )

In [None]:
# Four random walks of ten steps each; both axes carry a name that the
# plot can use for its axis label and legend title
walks = np.random.randn(10, 4).cumsum(axis=0)
row_index = pd.Index(np.arange(0, 100, 10), name="Index Name")
col_index = pd.Index(list("ABCD"), name="Column Names")
df = pd.DataFrame(walks, index=row_index, columns=col_index)
df

In [None]:
df.plot(kind="line", style="-o", title="DataFrame Plot")

## Example: students and grades again



In [None]:
# Students, each with a randomly drawn six-digit student number
names = [
    'John', 'Mary', 'Jane', 'Anne', 'Peter', 'Bob', 'Carol',
    'Zach', 'David', 'Iris', 'Sarah', 'Charlie', 'Eve', 'George',
]
studnr = np.random.randint(100000, 999999, size=len(names))

student = pd.DataFrame({'Name': names}).assign(Studnr=studnr)

In [None]:
student

In [None]:
# Function to generate a set of grades
def gen_grades( ids, nrgrades, course ):
    """Generate random grades for a random subset of students.

    Parameters
    ----------
    ids : array-like
        Student numbers to draw from. Lists and tuples are accepted as well
        as NumPy arrays.
    nrgrades : int
        Number of students that receive a grade. Students are drawn without
        replacement, so every student appears at most once.
    course : str
        Course code stored in every row of the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'Studnr', 'Course' and 'Grade', one row per chosen student.
        Grades lie between 4.0 and 10.0, rounded to one decimal.

    Raises
    ------
    ValueError
        Raised by np.random.choice when nrgrades exceeds len(ids).
    """
    # Fancy indexing below needs an array; a plain list would fail.
    ids = np.asarray(ids)

    # Pick the positions of the students that get a grade
    ids_index = np.random.choice(len(ids), size=nrgrades, replace=False )

    # Find the corresponding studnr
    chosen_ids = ids[ ids_index ]

    # Uniform grades in [4, 10), rounded to one decimal
    grades = np.round( np.random.rand(nrgrades)*6 + 4, 1 )

    grade_table = pd.DataFrame( {'Studnr':chosen_ids, 'Course':course, 'Grade': grades },
                    columns=['Studnr', 'Course', 'Grade'])

    return grade_table

In [None]:
# Generate 4 test grade results for three courses.
# Each table gets its grade column named after the test it belongs to.
all_grades = []

for course in ['EPA101', 'EPA202', 'EPA303']:
    for test in ['Test1','Test2','Test3','Test4']:
        grades = gen_grades( studnr, 12, course).rename(columns={'Grade': test})
        all_grades.append(grades)

In [None]:
len(all_grades)

In [None]:
all_grades[0]

In [None]:
# Merge the four test tables of the first course into one DataFrame.
# An outer merge keeps students that miss one or more of the tests.
(
    all_grades[0]
    .merge(all_grades[1], how="outer")
    .merge(all_grades[2], how="outer")
    .merge(all_grades[3], how="outer")
)


In [None]:
# Let's do them all: outer-merge the consecutive test tables of each course.
# The loop bounds follow the length of all_grades instead of a fixed
# range(0, 9, 4), so adding a course does not require editing this cell.
TESTS_PER_COURSE = 4
grades_per_course = []

for i in range(0, len(all_grades), TESTS_PER_COURSE):
    g = all_grades[i]
    for extra in all_grades[i+1 : i+TESTS_PER_COURSE]:
        g = g.merge(extra, how="outer")
    grades_per_course.append(g)
    

In [None]:
len(grades_per_course)

In [None]:
grades_per_course[0]

In [None]:
# Now append them all together (concatenate)
result = pd.concat( grades_per_course, ignore_index=True)
result.tail(10)

In [None]:
result[result.Course=='EPA101']

In [None]:
# Merge the 2 student names and grades together
grades=pd.merge(student,result,how="left")
grades.tail(10)

In [None]:
# Add a column with the mean of the tests.
# .loc selects the label range Test1..Test4 (inclusive); .ix has been removed.
# mean() skips NaN, so missed tests do not count towards the average.
grades['Final'] = grades.loc[:,'Test1':'Test4'].mean(axis=1)
grades.head(10)

In [None]:
# Create extra column 'Passed': True when the final grade is at least 5.8.
# The comparison itself already yields booleans (NaN compares as False).
grades['Passed'] = grades['Final'] >= 5.8
grades.head(10)

In [None]:
# How many passed/failed?
# (The top-level pd.value_counts is deprecated; use the Series method.)
grades['Passed'].value_counts()

In [None]:
grades['Final'].mean()

In [None]:
grades['Final'].plot()