In [2]:
import os

import pandas as pd
from pandas import Series, DataFrame

import numpy as np
import seaborn as sns

%pylab inline

Populating the interactive namespace from numpy and matplotlib


### 1. Importing Data

pandas has functions such as **`read_csv, read_table, read_fwf`** and **`read_clipboard`** for reading tabular data as a DataFrame object. These functions take as arguments the following options:

* Which columns to consider?   
    * Use the file's header row (the default), declare that the file has none (`header=None`), or provide column names (`names=`)

* Type inference and conversion
    * Processing dates, combining date and time

* Which column serves as the index? (`index_col=`)
    * For a hierarchical index, pass a list of column names
    
* Which values to interpret as missing data (`na_values=`)
    * If there are multiple sentinels for missing data, pass a list; pass a dictionary to use different sentinels for each column

* If the file is too large, read chunks iteratively (`nrows=` and `chunksize=`)

* Skipping over rows/footer (`skiprows=`)

* Interpreting decimal numbers (points or commas to mark thousands)

In [3]:
os.getcwd()
path = os.getcwd()

In [4]:
# Load the Titanic training set from the working directory.
# os.path.join picks the right path separator on every platform.
titanic = pd.read_csv(os.path.join(path, 'train.csv'))

In [5]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


-------------------------------------------------------------------------------------------------------------------------------

In [6]:
pd.merge?

In [None]:
from IPython.display import Image
Image("https://adiyatmubarak.files.wordpress.com/2016/01/sql.jpg")


# 7.1 Merge
pandas.merge is similar to the _SQL join_ operations; it links rows of tables using one or more _keys_

Syntax:

`merge(df1, df2, 
       how='left', on='key', left_on=None, right_on=None, 
       left_index=False, right_index=False, 
       sort=True, copy=True,
       suffixes=('_x', '_y'))`


The syntax includes specifications of the following arguments

* **Which column to merge on;** 
    * the `on='key'` if the same key is in the two DFs, 
    * or `left_on='lkey', right_on='rkey'` if the keys have different names in the DFs 
    * Note: To merge on multiple keys, pass a list of column names
 
 
* **The nature of the join;** 
    * the `how=` option, with `left`, `right`, `outer`
    * By default, the merge is an `inner` join
    
 
* Tuple of string values to append to **overlapping column names** to identify them in the merged dataset
    * the `suffixes=` option
    * defaults to `('_x', '_y')`
    
 
* If you wish **to merge on the DF index**, pass `left_index=True` or `right_index=True` or both.


* Sort the result DataFrame by the join keys in lexicographical order or not;
    * `sort=` option; defaults to `False` in `pd.merge`. Sorting the join keys costs time, so leave it off unless you need the result ordered by key
    
    
    
> _Note:_ For the **official Documentation** refer http://pandas.pydata.org/pandas-docs/dev/merging.html

In [7]:
# Let's define a few toy datasets to use as examples

df0 = DataFrame({'key': ['a', 'b', 'c', 'd', 'e'], 'data0': np.random.randint(0, 100, 5)})
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': np.random.randint(0, 100, 7)})
df2 = DataFrame({'key': ['a', 'b', 'd', 'f', 'g'], 'data2': np.random.randint(0, 100, 5)})

df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data3': np.random.randint(0, 100, 7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'], 'data4': np.random.randint(0, 100, 3)})

print df0, '\n\n', df1, '\n\n', df2, '\n\n', df3, '\n\n', df4

   data0 key
0     56   a
1     35   b
2     29   c
3     12   d
4     96   e 

   data1 key
0     93   b
1     54   b
2     98   a
3     23   c
4     72   a
5     92   a
6     64   b 

   data2 key
0     62   a
1     14   b
2     40   d
3     52   f
4     74   g 

   data3 lkey
0     68    b
1     62    b
2     73    a
3     41    c
4     95    a
5     40    a
6     11    b 

   data4 rkey
0     71    a
1     91    b
2     44    d


In [8]:
# 1. Default merge with no parameters
# First show the key values shared by df0 and df2 -- the rows an inner join keeps
print np.intersect1d(df0.key, df2.key)

print pd.merge(df0, df2)
# We see that it's an inner join by default (output keys are the intersection of the input keys).
# The merge happens on the column 'key', the only column name common to both datasets;
# we could have written pd.merge(df0, df2, on='key', how='inner') to the same effect.

['a' 'b' 'd']
   data0 key  data2
0     56   a     62
1     35   b     14
2     12   d     40


In [9]:
# Left Join
print pd.merge(df0, df2, how='left')

   data0 key  data2
0     56   a   62.0
1     35   b   14.0
2     29   c    NaN
3     12   d   40.0
4     96   e    NaN


In [10]:
print np.intersect1d(df1.columns, df4.columns)

pd.merge(df1, df4)
# would yield an error because there are no matching column names to merge on 

[]


MergeError: No common columns to perform merge on

In [11]:
# 2. Specifying which columns to merge on (if keys have different names in datasets)

print np.intersect1d(df1.key, df4.rkey)

pd.merge(df1, df4, left_on='key', right_on='rkey')
# still an inner join!

['a' 'b']


Unnamed: 0,data1,key,data4,rkey
0,93,b,91,b
1,54,b,91,b
2,64,b,91,b
3,98,a,71,a
4,72,a,71,a
5,92,a,71,a


In [14]:
# 3. Specifying which type of join: outer
print set(df1.key.tolist() + df2.key.tolist())
print np.union1d(df1.key, df2.key)


pd.merge(df1, df2, how='outer')
# the merged dataset will have a union of the keys, imputing NaNs where values aren't found

set(['a', 'c', 'b', 'd', 'g', 'f'])
['a' 'b' 'c' 'd' 'f' 'g']


Unnamed: 0,data1,key,data2
0,93.0,b,14.0
1,54.0,b,14.0
2,64.0,b,14.0
3,98.0,a,62.0
4,72.0,a,62.0
5,92.0,a,62.0
6,23.0,c,
7,,d,40.0
8,,f,52.0
9,,g,74.0


In [17]:
# 3.1 Try out a left join
print df1.key.unique()
pd.merge(df1, df2, how='left')

# value 'c' is absent in df2, so there will be a NaN in column data2

['b' 'a' 'c']


Unnamed: 0,data1,key,data2
0,93,b,14.0
1,54,b,14.0
2,98,a,62.0
3,23,c,
4,72,a,62.0
5,92,a,62.0
6,64,b,14.0


In [23]:
# Add a column with the same name to df1 and df2
df1['col_new'] = np.random.randn(7)
df2['col_new'] = np.random.randn(5)

print df1
print '\n\n', df2, '\n\n'

   data1 key   col_new
0     93   b -1.680196
1     54   b  0.004271
2     98   a  0.517509
3     23   c  1.041373
4     72   a  1.534074
5     92   a  1.080713
6     64   b  1.005952


   data2 key   col_new
0     62   a -0.730144
1     14   b  0.163369
2     40   d  0.146430
3     52   f  1.533043
4     74   g  0.545806 




In [24]:
pd.merge(df1, df2, on='key', how='outer')

Unnamed: 0,data1,key,col_new_x,data2,col_new_y
0,93.0,b,-1.680196,14.0,0.163369
1,54.0,b,0.004271,14.0,0.163369
2,64.0,b,1.005952,14.0,0.163369
3,98.0,a,0.517509,62.0,-0.730144
4,72.0,a,1.534074,62.0,-0.730144
5,92.0,a,1.080713,62.0,-0.730144
6,23.0,c,1.041373,,
7,,d,,40.0,0.14643
8,,f,,52.0,1.533043
9,,g,,74.0,0.545806


In [25]:
# Specifying suffixes to identify columns with the same name
print pd.merge(df1, df2, on='key', suffixes=['_df1', '_df2'])

   data1 key  col_new_df1  data2  col_new_df2
0     93   b    -1.680196     14     0.163369
1     54   b     0.004271     14     0.163369
2     64   b     1.005952     14     0.163369
3     98   a     0.517509     62    -0.730144
4     72   a     1.534074     62    -0.730144
5     92   a     1.080713     62    -0.730144


In [26]:
df1['key2'] = list('xy' * 3 + 'z')
df2['key2'] = list('xy' * 2 + 'z')
print df1
print '\n\n', df2, '\n\n'

   data1 key   col_new key2
0     93   b -1.680196    x
1     54   b  0.004271    y
2     98   a  0.517509    x
3     23   c  1.041373    y
4     72   a  1.534074    x
5     92   a  1.080713    y
6     64   b  1.005952    z


   data2 key   col_new key2
0     62   a -0.730144    x
1     14   b  0.163369    y
2     40   d  0.146430    x
3     52   f  1.533043    y
4     74   g  0.545806    z 




In [27]:
print zip(df1.key, df1.key2)
print zip(df2.key, df2.key2)

[('b', 'x'), ('b', 'y'), ('a', 'x'), ('c', 'y'), ('a', 'x'), ('a', 'y'), ('b', 'z')]
[('a', 'x'), ('b', 'y'), ('d', 'x'), ('f', 'y'), ('g', 'z')]


In [28]:
pd.merge(df1, df2, on=['key', 'key2'], suffixes=('_1', '_2'))

Unnamed: 0,data1,key,col_new_1,key2,data2,col_new_2
0,54,b,0.004271,y,14,0.163369
1,98,a,0.517509,x,62,-0.730144
2,72,a,1.534074,x,62,-0.730144


In [None]:
df3.reset_index()

In [None]:
# Set lkey to be the index of df3

df3.set_index('lkey', inplace=True)
print df3

# Note: Do this only once. Re-running set_index will produce errors. You'll have to reset index before you can set it again.

In [None]:
df2

In [None]:
# We specify that for the left df we will use the column called 'key' and for the right df, we will use its index to merge
pd.merge(df2, df3, how='left', left_on='key', right_index=True)

------------------------------------------------------------------------------------------------------------------------------

## Merging Multiple DataFrame at once

In [29]:
# Fresh toy frames for the multi-frame merge examples: every frame has the join
# column 'key' plus one column of random integers in [0, 100).
df0 = DataFrame({'key': list('abcde'),
                 'data0': np.random.randint(0, 100, size=5)})
df1 = DataFrame({'key': list('bbacaab'),
                 'data1': np.random.randint(0, 100, size=7)})
df2 = DataFrame({'key': list('abdfg'),
                 'data2': np.random.randint(0, 100, size=5)})
df3 = DataFrame({'key': list('bxacaab'),
                 'data3': np.random.randint(0, 100, size=7)})
df4 = DataFrame({'key': list('ybdfa'),
                 'data4': np.random.randint(0, 100, size=5)})

In [30]:
# Option 1 : Nesting
pd.merge(df2, pd.merge(df0, df1))

Unnamed: 0,data2,key,data0,data1
0,92,a,24,39
1,92,a,24,32
2,92,a,24,30
3,33,b,94,14
4,33,b,94,16
5,33,b,94,66


In [31]:
reduce(lambda x, y: x + y, range(1, 11))

55

In [50]:
reduce(lambda x, y: pd.merge(x, y, how='left'), [df0, df1, df2, df3, df4])

Unnamed: 0,data0,key,data1,data2,data3,data4
0,24,a,39.0,92.0,76.0,81.0
1,24,a,39.0,92.0,95.0,81.0
2,24,a,39.0,92.0,77.0,81.0
3,24,a,32.0,92.0,76.0,81.0
4,24,a,32.0,92.0,95.0,81.0
5,24,a,32.0,92.0,77.0,81.0
6,24,a,30.0,92.0,76.0,81.0
7,24,a,30.0,92.0,95.0,81.0
8,24,a,30.0,92.0,77.0,81.0
9,94,b,14.0,33.0,61.0,19.0


<big>Merge - Pandas function <br>
JOIN - DataFrame Method (used to join multiple series/dataframe together)

In [36]:
# Show the documentation of DataFrame.join. Calling the method on the class
# with no arguments (`DataFrame.join()`) only raises a TypeError.
help(DataFrame.join)

In [51]:
# %timeit 
df0.set_index('key').join(map(lambda df: df.set_index('key'), [df1, df2, df3, df4]))

Unnamed: 0_level_0,data0,data1,data2,data3,data4
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,24.0,39.0,92.0,76.0,81.0
a,24.0,39.0,92.0,95.0,81.0
a,24.0,39.0,92.0,77.0,81.0
a,24.0,32.0,92.0,76.0,81.0
a,24.0,32.0,92.0,95.0,81.0
a,24.0,32.0,92.0,77.0,81.0
a,24.0,30.0,92.0,76.0,81.0
a,24.0,30.0,92.0,95.0,81.0
a,24.0,30.0,92.0,77.0,81.0
b,94.0,14.0,33.0,61.0,19.0


In [None]:
print type(pd.merge)
print type(DataFrame.join)

In [None]:
DataFrame.join?

# 7.2 The `.join()` method

`.join` is a convenient **DataFrame method** for combining many DataFrame objects with the same or similar indexes but non-overlapping columns into a single result DataFrame.

By default, the `join` method performs a _left join_ on the join keys.

For simple **index-on-index merges** we can pass a list of DataFrames to `join`.

In [None]:
# Toy frame: 8 rows labelled 'a'..'h' and 4 integer columns 'W'..'Z'
df = DataFrame(np.random.randint(0, 50, 32).reshape(8, 4), columns=list('WXYZ'), index=list('abcdefgh'))

# Two partially overlapping row slices with disjoint columns.
# `.ix` is deprecated (and removed in pandas 1.0). On this string index its integer
# slices were positional, so pick the row labels by position and select with `.loc`.
df1 = df.loc[df.index[2:], ['W', 'X']]   # rows 'c'..'h'
df2 = df.loc[df.index[:5], ['Y', 'Z']]   # rows 'a'..'e'

# Single-argument print works the same on Python 2 and Python 3
print('{0}\n\n{1}'.format(df1, df2))

In [None]:
# Default actions is a left join on the indexes
df1.join(df2)

In [None]:
%timeit df1.join(df2)

In [None]:
pd.merge(df1, df2, how='left', right_index=True, left_index=True)

In [None]:
%timeit pd.merge(df1, df2, how='left', right_index=True, left_index=True)

In [None]:
# We can alter the nature of the join by passing how=
print df1.join(df2, how='outer')

In [None]:
# Create a couple more DFs whose rows come from the same index as `df`.
# `.ix` is deprecated (and removed in pandas 1.0); its integer slices were positional
# here, so pick the row labels by position and select with `.loc`.
df3 = df.loc[df.index[0:3], ['X', 'Z']]   # first three rows
df3.columns = ['P', 'Q']                  # new names, so they don't clash with W/X/Y/Z

df4 = df.loc[df.index[4:6], ['W']]        # fifth and sixth rows
df4.columns = ['R']

# Single-argument print works the same on Python 2 and Python 3
print('{0}\n\n{1}'.format(df3, df4))

In [None]:
print df1, '\n\n', df2

In [None]:
# Merging multiple DFs with the same index by passing a list of names to .join
print df1.join([df2, df3, df4]).fillna('')

In [None]:
df2.join([df1, df3, df4], how='outer').fillna('')

----

<big>Task 1.</big>

Use join on these.


```python
df_1 = titanic[['Name', 'Age', 'Sex']]
df_2 = titanic[['Name', 'Pclass', 'Fare']]
```

---

<big>Task 2. <br><br>

Define a function called JOINER which accepts any number of dataframes and joins them. <br>
Your function must not fail (no errors can be produced) under any circumstances.

</big>

In [55]:
df_titanic = pd.read_csv('train.csv')
df_1 = df_titanic[['Name', 'Age', 'Sex']]
df_2 = df_titanic[['Name', 'Pclass', 'Fare']]

In [58]:
df_1.set_index('Name').join(df_2.set_index('Name'))[:5]

Unnamed: 0_level_0,Age,Sex,Pclass,Fare
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Braund, Mr. Owen Harris",22.0,male,3,7.25
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",38.0,female,1,71.2833
"Heikkinen, Miss. Laina",26.0,female,3,7.925
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,female,1,53.1
"Allen, Mr. William Henry",35.0,male,3,8.05


In [67]:
def JOINER(*args, **kwargs):
    """Join any number of DataFrames on the columns they all have in common.

    The column names shared by *every* input frame are used as the join key:
    each frame is re-indexed on those columns and the frames are then combined
    with ``DataFrame.join``.

    Parameters
    ----------
    *args : DataFrame
        The frames to join, in order; the first one is the left-most frame.
    **kwargs
        HOW : str, optional
            Join type forwarded to ``DataFrame.join(how=...)``.
            Defaults to ``'left'``, the default of ``DataFrame.join``.

    Returns
    -------
    DataFrame or list
        The joined frame, indexed by the common columns. A single input frame
        is returned unchanged. An empty list is returned (after printing the
        reason) when nothing can be joined: no arguments, an argument that is
        not a DataFrame, no common columns, or a join that pandas rejects.
    """
    # Imported locally so the function also runs on Python 3, where `reduce`
    # is no longer a builtin.
    from functools import reduce

    joined = []
    how = kwargs.get('HOW', 'left')  # a missing HOW used to raise KeyError

    if not args:
        print("There are no DataFrames to join.")
        return joined
    if not all(isinstance(frame, pd.DataFrame) for frame in args):
        print("All arguments must be pandas DataFrames.")
        return joined
    if len(args) == 1:
        # Nothing to join against
        return args[0]

    col_list = [frame.columns.tolist() for frame in args]
    # Columns present in every frame (np.intersect1d returns them sorted)
    pk = reduce(np.intersect1d, col_list).tolist()

    if pk:
        # A list comprehension (not `map`) so the result is indexable on Python 3 too
        list_of_dfs = [frame.set_index(pk) for frame in args]
        try:
            joined = list_of_dfs[0].join(list_of_dfs[1:], how=how)
        except Exception as err:
            # The exercise requires that JOINER never raises: report and fall back
            print("The DataFrames could not be joined: {0}".format(err))
            joined = []
    else:
        print("There are no common columns to join.")

    return joined

In [70]:
JOINER(df0, df1, df2, df3, df4, HOW='outer')

Unnamed: 0_level_0,data0,data1,data2,data3,data4
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,24.0,39.0,92.0,76.0,81.0
a,24.0,39.0,92.0,95.0,81.0
a,24.0,39.0,92.0,77.0,81.0
a,24.0,32.0,92.0,76.0,81.0
a,24.0,32.0,92.0,95.0,81.0
a,24.0,32.0,92.0,77.0,81.0
a,24.0,30.0,92.0,76.0,81.0
a,24.0,30.0,92.0,95.0,81.0
a,24.0,30.0,92.0,77.0,81.0
b,94.0,14.0,33.0,61.0,19.0


In [86]:
# Two frames with repeated keys, used to show the many-to-many behaviour of merge
df0 = DataFrame({'key': ['a', 'a', 'a', 'b', 'c', 'd', 'e'],
                 'vals1': np.random.randint(0, 10, size=7)})
df1 = DataFrame({'key': ['a', 'a', 'b', 'b', 'c', 'c'],
                 'vals2': np.random.randint(0, 10, size=6)})

In [87]:
df0

Unnamed: 0,key,vals1
0,a,2
1,a,6
2,a,0
3,b,4
4,c,8
5,d,3
6,e,7


In [88]:
df1

Unnamed: 0,key,vals2
0,a,6
1,a,5
2,b,9
3,b,0
4,c,8
5,c,1


In [89]:
pd.merge(df0, df1, on='key')

Unnamed: 0,key,vals1,vals2
0,a,2,6
1,a,2,5
2,a,6,6
3,a,6,5
4,a,0,6
5,a,0,5
6,b,4,9
7,b,4,0
8,c,8,8
9,c,8,1


------------------------------------------------------------------------------------------------------------------------------

In [None]:
pd.concat?

# 7.3 Concatenating DataFrames 
## - (aka binding, stacking, union all)

### a. Series objects with no index overlap
    * concat with axis=0 (default) will append the Series (~rbind)
    * concat with axis=1 will merge the Series to produce a DF (~outer join)

In [90]:
# Create toy Series with overlapping indices
s1 = Series(np.random.randn(4), index=list('abcx'), name='chintu')
s2 = Series(np.random.randn(6), index=list('ycdefg'), name='sonu')
s3 = Series(np.random.randn(5), index=list('cfghi'), name='monu')

print s1, '\n\n S2:\n', s2, '\n\n S3:\n', s3

a   -0.352807
b   -0.761518
c   -1.030246
x    0.339533
Name: chintu, dtype: float64 

 S2:
y   -1.601997
c    1.387587
d    0.414205
e    0.532228
f   -1.037770
g    0.017787
Name: sonu, dtype: float64 

 S3:
c   -0.805904
f    1.891518
g   -0.137295
h   -0.740409
i   -0.467219
Name: monu, dtype: float64


In [91]:
# Default action is to append the data
pd.concat([s1, s2, s3], axis=0)

a   -0.352807
b   -0.761518
c   -1.030246
x    0.339533
y   -1.601997
c    1.387587
d    0.414205
e    0.532228
f   -1.037770
g    0.017787
c   -0.805904
f    1.891518
g   -0.137295
h   -0.740409
i   -0.467219
dtype: float64

In [92]:
# concat with axis=1 aligns the Series on their index labels (an outer join by default):
# labels shared by several Series (here 'c', 'f' and 'g') end up in one row,
# and labels missing from a Series are filled with NaN
print pd.concat([s1, s2, s3], axis=1)

     chintu      sonu      monu
a -0.352807       NaN       NaN
b -0.761518       NaN       NaN
c -1.030246  1.387587 -0.805904
d       NaN  0.414205       NaN
e       NaN  0.532228       NaN
f       NaN -1.037770  1.891518
g       NaN  0.017787 -0.137295
h       NaN       NaN -0.740409
i       NaN       NaN -0.467219
x  0.339533       NaN       NaN
y       NaN -1.601997       NaN


In [None]:
# NOTE(review): this looks like an unfinished tab-completion. `pd.Series` has no
# attribute `jo`, so the cell raises AttributeError on Restart & Run All --
# complete the expression or remove the cell.
pd.Series.jo

---

<big> TASK 3 ||  Write a version of the `concat` function that fails gracefully. 

---

### The `keys=` option

In [93]:
# Passing keys= creates a hierarchical index when appending (axis=0)
pd.concat([s1, s2, s3], axis=0, keys=[s.name for s in [s1, s2, s3]])

chintu  a   -0.352807
        b   -0.761518
        c   -1.030246
        x    0.339533
sonu    y   -1.601997
        c    1.387587
        d    0.414205
        e    0.532228
        f   -1.037770
        g    0.017787
monu    c   -0.805904
        f    1.891518
        g   -0.137295
        h   -0.740409
        i   -0.467219
dtype: float64

In [94]:
# Passing keys= gives names to columns when using axis=1
print pd.concat([s1, s2, s3], axis=1, keys=[s.name[0].upper() for s in [s1, s2, s3]])

          C         S         M
a -0.352807       NaN       NaN
b -0.761518       NaN       NaN
c -1.030246  1.387587 -0.805904
d       NaN  0.414205       NaN
e       NaN  0.532228       NaN
f       NaN -1.037770  1.891518
g       NaN  0.017787 -0.137295
h       NaN       NaN -0.740409
i       NaN       NaN -0.467219
x  0.339533       NaN       NaN
y       NaN -1.601997       NaN


### b. Series objects with overlapping index

* If there is an overlap on indexes, we can specify **`join=`** to intersect the data
    * Note that the `join=` option takes only `'inner'` and `'outer'`

In [95]:
s4 = Series(np.random.randn(5), index=list('abcde'), name='S4')
print s4

a   -1.668432
b   -0.139904
c    0.262970
d    1.436704
e   -0.404103
Name: S4, dtype: float64


In [96]:
s1

a   -0.352807
b   -0.761518
c   -1.030246
x    0.339533
Name: chintu, dtype: float64

In [97]:
# concat with overlapping index (default join type is outer)
print pd.concat([s1, s4], axis=1)

     chintu        S4
a -0.352807 -1.668432
b -0.761518 -0.139904
c -1.030246  0.262970
d       NaN  1.436704
e       NaN -0.404103
x  0.339533       NaN


In [98]:
# if we specify a join type, this will be equivalent to a merge
print pd.concat([s1, s4], axis=1, join='inner')

     chintu        S4
a -0.352807 -1.668432
b -0.761518 -0.139904
c -1.030246  0.262970


### c. DataFrame objects

The same logic extends here

In [99]:
# Create toy dataframes with non-overlapping indexes
df1 = DataFrame(np.random.randn(3, 3), index=list('abc'), columns=list('XYZ')) 
df2 = DataFrame(np.random.randn(2, 2), index=list('pq'), columns=list('XZ'))
print df1, '\n\n', df2

          X         Y         Z
a  0.596043  0.140270 -0.217888
b -0.002343  0.858121  2.146465
c -0.571256  0.166585 -0.215421 

          X         Z
p  0.633566  0.112313
q  1.376006 -0.358565


In [100]:
# No overlapping index
print 'When axis=0 \n'
print pd.concat([df1, df2])

When axis=0 

          X         Y         Z
a  0.596043  0.140270 -0.217888
b -0.002343  0.858121  2.146465
c -0.571256  0.166585 -0.215421
p  0.633566       NaN  0.112313
q  1.376006       NaN -0.358565


In [101]:
print '\n When axis=1 \n'
print pd.concat([df1, df2], axis=1)

### NEVER DO THIS.


 When axis=1 

          X         Y         Z         X         Z
a  0.596043  0.140270 -0.217888       NaN       NaN
b -0.002343  0.858121  2.146465       NaN       NaN
c -0.571256  0.166585 -0.215421       NaN       NaN
p       NaN       NaN       NaN  0.633566  0.112313
q       NaN       NaN       NaN  1.376006 -0.358565


In [102]:
# Create toy dataframes with overlapping indexes
df1 = DataFrame(np.random.randn(9).reshape(3, 3), index=list('abc'), columns=list('XYZ')) 
df2 = DataFrame(np.random.randn(4).reshape(2, 2), index=list('ac'), columns=list('XZ'))
print df1, '\n\n', df2

# When axis=0 the rows are still simply stacked: the shared labels 'a' and 'c'
# appear twice in the result, and column Y is NaN for the rows that come from df2
pd.concat([df1, df2])

          X         Y         Z
a  0.407096  0.591862 -0.293056
b -1.602185 -0.049404  1.402603
c -0.118487  1.522990 -0.697165 

          X         Z
a  0.308889  0.679124
c -1.147507 -0.148984


Unnamed: 0,X,Y,Z
a,0.407096,0.591862,-0.293056
b,-1.602185,-0.049404,1.402603
c,-0.118487,1.52299,-0.697165
a,0.308889,,0.679124
c,-1.147507,,-0.148984


In [103]:
# Overlapping indexes will be merged
pd.concat([df1, df2], axis=1)

Unnamed: 0,X,Y,Z,X.1,Z.1
a,0.407096,0.591862,-0.293056,0.308889,0.679124
b,-1.602185,-0.049404,1.402603,,
c,-0.118487,1.52299,-0.697165,-1.147507,-0.148984


In [104]:
pd.concat([df1, df2], axis=1, keys=['df_1', 'df_2'])
# This will create a hierarchical index

Unnamed: 0_level_0,df_1,df_1,df_1,df_2,df_2
Unnamed: 0_level_1,X,Y,Z,X,Z
a,0.407096,0.591862,-0.293056,0.308889,0.679124
b,-1.602185,-0.049404,1.402603,,
c,-0.118487,1.52299,-0.697165,-1.147507,-0.148984


------------------------------------------------------------------------------------------------------------------------------

# 7.4 $Reshaping$ using `stack()` and `unstack()`

Hierarchical Indexing provides a convenient way to reshape data;
    * `stack` pivots the columns into rows
    * `unstack` pivots rows into columns



In [105]:
# Create a toy DF with a Hierarchical Index
tuples = zip(list('AB'*2), list('CDEF'))
multix = pd.MultiIndex.from_tuples(tuples, names=['one', 'two'])

df = DataFrame(np.random.randn(4, 4), index=multix, columns=list('WXYZ')).round(2)
print df

            W     X     Y     Z
one two                        
A   C   -0.12 -0.20  2.16  0.72
B   D   -0.39 -0.10  0.31 -0.51
A   E   -1.28  0.01 -1.09 -0.11
B   F   -1.29  2.19  0.24  1.68


---
<big>
Use stack when you want to convert a DataFrame into a Series with a hierarchical index

In [106]:
df.stack()
# 1 column

one  two   
A    C    W   -0.12
          X   -0.20
          Y    2.16
          Z    0.72
B    D    W   -0.39
          X   -0.10
          Y    0.31
          Z   -0.51
A    E    W   -1.28
          X    0.01
          Y   -1.09
          Z   -0.11
B    F    W   -1.29
          X    2.19
          Y    0.24
          Z    1.68
dtype: float64

In [109]:
DataFrame(np.random.randint(1,50, 9).reshape(3, 3), 
          columns=list('abc'), 
          index=list('xyz')).stack()

x  a    17
   b    34
   c     3
y  a    45
   b     3
   c    15
z  a     1
   b    45
   c    49
dtype: int64

---
<big> Use `unstack` when you have a Series with a hierarchical index and want to convert it 
into a DataFrame

In [110]:
multix

MultiIndex(levels=[[u'A', u'B'], [u'C', u'D', u'E', u'F']],
           labels=[[0, 1, 0, 1], [0, 1, 2, 3]],
           names=[u'one', u'two'])

In [111]:
Series([1, 2, 3, 4], index=multix)

one  two
A    C      1
B    D      2
A    E      3
B    F      4
dtype: int64

-----------------------------------------------------------------------------------------------------------------------------

In [None]:
print type(DataFrame.pivot)
print type(pd.pivot_table)

In [None]:
DataFrame.pivot?

In [None]:
pd.pivot_table?

# 7.5 Converting data from 'long' to 'wide' format using `.pivot()`

Usually, for convenience, data in relational DB is stored in the **long format**
    * fewer columns, label duplication in keys

For certain kinds of analysis, we might prefer to have the data in the **wide format**
    * more columns, unique labels in keys

The `df.pivot()` method takes the names of columns to be used as row (`index=`) and column indexes (`columns=`) and a column to fill in the data as (`values=`)


In [117]:
df = DataFrame({'date': (list(pd.date_range('2000-01-03', '2000-01-05')) * 4),
          'item': (list('ABCD'*3)),
          'status': (np.random.randn(12))})
print df

         date item    status
0  2000-01-03    A -1.756575
1  2000-01-04    B  0.163075
2  2000-01-05    C -0.372767
3  2000-01-03    D -1.751664
4  2000-01-04    A -1.299162
5  2000-01-05    B  0.107179
6  2000-01-03    C  1.314607
7  2000-01-04    D -0.908350
8  2000-01-05    A  0.910836
9  2000-01-03    B  0.689691
10 2000-01-04    C -0.440248
11 2000-01-05    D -1.098606


In [120]:
print df.set_index(['date', 'item']).unstack()

              status                              
item               A         B         C         D
date                                              
2000-01-03 -1.756575  0.689691  1.314607 -1.751664
2000-01-04 -1.299162  0.163075 -0.440248 -0.908350
2000-01-05  0.910836  0.107179 -0.372767 -1.098606


In [121]:
print df.pivot(index='date', columns='item', values='status')

item               A         B         C         D
date                                              
2000-01-03 -1.756575  0.689691  1.314607 -1.751664
2000-01-04 -1.299162  0.163075 -0.440248 -0.908350
2000-01-05  0.910836  0.107179 -0.372767 -1.098606


In [None]:
%timeit df.set_index(['date', 'item']).unstack()

In [None]:
%timeit df.pivot(index='date', columns='item', values='status')

> Note: Pivot is just a convenient wrapper that replaces the need to create a hierarchical index using `set_index` and then reshape with `unstack`

In [None]:
df_p = pd.concat([df, df.assign(status = lambda x: x['status'] * 2)])
print df_p

In [None]:
print pd.pivot_table(data=df_p, 
               index='date', 
               columns='item', 
               values='status', 
               aggfunc=np.sum)

-----------------------------------------------------------------------------------------------------------------------------

# 7.6 $Transforming$ Data

## A. Removing Duplicates

* `df.duplicated()` Returns boolean Series denoting duplicate rows, optionally only considering certain columns
* `df.drop_duplicates()` Returns DataFrame with duplicate rows removed, optionally only considering certain columns


In [None]:
df = DataFrame({'C1': list('ABC' * 2),
          'C2': [1, 2, 4, 3, 2, 4]})
print df

In [None]:
print df.assign(Dups = df.duplicated())
# Creates a boolean series to indicate which rows have dups

In [None]:
print df[df.duplicated()]
# Retain the rows that are duplicates

In [None]:
# Keep only the rows that are NOT duplicates of an earlier row.
# `~` is the idiomatic element-wise NOT for a boolean mask (NumPy rejects
# unary `-` on boolean arrays).
print(df[~df.duplicated()])

In [None]:
print df.drop_duplicates()
# retain the first occurrence of each row (drop dups)

In [None]:
print df.drop_duplicates(keep='last')
# retain the last occurrence of each row (drop dups)

In [None]:
print df.drop_duplicates(keep=False)

### To find number of duplicated rows

In [None]:
titanic.duplicated().value_counts()

In [None]:
df.duplicated().value_counts()

In [None]:
titanic[['Sex', 'Pclass', 'Embarked']].duplicated().value_counts()

> By default, these methods consider all of the columns. To specify a subset for detecting duplicates, use **`df.drop_duplicates(['list-of-columns'])`**

-----------------------------------------------------------------------------------------------------------------------------

## B. Replacing Values in a Categorical Column

To add a column based on the transformed values of an existing column (using a lookup table), call the Series `.map()` method, which accepts a `dict` or a `function` and applies it to each value.

> Note: `.map()` is a convenient way to perform element-wise transformations and data cleaning operations.

In [None]:
df = DataFrame({'key': list('ABC' * 4),
               'val': np.random.randint(30, 80, 12)}); print df

In [None]:
# Grade code -> descriptive label; the cells that follow use it to recode the 'key' column
lookup = dict(zip('ABC', ['Excellent', 'Satisfactory', 'Improve']))

lookup

In [None]:
# Chain single replacements
df['key'].replace('A', 'Excellent').replace('B', 'Satisfactory').replace('C', 'Improve')

In [None]:
df['key'].replace(lookup)

In [None]:
df['key'].map(lookup)

In [None]:
df['grade'] = df['key'].map(lookup)
df

In [None]:
df['grade_2'] = df['key'].replace(lookup)

In [None]:
df

In [None]:
%timeit df.key.replace(lookup)

In [None]:
%timeit df.key.map(lookup)

## C. Replacing Values

To substitute certain values in a Series by a target-value, we can use the `.replace()` method,
specifying the find/replace (target/replacement) values as a list or a dict.

In [None]:
s = Series(list('abc' * 3))

s[3] = None

**Syntax: `my_series.replace(target-value, replace_by_this)`**


In [None]:
s

In [None]:
s.replace('a', 'AA')

In [None]:
s[::2] = np.nan

In [None]:
s

In [None]:
s.replace(np.nan, 'nan')

In [None]:
s.fillna('nan')

---

In [None]:
df_r = DataFrame(np.random.randint(0, 5, 25).reshape(5, 5), columns=list('ABCDE'))

In [None]:
df_r

In [None]:
df_r.replace([0, 1, 2, 3, 4], ['zer','one', 'two', 'thr', 'four'])

In [None]:
lookup = {0: 'zer', 
          1: 'one', 
          2: 'two', 
          3: 'thr', 
          4: 'fou'}

df_r.applymap(lambda x: lookup.get(x))

In [None]:
%timeit df_r.replace([0, 1, 2, 3, 4], ['zer','one', 'two', 'thr', 'four'])

In [None]:
%timeit df_r.applymap(lambda x: lookup[x])

-----------------------------------------------------------------------------------------------------------------------------

In [None]:
pd.cut?

## D. Binning Numeric Variables to Categoricals

The `pd.cut()` and `pd.qcut()` functions are used; they take as arguments the following;

* `var`, the continuous variable to discretize
* `bins`, specified as a number (equal sized bins will be computed based on min/max) or a list of bin edges
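* `right=True`, a boolean indicating whether each bin includes its right edge (`True`, the default) or its left edge (`False`)
* `labels=`, for naming the bins
* `precision=`, the number of decimal places used when storing and displaying the bin labels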

In [None]:
# Create an array of 500 random integers drawn from [1, 100) (the upper bound is exclusive)
var = np.random.randint(1, 100, 500)
# Peek at the first ten values only
print var[:10]

In [None]:
# Pair the first ten values with their bin (10 equal-width bins).
# `list(...)` materialises the pairs so they display on Python 3 as well.
list(zip(Series(var)[:10], pd.cut(Series(var), 10)[:10]))

In [None]:
# Pair the first ten values with their bin, using explicit edges 0, 20, ..., 100.
# `list(...)` materialises the pairs so they display on Python 3 as well.
list(zip(Series(var)[:10], pd.cut(Series(var), bins=range(0, 101, 20))[:10]))

In [None]:
# Automatic Binning
pd.cut(var, bins=5)

In [None]:
pd.cut(var, bins=5).value_counts()

In [None]:
type(pd.cut(var, bins=5))

In [None]:
# Pair each value with its labelled bin (right=False makes the bins closed on
# the left) and preview the first five pairs. `zip` must be wrapped in
# `list(...)` before slicing because it is not subscriptable on Python 3.
list(zip(var, pd.cut(var,
                     bins=range(0, 101, 20),
                     right=False,
                     labels=['Bin_' + str(x) for x in range(5)])))[:5]

In [None]:
pd.concat([Series(var[:10], name='Values'), 
           Series(pd.cut(var, bins=[0, 33,  66, 100], 
                         labels=['0-33', '34-66', '67-100'])[:10], name='Bins')], axis=1)

In [None]:
pd.cut(var, 3, labels=['one', 'two', 'three'], retbins=True)

In [None]:
pd.cut(var, [0, 25, 50, 75, 100]).value_counts()

In [None]:
pd.cut(var, [0, 25, 50, 75, 100]).value_counts().plot.barh(figsize=(4, 2));

## Cutting variables drawn from a known distribution

In [None]:
Series(np.random.exponential(0.5, 10000)).plot.hist(bins=30, figsize=(3 ,3));

In [None]:
pd.cut(np.random.exponential(0.5, 10000), 15, right=False).value_counts().plot.bar(figsize=(5, 3));

We find that `.cut(data, bins)` automatically bins values by splitting the range into equal-sized bins.

As a result, the distribution is not uniform.

This is where `qcut` comes in.

#### E. Binning into quantiles

In [None]:
pd.qcut(np.random.normal(1, 1, 100000), 10).value_counts().plot.bar(figsize=(3, 3));

---
<big>

- `pd.cut` -> same distributions as the underlying data
- `pd.qcut` -> uniform distribution

-----------------------------------------------------------------------------------------------------------------------------

In [None]:
# Show the documentation for DataFrame.sample; calling the method on the class
# itself (without an instance) raises TypeError.
help(DataFrame.sample)

In [None]:
# Show the documentation for Series.sample; calling the method on the class
# itself (without an instance) raises TypeError.
help(Series.sample)

## F. Random Sampling

We can use the `np.random.permutation` function (passing nrows as an argument) for randomly reordering a Series.

To select a random sample, create an index and subset the DF using it.
* **Without replacement**: slice off the first _k_ rows; where _k_ is the size of the subset you desire
* **With replacement**: use `np.random.randint(start, stop, size=)` to draw integers at random

In [122]:
df = DataFrame(np.random.randn(1000, 5), columns=list('ABCDE')).round(2)
df[:10]

Unnamed: 0,A,B,C,D,E
0,0.08,-2.21,-0.51,-1.54,-0.54
1,-1.39,-0.51,-0.62,0.11,-0.68
2,-0.23,-0.55,-0.42,0.16,0.41
3,0.28,1.27,1.59,0.56,-0.04
4,2.38,-1.76,-1.17,-0.17,-1.2
5,-0.73,0.25,1.89,0.7,1.79
6,0.83,-1.4,-0.1,-1.08,0.46
7,-0.56,-0.24,0.88,-0.95,0.06
8,0.26,2.29,-0.37,0.14,-0.13
9,0.14,0.72,-0.85,0.5,1.5


In [123]:
df.shape

(1000, 5)

In [139]:
s_1k = Series(np.random.permutation(1000))
df.iloc[s_1k[:10], :]

Unnamed: 0,A,B,C,D,E
81,0.01,-0.19,-1.0,0.09,-0.01
971,-0.11,-0.56,0.68,-0.23,0.41
614,1.57,-0.36,0.15,0.58,0.32
130,0.59,1.35,1.67,0.57,1.17
505,0.14,0.53,-0.11,-1.23,-0.54
803,-1.48,-0.35,0.48,0.98,0.55
384,-0.21,0.39,-0.04,-1.97,0.07
956,0.18,0.6,-0.19,-0.85,-0.06
860,-2.04,0.32,0.42,1.61,-0.69
724,-0.57,-0.47,0.54,0.63,-0.11


In [146]:
df.iloc[s_1k[:int(len(df) * 0.01)], :]

Unnamed: 0,A,B,C,D,E
81,0.01,-0.19,-1.0,0.09,-0.01
971,-0.11,-0.56,0.68,-0.23,0.41
614,1.57,-0.36,0.15,0.58,0.32
130,0.59,1.35,1.67,0.57,1.17
505,0.14,0.53,-0.11,-1.23,-0.54
803,-1.48,-0.35,0.48,0.98,0.55
384,-0.21,0.39,-0.04,-1.97,0.07
956,0.18,0.6,-0.19,-0.85,-0.06
860,-2.04,0.32,0.42,1.61,-0.69
724,-0.57,-0.47,0.54,0.63,-0.11


## -  Without Replacement using `permutation`

In [None]:
# Create a randomized index equal to the length of the DF
sample = np.random.permutation(len(df))

# Subset it to retain only the desired number of cases (70% of the rows).
# `np.around` returns a float, which is not a valid slice bound, so the
# rounded value is converted to int before slicing.
train = sample[:int(np.around(len(df) * 0.7))]

In [None]:
len(train)

In [None]:
# Index the DF using the shuffled row labels, then report the subset size
# and preview the first rows.
df_TRAIN = df.loc[train]
print(len(train))
print(df_TRAIN.head())


## - With Replacement using `randint`

In [None]:
Series(np.random.randint(1, 1000, 500)).value_counts()[:5]

In [None]:
# WITH REPLACEMENT
repl = np.random.randint(0, 1000, 700)
Series(repl).value_counts()[:10]

In [None]:
# sample with duplicate rows
# df.ix[repl]

In [None]:
# `repl` holds row positions drawn with replacement, so select by position
# with `.iloc` (the deprecated `.ix` indexer is gone from current pandas),
# then count how many sampled rows are repeats.
df.iloc[repl].duplicated().value_counts()

## Sampling using `.sample()` method

In [161]:
# WIthout replacement
df.sample(n=700, replace=False).duplicated().value_counts()

False    700
dtype: int64

In [164]:
# WIth replacement
df.sample(frac=0.7, replace=True).duplicated().value_counts()

False    495
True     205
dtype: int64

## G. Create Dummies for a Categorical Variable
Create a (n x k) matrix of binary variables from a categorical variable of length n with k levels.

`pd.get_dummies(var)` does this.

In [None]:
# Small demo frame: a categorical `key` column and a random numeric `val` column
df_G = DataFrame({'key': list('bbacccb'),
                  'val': np.random.randn(7)})
print(df_G)

In [None]:
print (DataFrame({'key': df_G['key']}).assign(dummy_a = lambda x: [int(i=='a') for i in x['key']],
                                              dummy_b = lambda x: [int(i=='b') for i in x['key']],

                                              dummy_c = lambda x: [int(i=='c') for i in x['key']]))

In [None]:
# automatic
# one categorical column -> dataframe of dummies
# (`dummy_c` is dropped afterwards, leaving one column fewer than there are levels)

print(pd.get_dummies(df_G['key'], prefix='dummy').drop('dummy_c', axis=1))

In [None]:
int(True), int(False)

In [None]:
df_G

In [None]:
# Create and merge dummies in the same DF
(df_G
 .join(pd.get_dummies(df_G['key'], prefix='dummy'))
 .drop(['key', 'dummy_c'], axis=1))

In [None]:
# Create a categorical variable from a numeric and then compute dummies
df_G.val = np.random.rand(7)
df_G

In [None]:
# Pair each value with its bin label (3 equal-width bins named X, Y, Z).
# `list(...)` materialises the pairs so they display on Python 3 as well.
list(zip(df_G['val'], pd.cut(df_G['val'], 3, labels=list('XYZ'))))

In [None]:
pd.get_dummies(pd.cut(df_G['val'], 3, labels=list('XYZ')), prefix='dummy')

In [None]:
len(np.random.randint(1, 100, 10**4))
len(np.random.randint(10**3, 10**5, 10**4))

---
<big> Task 1

- Create a dataframe with 2 variables called 'Age' and 'Income'. 
- Fill 'Age' with random integers between 1 and 100, and 'Income' with random integers between 10k and 100k. Use 10k rows.
- Use cut to bin Age into 5 bins.
- Use qcut to bin Income in to 10 bins. 
- Assign meaningful labels to each.
- Convert both these cut variables into Dummies.
- Report the mean and sum of each dummy variable.

---

In [None]:
# Task 1 data: 10k rows, Age drawn from [1, 100) and Income from [10000, 100000)
df_x = DataFrame({'Age': np.random.randint(1, 100, 10**4),
                  'Income': np.random.randint(10000, 100000, 10**4)})

In [None]:
# Bin Age into 5 equal-width groups with `cut`, and Income into 10
# equal-frequency groups with `qcut`, as the task statement asks.
df_x = df_x.assign(Age_binned = lambda x: pd.cut(x['Age'], 5, labels=['AgeGrp_' + str(i + 1) for i in range(5)]),
                   Inc_binned = lambda x: pd.qcut(x['Income'], 10, labels=['IncGrp_' + str(i + 1) for i in range(10)])
                  )

In [None]:
DataFrame({'Age_Sum': pd.get_dummies(df_x['Age_binned']).sum(), 
 'Age_Mean': pd.get_dummies(df_x['Age_binned']).mean()})

In [None]:
DataFrame({'Inc_Sum': pd.get_dummies(df_x['Inc_binned']).sum(), 
 'Inc_Mean': pd.get_dummies(df_x['Inc_binned']).mean()})

### NUMERIC to CATEGORICAL (via binning) to DUMMIES (via dummification)

In [None]:
df_G.join(pd.get_dummies(pd.cut(df_G['val'], 3, labels=list('XYZ')), 
                         prefix='dummy')).drop('key', axis=1)

Read: **DUMMY VARIABLE TRAP**

-----------------------------------------------------------------------------------------------------------------------------

## H. String Methods

These include methods applied to string objects that 
* split a string by given delimiter - `.split()`
* trim whitespace - `.strip()`
* concatenate strings - `.join()`
* detect substrings - `.find()` and `.index()`
* count occurrences - `.count()`
* find and replace - `.replace()`


In [None]:
s = 'ready, set ,   go '

In [None]:
# Trimming whitespace
[x.strip() for x in s.split(',')]

# Also see rstrip, lstrip

In [None]:
# Split on commas, trim each piece, then re-join with single spaces
' '.join([x.strip() for x in s.split(',')])

In [None]:
'_#_'.join(list('abcde'))

In [None]:
# Concatenating Strings: the separator string is the object `.join()` is called on
pieces = list('abcde')
print('::'.join(pieces))
print('--'.join(pieces))
print(' '.join(pieces))

In [None]:
# Does a Substring belong to a string
print('steady' in s)
print('set' in s)

In [None]:
# Locate a substring
s.index('go')

In [None]:
s

In [None]:
s[15:17]

In [None]:
sentence = 'the sun rises in the east'

In [None]:
sentence.index('east') == sentence.find('east')

In [None]:
# `.index()` raises ValueError when the substring is absent, so catch the
# error and show it instead of leaving a cell that aborts a full run.
try:
    print(sentence.index('west'))
except ValueError as err:
    print('ValueError: {}'.format(err))

In [None]:
# `.find()` signals a missing substring with -1 rather than raising
print(sentence.find('west'))

In [None]:
sentence[21:]

In [None]:
sentence.find('ris')

In [None]:
sentence.count('t')

In [None]:
# Locate a substring
s.find(',')

In [None]:
# Count occurrences
s.count(',')

In [None]:
sentence.endswith('east')

In [None]:
s2 = 'the quick brown fox jumps over the lazy dog'

# Index of the first character of 'fox' (printed; a bare expression in the
# middle of a cell would be silently discarded)
print(s2.find('fox'))

print('lazy' in s2)

print(s2.endswith('dog'))

In [None]:
s.startswith('ready')
# similarly .endswith()

<big>

These string functions become very important in conjunction with the `map()` method when we're trying to clean text data.


-----------------------------------------------------------------------------------------------------------------------------

### I. Regular Expressions

https://docs.python.org/2/library/re.html

A Regex is a sequence of characters that define a search pattern used in find-and-replace actions.

Example: The regex
* `\s+` describes one or more whitespaces
* `(?<=\.) {2,}(?=[A-Z])` matches at least two spaces occurring after period (.) and before an upper case letter

Note:
* Before a regex is applied to a string, it is usually _compiled_ to create a reusable regex object (the module-level `re` functions compile the pattern implicitly).
* The object's methods can then be called on a string.
* These include: 
    * **`split`**, 
    * **`findall`** (returns all matches), 
    * **`match`** (checks only the beginning of the string), 
    * **`search`** (returns the first occurrence)
    * **`sub`** (returns a new string with occurrences of the pattern replaced with the supplied string)

Syntax:
1. `import re`
2. `r_obj = re.compile('my-regex')`
3. `r_obj.method(my-text)`


---
# Ch. 8 Plotting and Visualization

---

## 1. matplotlib basics


http://matplotlib.org/

* Run **`import matplotlib.pyplot as plt`**
* Create a figure object using **`plt.figure`**
* Add subplots to it using **`add_subplot`**
    * This creates **AxesSubplot** objects on which you can place plots
* Use a plotting command like **`plt.plot`** and matplotlib will place your plot on this canvas


### 1.1 Figure, Subplots, AxesSubplot objects and your plot

#### Create a 2x2 figure and add three plots to it


In [None]:
import matplotlib.pyplot as plt
# convention

%pylab inline
# brings the plot to jupyter from the console

In [None]:
plt.figure?

In [None]:
# Create an empty figure
fig = plt.figure(figsize=(6, 4))

In [None]:
# Run plt.figure? to check out figure options like size, dpi, color

axsp1 = fig.add_subplot(2, 2, 1)
# There will be 2 x 2 subplots on the figure and axsp1 will put your plot on subplot 1

axsp2 = fig.add_subplot(2, 2, 2)
axsp3 = fig.add_subplot(2, 2, 3)
# Now, we have three AxesSubplot objects on our figure. 

In [None]:
# First plot: timeseries
axsp1.plot(np.random.randn(40).cumsum(), 'r--')

# Second plot: histogram
axsp2.hist(np.random.randn(400), bins=10, color='b', alpha=0.3)

# Third plot: scatterplot
axsp3.scatter(np.arange(30), 4 * np.arange(30) + 6 * np.random.randn(30))
# Note: if you make changes to the AxesSubplot object, you'll have to re-run the commands above

In [None]:
fig

------------------------------------------------------------------------------------------------------------------------

### 1.2 Shorthand to achieve the same effect

* Create a grid figure using **`plt.subplots`**
    * Syntax: `fig, axes = plt.subplots(rows, cols, figsize = (width, height), sharex=False, sharey=False)`
    
* It returns an array of **AxesSubplot** objects 
* Reference them using basic indexing (Saves typing!)

`plt.subplots` has some interesting options such as `sharex/sharey` which are useful when comparing data on the same scale

Run `plt.subplots?` for more.

In [None]:
# 3 x 3 grid of subplots; only two of the panels are drawn on
fig, axes = plt.subplots(3, 3, figsize=(8, 4))

# Centre panel: a random walk
axes[1, 1].plot(np.random.randn(50).cumsum(), 'g-_')

# Bottom-centre panel: log10 curve. x starts at 1 because log10(0) is -inf
# (and triggers a divide-by-zero RuntimeWarning).
x_pos = np.arange(1, 31)
axes[2, 1].scatter(x_pos, np.log10(x_pos))

> NOTE:
`subplots_adjust` is a Figure method that can be used to adjust figure parameters like spacing between subplots

In [None]:
fig1, axes1 = plt.subplots(2, 2, figsize=(12, 4), sharex=True, sharey=True)

for i in range(2):
    for j in range(2):
        axes1[i, j].hist(np.random.randn(500), bins=15, alpha=0.4, color='c')

plt.subplots_adjust(wspace=0.2, hspace=0.2)        
# comment out the plt.subplots_adjust line and re-run. See what happens

------------------------------------------------------------------------------------------------------------------------

### 1.3 Plot Formatting

#### a. Color, Linestyle and Markers

The `plot` function takes `x, y` and optionally an abbreviation to specify `marker, color, and style`

Example: Abbreviations work as `color-marker-style`, so `'g--'` means color = 'green' and linestyle = '--'

In [None]:
plt.plot(np.sin(np.arange(50)), 'b*-');

#### b. Ticks, Labels, Legends

In [None]:
f = plt.figure(figsize=(8, 5))
ax1 = f.add_subplot(1, 1, 1)
ax1.plot(4 + 6 * np.sin(np.arange(50)), 'g*-');

In [None]:
# Ticks
ax1.set_xticks([5, 15, 25, 35, 45])

# Chart title
ax1.set_title('This is a Sine Curve')

# Axis Label
ax1.set_xlabel('X')
ax1.set_ylabel('4 + 6 * sin(X)')

f

In [None]:
# Add more plots. The log curve starts at x = 1 because log(0) is -inf
# (and triggers a divide-by-zero RuntimeWarning).
x_pos = np.arange(1, 50)
ax1.plot(x_pos, np.log(x_pos), 'r', label='log(x)')
ax1.plot(np.sqrt(np.arange(50)), 'b*--', label='sqrt(x)')

# Add a legend
ax1.legend(loc='best')

f

### 1.4 Saving plots to file

**Syntax**: `plt.savefig('file-path.extension', dpi=)`

------------------------------------------------------------------------------------------------------------------------

In [None]:
Series.plot
DataFrame.plot

In [None]:
# Show the documentation of the DataFrame.plot accessor
# (a trailing dot left over from tab-completion is a SyntaxError).
help(DataFrame.plot)

# 2. Plotting in `pandas`

* There are high level plotting methods that take advantage of the fact that data are organized in DataFrames (have index, colnames)
* Both `Series` and `DataFrame` objects have a `pandas.plot` method for making different plot types
* Other parameters that can be passed to `pandas.plot` are:
    * `xticks, xlim, yticks, ylim`
    * `label`
    * `style` (as an abbreviation,) and `alpha`
    * `grid=True`
    * `rot` (rotate tick labels by an angle 0-360)
    * `use_index` (use index for tick labels)
    * `subplots=False`

### 2.1 One variable (plotting a Series)

In [None]:
s = Series(np.random.randn(100).cumsum())
s.name = 'random_time_series'
s.plot();
# Default is a line chart

<big> 

Two ways of specifying the kind of plot to make

- `X.plot(kind=<plottype>)`
- `X.plot.<plottype>`

Where X is a Series or a DataFrame

---

In [None]:
s.plot(legend=True, title='My First Pandas Plot',
       xlim=(0, 100), ylim=(-20, 20), style='g');

s2 = s * 1.3
s2.plot()

In [None]:
# Chart with options
s.plot(grid=True, 
       legend=True,
       label='timeseries',
       title='Random Normal Numbers - Cumulative Series',
       xlim=(0, 100), 
       ylim=(-8, 4),
       xticks=np.arange(0, 100, 10), 
       yticks=np.arange(-10, 10, 2),
       style='r--', 
       alpha=0.9,
       figsize=(7, 3)
      );

In [None]:
# One Variable as a Histogram
Series(np.random.randn(10000)).plot(kind='hist', 
                                    bins=50, 
                                    color='r', 
                                    alpha=0.7, 
                                    title='A histogram');

In [None]:
from scipy.stats import norm

s2 = norm.rvs(size=10000, loc=4, scale=2.5)
s3 = norm.rvs(size=10000, loc=-2, scale=0.5)

In [None]:
Series(s2).plot.hist(bins=50, color='g', alpha=0.8)
Series(s3).plot(kind='hist', bins=50, color='b', alpha=0.2)
plt.savefig('twoHistograms.png')

In [None]:
(Series(np.random.randint(0, 10, 25))
 .value_counts()
 .sort_index()
 .plot.bar(title='Bar Chart with Random Integers',
          grid=False));

## Plotting with Titanic

In [None]:
df = pd.read_csv('train.csv')

In [None]:
df['Embarked'].value_counts().plot.barh(figsize=(3, 3));

In [None]:
df['Age'].plot.hist(bins=20, figsize=(3, 3))

In [None]:
df['Fare'].plot.hist(figsize=(3, 3), bins=20);

------------------------------------------------------------------------------------------------------------------------

### 2.2 Multiple Variables (plotting a DataFrame)

We can choose between plotting
* All Variables on one plot
* Each variable on a separate plot

In addition to the parameters above, `DataFrame.plot` also takes
* `subplots=False` (default is to plot all on the same figure)
* `sharex=False, sharey=False`
* `figsize`
* `title, legend`
* `sort_columns`

### a. Variables on the same plot

In [None]:
df = DataFrame(np.random.randn(5000, 5), 
               index=['Day_' + str(d) for d in range(5000)],
               columns=['APL', 'FBK', 'GOOG', 'MCRS', 'TWTR']).cumsum().round(3); df[:4]

In [None]:
# Default plot
df.plot(figsize=(10, 4));

------------------------------------------------------------------------------------------------------------------------

### b. Each variable on its own plot

In [None]:
df.plot(figsize=(5, 10), subplots=True);

In [None]:
df.plot(figsize=(5, 10), subplots=True, sharey=True);

## Visualizing Rainfall in Himachal

#### Data from $data.gov.in$

In [None]:
df = pd.read_csv('https://data.gov.in/node/87154/datastore/export/csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
(df['STATE/UT'] == 'HIMACHAL').value_counts()

In [None]:
df.loc[(df['STATE/UT'] == 'HIMACHAL'), 'DISTRICT']

In [None]:
(df[df['STATE/UT'] == 'HIMACHAL']
 .set_index('DISTRICT')
 .drop('STATE/UT', axis=1)
 .loc[:, 'JAN':'DEC'] 
 .T
 .plot(subplots=True, sharey=True, figsize=(8, 24)));

In [None]:
sns.heatmap((df[df['STATE/UT'] == 'HIMACHAL']
 .set_index('DISTRICT')
 .drop('STATE/UT', axis=1)
 .loc[:, 'JAN':'DEC'].T).corr())

In [None]:
(df[df['STATE/UT'] == 'HIMACHAL']
 .set_index('DISTRICT')
 .drop('STATE/UT', axis=1)
 .loc[:, 'JUL']).sort_values().plot.barh(figsize=(3, 6));

In [None]:
df2 = (df[df['STATE/UT'] == 'HIMACHAL']
 .set_index('DISTRICT')
 .drop('STATE/UT', axis=1)
 .loc[:, 'JAN':'DEC']).loc[:, 'JUL':'AUG']

In [None]:
x = range(0, df2.shape[0])

In [None]:
df2

In [None]:
df2.plot.scatter(x='JUL', y='AUG', c=x, cmap="Spectral")

### c. Barplots

This is as simple as passing `kind='bar'` or `kind='barh'` (for horiz bars) to the `.plot()` method

#### One Variable (simple barplot)

In [None]:
import matplotlib.pyplot as plt

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(14, 14))
s = Series(np.random.rand(10), index=list('abcdefghij'))

s.plot(kind='bar', 
       ax=axes[0], 
       color='k', 
       alpha=0.6)

s.plot(kind='barh', 
       ax=axes[1], 
       color='k')

s.plot(
    ax=axes[2], 
    color='g')

In [None]:
# 5 x 5 frame of uniform random numbers: rows A-E, columns P-T
df = DataFrame(np.random.rand(5,5), index=list('ABCDE'), columns=list('PQRST'))
print(df)

In [None]:
df.plot(kind='bar', stacked=True, figsize=(10, 8))
plt.savefig('stackedBarcharts.jpeg')

> Note: Functions `value_counts()` and `pd.crosstab()` prove helpful to prepare data for stacked bar charts

------------------------------------------------------------------------------------------------------------------------

### d. Histograms & Density Plots

* _Histograms_: Pass `kind='hist'` to `.plot()` or use the `.hist()` method
* _Density Plots_: Use `kind='kde'`


### Using the `.hist()` method

In [None]:
Series(np.random.randn(1000)).hist(bins=20, alpha=0.4)

### Using the `.plot()` method

In [None]:
# Single-letter colour codes are lowercase ('y' = yellow); the uppercase
# spelling is not accepted by newer matplotlib releases.
Series(np.random.randn(1000)).plot(kind='hist', bins=20, color='y')

### KDE

In [None]:
s = Series(np.random.randn(10000))
s.plot(kind='kde', color='b') 

In [None]:
# A bimodal distribution: pool draws from two normals (mean 0 / sd 1 and mean 9 / sd 2)
s1 = np.random.normal(0, 1, 2000)
s2 = np.random.normal(9, 2, 2000)

v = Series(np.concatenate([s1, s2]))

# Normalised histogram with the kernel density estimate drawn on top.
# Single-letter colour codes are lowercase ('b'); the uppercase spelling is
# not accepted by newer matplotlib releases.
# NOTE(review): `normed=` was superseded by `density=` in newer matplotlib;
# switch the keyword when the environment is upgraded.
v.hist(bins=100, alpha=0.4, color='b', normed=True)
v.plot(kind='kde', style='k--')

------------------------------------------------------------------------------------------------------------------------

In [None]:
Image("https://upload.wikimedia.org/wikipedia/commons/thumb/d/d4/Correlation_examples2.svg/2000px-Correlation_examples2.svg.png")

# e. Scatter Plots

- `.plot(kind='scatter')`
- `.scatter()`

In [None]:
# A: 0..49, B: A plus Gaussian noise, C: sqrt(A) + sin(A)
df = DataFrame({'A': np.arange(50),
                'B': np.arange(50) + np.random.randn(50),
                'C': np.sqrt(np.arange(50)) + np.sin(np.arange(50)) })
print(df[:10])

In [None]:
# Two variable Scatterplot
plt.scatter(df['B'], df['C'])
plt.title('Scatterplot of X and Y')

In [None]:
df.plot(kind='scatter', x='B', y='C', title = 'Scatterplot')

In [None]:
df.plot.scatter(x='B', y='C', title = 'Scatterplot', color='r')

## Scatterplot Matrix

A MOST important visual that allows you to see, for numeric variables:

- The distribution of each (histograms or kde along the diagonal)
- The relationships between variables (as pairwise scatterplots)

In [None]:
df = pd.read_csv('https://data.gov.in/node/87154/datastore/export/csv')

In [None]:
# Pairwise scatterplots of the JAN..JUL columns for the HIMACHAL rows
pd.scatter_matrix(df.loc[df['STATE/UT'] == 'HIMACHAL', 'JAN':'JUL'], alpha=0.5, figsize=(12, 6))
# Call tight_layout through pyplot explicitly instead of relying on the
# names that `%pylab` injects into the global namespace.
plt.tight_layout()

In [None]:
# Scatterplot matrix of the whole frame, with KDE curves on the diagonal
pd.scatter_matrix(df, diagonal='kde', color='k', alpha=0.5, figsize=(12, 6))
# Call tight_layout through pyplot explicitly instead of relying on the
# names that `%pylab` injects into the global namespace.
plt.tight_layout()

------------------------------------------------------------------------------------------------------------------------

# _Split - Apply - Combine_

## Advanced GroupBy

In Data Analysis workflows, operations like `loading, cleaning and merging` are usually followed by `summarizations` using some grouping variable(s). This includes _summary statistics_ over variables or groups within variables, within-group _transformations_ (like variable standardization), computing _pivot-tables_ and group analyses.

* _Split:_
    * A DataFrame can be split up by rows(`axis=0`)/columns(`axis=1`) into **groups**. 
    * We use `DataFrame.groupby()` to create a groupby object
* _Apply:_
    * A function is applied to each group.
* _Combine:_
    * The results of applying functions to groups are put together into an object 
    * data types of returned objects are handled gracefully by pandas
    


In [None]:
df.groupby('STATE/UT').apply(lambda g: g.loc[:, 'JAN':'DEC'].median()).T.head()

In [None]:
df_rainfall = df.copy()

In [None]:
from IPython.display import Image
Image("http://i.imgur.com/yjNkiwL.png")

In [None]:
df = DataFrame({'k1': list('abcd' * 25),
               'k2': list('xy' * 25 + 'yx' * 25),
               'v1': np.random.rand(100),
               'v2': np.random.rand(100)}); df[:15]

#### Syntax

`df.groupby('[<col-name(s)>]').apply(<udfs>) or <existing-function>`

### Grouping by one key 

**Results in a summarized data frame indexed by levels of the key**

In [None]:
# Aggregate the numeric columns only: the remaining key column holds
# strings, which cannot be averaged.
print(df.groupby('k1')[['v1', 'v2']].mean())
print('')
print(df.groupby('k2')[['v1', 'v2']].sum())

### Grouping by two keys

**Results in a summarized data frame with a hierarchical index**

In [None]:
# Mean of v1 and v2 for every (k1, k2) combination
print(df.groupby(['k1', 'k2']).mean())

In [None]:
grpd = df.groupby(['k1', 'k2'])

In [None]:
type(grpd)

In [None]:
# Select one column from the grouped object, then aggregate it
print(grpd['v1'].sum())
print('')
print(grpd['v2'].median())

In [None]:
grpd.agg?

In [None]:
grpd.agg({'v1': 'mean',
          'v2': 'sum'})

In [None]:
# The built-in `.sum()` and `.apply(np.sum)` add the floats through different
# code paths, so compare with a tolerance rather than exact equality.
np.allclose(grpd['v1'].sum(), grpd['v1'].apply(np.sum))

---
# **GroupBy objects**

* `DataFrame.groupby(<key>)` will produce a groupby object
* have a `.size()` method, which returns the count of elements in each group.
* can be subsetted using column names (or arrays of column names) to select variables for aggregation
* have optimized methods for general aggregation operations like - 
    * `count, sum`
    * `mean, median, std, var`
    * `first, last`
    * `min, max`
* methods like `.describe` apply to these objects

**By far, the most important GroupBy methods are `.agg()`, `.transform()`, and `.apply()`**

In [None]:
rain_grpby = df_rainfall.groupby('STATE/UT')

In [None]:
rain_grpby.size().sort_values(ascending=False)[:5]

In [None]:
obj = df.groupby(['k1'])

In [None]:
obj

In [None]:
obj.size()

In [None]:
# Group means of the numeric columns (the string column k2 cannot be averaged)
print(obj[['v1', 'v2']].mean())

### Groupby objects Methods

In [None]:
obj.agg?

---
### Task 1: 

1. Create a 100x4 DataFrame filled with random numbers (from a normal distribution.)
Ensure that there's 2 categorical columns with 5 and 3 categories each.

2. Create the groupby object using both keys and find the mean, max, median for each group.

---

### 9.1 Column-wise aggregations and UDFs

**For simple aggregations (Series or all numeric columns of a DataFrame) we can call methods like `mean` and `sum`**

In [None]:
# Summing a Series
# Syntax: Select a Series - GroupBy - Apply function
df.groupby('k1')['v1'].sum()

In [None]:
# Averaging every numeric Series of a DataFrame
# Syntax: Select DF - Groupby - Apply
print(df.groupby('k2')[['v1', 'v2']].mean())

**or you can pass the name of a function as a string with the `.agg()` method**

In [None]:
df.groupby('k1')['v1'].agg('sum')

In [None]:
# Group means of the numeric columns, with each column renamed to mu_<col>
print(df.groupby('k1')[['v1', 'v2']].agg('mean').add_prefix('mu_'))

In [None]:
# Per-group mean and standard deviation of the numeric columns, side by side.
# The numeric columns are selected explicitly so the string key columns are
# never passed to the aggregation.
pd.concat([df.groupby('k1')[['v1', 'v2']].agg('mean').add_prefix('mu_'),
           df.groupby('k1')[['v1', 'v2']].agg('std').add_prefix('sigma_')], axis=1)

---
### The `.agg()` method

takes as argument the following:
* list of function names to be applied to all selected columns
* tuples of (colname, function) to be applied to all selected columns
* dict of (df.col, function) to be applied to each df.col

**1. Apply >1 functions to selected column(s) by passing names of functions to `agg()`**

In [None]:
# Apply min, mean and max to v1 grouped by k1
df.groupby('k1')['v1'].agg(['min', 'mean', 'max'])

In [None]:
# Apply min and max to all numeric columns of df grouped by k2
df.groupby('k2')[['v1', 'v2']].agg(['min', 'max'])

In [None]:
# Hierarchical index will be created
# We can call .stack on the returned object!

df.groupby('k2')[['v1', 'v2']].agg(['min', 'max']).stack()

### 2. We can supply names for the columns in the aggregated df

to the agg() method, in a list of tuples as `[(colname1, func1), (colname2, func2) ...] `

In [None]:
# Each (name, function) tuple sets the label of the resulting column
print(df.groupby('k1')[['v1', 'v2']].agg([('smallest', 'min'), ('largest', 'max')]))

### 3. We can supply df columns and which funcs to apply to each

to the agg() method in a dictionary

In [None]:
# Apply max and min to v1; and mean and sum to v2; all grouped by k1
print(df.groupby('k1').agg({'v1': ['max', 'min'],
                            'v2': ['mean', 'sum']}))

---
### The `.apply()` method

takes as argument the following:
* a general or user defined function
* any other parameters that the function would take

In [None]:
df.groupby('k1').apply?

In [None]:
def topN(data, col, N):
    """Return the N largest entries of column `col` from `data`.

    The frame is ordered by `col` from largest to smallest and the first N
    values of that column are returned, keeping their original row labels.
    """
    ranked = data.sort_values(by=col, ascending=False)
    return ranked[col].iloc[:N]

In [None]:
df.groupby('k2').apply(topN, col='v1', N=3)

In [None]:
df.groupby('k1').apply(topN, col='v2', N=2)

In [None]:
def analyze(df):
    """Describe the shape of `df` as a Series with entries `nrow` and `ncol`."""
    n_rows = len(df)
    n_cols = len(df.columns)
    return pd.Series({"nrow": n_rows, "ncol": n_cols})

In [None]:
# Rows and Cols per group
print(df.groupby("k1").apply(analyze))

---
## Time Series Functions

In [None]:
dates = pd.date_range('1950-01', '2013-03', freq='M'); dates

In [None]:
# One row of random values per date; the row count is taken from `dates` so
# the two cannot drift apart if the date range is edited.
ts = DataFrame(np.random.randn(len(dates), 4), columns=list('ABCD'), index=dates)

In [None]:
ts.head()

In [None]:
ts['year'] = ts.index.year

In [None]:
ts.head()

In [None]:
# Aggregating data by year (showing the last 20 years)
print(ts.groupby('year').sum().tail(20))

In [None]:
# Visualize Trends over time
ts.drop('year', axis=1).cumsum().plot()

In [None]:
# Subsetting data for a decade
ts['1980':'1990'].drop('year', axis=1).cumsum().plot()

---
### Groupby Practice Tasks - Baseball Data

1. Import the data from this link http://bit.ly/144sh7t (hint: use read_csv) Call it `baseball`
2. Check column types, dataframe shape
3. How many rows have missing data? 
3. Find the proportion of missing values in each column
4. Find
    - The number of rows in every league
    - The count of records per year
    - Average, Median experience of players participating in each year

In [None]:
baseball = pd.read_csv("http://bit.ly/144sh7t")

In [None]:
baseball.columns.values