### Rapid Overview
- build intuition about pandas
- details later

documentation: http://pandas.pydata.org/pandas-docs/stable/10min.html

In [50]:
import pandas as pd
import numpy as np

##### Basic series; default integer index
documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

In [51]:
# The NaN entry makes pandas store the whole Series as float64;
# with no index supplied, the labels default to 0..5.
my_series = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
my_series

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

##### datetime index
documentation: http://pandas.pydata.org/pandas-docs/stable/timeseries.html

In [52]:
# Six weekly timestamps anchored on Wednesdays. The start date itself does
# not appear in the result; the first entry is 2018-11-14.
my_dates_index = pd.date_range(
    start='11/13/2018',
    periods=6,
    freq='W-WED',
)
my_dates_index

DatetimeIndex(['2018-11-14', '2018-11-21', '2018-11-28', '2018-12-05',
               '2018-12-12', '2018-12-19'],
              dtype='datetime64[ns]', freq='W-WED')

##### sample NumPy data


In [53]:
# The integers 0..23 laid out row by row as a 6x4 grid. np.arange already
# returns an ndarray, so no extra np.array(...) wrapper (and copy) is needed.
sample_numpy_data = np.arange(24).reshape(6, 4)
sample_numpy_data

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

##### sample data frame, with column headers; uses `my_dates_index` as the row index
documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

In [54]:
# Rows are labelled by my_dates_index; list('ABCD') expands to
# ['A', 'B', 'C', 'D'], giving one label per column.
sample_df = pd.DataFrame(
    sample_numpy_data,
    index=my_dates_index,
    columns=list('ABCD'),
)
sample_df

Unnamed: 0,A,B,C,D
2018-11-14,0,1,2,3
2018-11-21,4,5,6,7
2018-11-28,8,9,10,11
2018-12-05,12,13,14,15
2018-12-12,16,17,18,19
2018-12-19,20,21,22,23


In [75]:
# Select a single column as a Series (bracket form of sample_df.B).
sample_df['B']

2018-11-14     1
2018-11-21     5
2018-11-28     9
2018-12-05    13
2018-12-12    17
2018-12-19    21
Freq: W-WED, Name: B, dtype: int64

##### data frame from a Python dictionary

In [84]:
# Each dictionary key becomes a column. The scalar entries ('float', 'time',
# 'dull') are repeated on every row, as the four-row output below shows.
df_from_dictionary = pd.DataFrame(data={
    'float': 1.,
    'time': pd.Timestamp('20160825'),
    'series': pd.Series(1, index=list(range(4)), dtype='float32'),
    'array': np.array([3] * 4, dtype='int32'),
    'categories': pd.Categorical(["test", "train", "taxes", "tools"]),
    'dull': 'boring data',
})
df_from_dictionary

Unnamed: 0,float,time,series,array,categories,dull
0,1.0,2016-08-25,1.0,3,test,boring data
1,1.0,2016-08-25,1.0,3,train,boring data
2,1.0,2016-08-25,1.0,3,taxes,boring data
3,1.0,2016-08-25,1.0,3,tools,boring data


In [86]:
# List repetition: this is the expression passed to np.array for the
# 'array' column in the dictionary cell above.
[3] * 4

[3, 3, 3, 3]

##### pandas retains data type for each column

In [56]:
# One dtype per column; note the float32 and int32 requested at construction
# are kept rather than widened.
df_from_dictionary.dtypes

float                float64
time          datetime64[ns]
series               float32
array                  int32
categories          category
dull                  object
dtype: object

##### head and tail; default is 5 rows

In [57]:
# No argument given, so head() falls back to its default row count.
sample_df.head()

Unnamed: 0,A,B,C,D
2018-11-14,0,1,2,3
2018-11-21,4,5,6,7
2018-11-28,8,9,10,11
2018-12-05,12,13,14,15
2018-12-12,16,17,18,19


In [58]:
# Ask explicitly for just the last two rows.
sample_df.tail(n=2)

Unnamed: 0,A,B,C,D
2018-12-12,16,17,18,19
2018-12-19,20,21,22,23


##### underlying data: values, index and columns

In [59]:
# The frame's data as a plain NumPy array, without row or column labels.
sample_df.values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [60]:
# Row labels: the DatetimeIndex the frame was constructed with.
sample_df.index

DatetimeIndex(['2018-11-14', '2018-11-21', '2018-11-28', '2018-12-05',
               '2018-12-12', '2018-12-19'],
              dtype='datetime64[ns]', freq='W-WED')

In [61]:
# Column labels, held in an Index object.
sample_df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

##### describe(): a quick statistical summary
- notice: integer data summarized with floating point numbers

In [62]:
# Per-column count, mean, std, min, quartiles and max.
sample_df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,10.0,11.0,12.0,13.0
std,7.48,7.48,7.48,7.48
min,0.0,1.0,2.0,3.0
25%,5.0,6.0,7.0,8.0
50%,10.0,11.0,12.0,13.0
75%,15.0,16.0,17.0,18.0
max,20.0,21.0,22.0,23.0


##### control the display precision of floating point numbers (stored values are unchanged)
for options and settings, please see: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html

In [63]:
# Attribute form of pd.set_option('display.precision', 2): a global display
# setting that stays in effect for the cells that follow.
pd.options.display.precision = 2

In [64]:
# Same summary as before, now rendered under the display.precision setting
# applied in the previous cell.
sample_df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,10.0,11.0,12.0,13.0
std,7.48,7.48,7.48,7.48
min,0.0,1.0,2.0,3.0
25%,5.0,6.0,7.0,8.0
50%,10.0,11.0,12.0,13.0
75%,15.0,16.0,17.0,18.0
max,20.0,21.0,22.0,23.0


##### transpose rows and columns

In [65]:
# Swap rows and columns (long form of the .T accessor).
sample_df.transpose()

Unnamed: 0,2018-11-14 00:00:00,2018-11-21 00:00:00,2018-11-28 00:00:00,2018-12-05 00:00:00,2018-12-12 00:00:00,2018-12-19 00:00:00
A,0,4,8,12,16,20
B,1,5,9,13,17,21
C,2,6,10,14,18,22
D,3,7,11,15,19,23


##### sort by axis labels (axis=1 sorts the column labels)

In [66]:
# Order the column labels in descending order (D, C, B, A); rows keep their order.
sample_df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2018-11-14,3,2,1,0
2018-11-21,7,6,5,4
2018-11-28,11,10,9,8
2018-12-05,15,14,13,12
2018-12-12,19,18,17,16
2018-12-19,23,22,21,20


##### sort by data within a column (our data was already in ascending order, so a descending sort reverses the rows)

In [67]:
# Order the rows by the values in column B, largest first.
sample_df.sort_values('B', ascending=False)

Unnamed: 0,A,B,C,D
2018-12-19,20,21,22,23
2018-12-12,16,17,18,19
2018-12-05,12,13,14,15
2018-11-28,8,9,10,11
2018-11-21,4,5,6,7
2018-11-14,0,1,2,3
