In [1]:
import pandas as pd
import numpy as np


# Base frame: the integers 0..29 laid out as 6 rows x 5 columns, one row per
# calendar day starting at `starting_date`.
starting_date = '20160701'
dates_index = pd.date_range(starting_date, periods=6)
sample_numpy_data = np.arange(30).reshape(6, 5)
sample_df = pd.DataFrame(
    data=sample_numpy_data,
    index=dates_index,
    columns=['A', 'B', 'C', 'D', 'E'],
)

# Extend a copy so that sample_df itself keeps only the five numeric columns.
sample_df_2 = sample_df.copy()

# A plain list is assigned positionally, one value per row.
sample_df_2['Fruits'] = [
    'apple', 'orange', 'banana', 'strawberry', 'blueberry', 'pineapple',
]

# A Series is matched to the frame by index label; it carries the same six dates.
sample_series = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range(starting_date, periods=6),
)
sample_df_2['Extra Data'] = 1 + 3 * sample_series

# A NumPy array is assigned positionally: 7, 107, 207, ... one value per row.
second_numpy_array = 7 + 100 * np.arange(len(sample_df_2))
sample_df_2['G'] = second_numpy_array

sample_df_2

Unnamed: 0,A,B,C,D,E,Fruits,Extra Data,G
2016-07-01,0,1,2,3,4,apple,4,7
2016-07-02,5,6,7,8,9,orange,7,107
2016-07-03,10,11,12,13,14,banana,10,207
2016-07-04,15,16,17,18,19,strawberry,13,307
2016-07-05,20,21,22,23,24,blueberry,16,407
2016-07-06,25,26,27,28,29,pineapple,19,507


### Missing Data
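pandas uses `np.nan` to represent missing data. By default, missing values are skipped by reductions such as `sum()` and `mean()`, but they still propagate through element-wise arithmetic (shown at the end of this section).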

documentation: http://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data

##### reindex()
documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html

In [2]:
# Row labels for the example frame: one row per browser.
browser_index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']

# Two measurements per browser; each list lines up positionally with browser_index.
browser_df = pd.DataFrame(
    data={
        'http_status': [200, 200, 404, 404, 301],
        'response_time': [0.04, 0.02, 0.07, 0.08, 1.0],
    },
    index=browser_index,
)
browser_df

Unnamed: 0,http_status,response_time
Firefox,200,0.04
Chrome,200,0.02
Safari,404,0.07
IE10,404,0.08
Konqueror,301,1.0


##### reindex() creates a copy (not a view)

In [3]:
# 'Iceweasel' and 'Comodo Dragon' are not labels of browser_df, so their rows
# come back filled with NaN; browser_df itself is left unchanged.
new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', 'Chrome']
browser_df_2 = browser_df.reindex(index=new_index)
browser_df_2

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
Iceweasel,,
Comodo Dragon,,
IE10,404.0,0.08
Chrome,200.0,0.02


##### drop rows that have missing data
documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html

In [4]:
# Keep only the rows (axis=0) in which no column is missing.
browser_df_3 = browser_df_2.dropna(axis=0, how='any')
browser_df_3

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
IE10,404.0,0.08
Chrome,200.0,0.02


##### fill in missing data
documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html

In [6]:
# Returns a new frame with every NaN replaced by the same value;
# browser_df_2 is not modified because the result is not assigned back.
browser_df_2.fillna(-0.05555)


Unnamed: 0,http_status,response_time
Safari,404.0,0.07
Iceweasel,-0.05555,-0.05555
Comodo Dragon,-0.05555,-0.05555
IE10,404.0,0.08
Chrome,200.0,0.02


##### get boolean mask where values are nan

In [7]:
# Same-shaped boolean frame: True wherever browser_df_2 holds a missing value.
browser_df_2.isnull()

Unnamed: 0,http_status,response_time
Safari,False,False
Iceweasel,True,True
Comodo Dragon,True,True
IE10,False,False
Chrome,False,False


##### NaN propagates during arithmetic operations

In [8]:
# Element-wise multiplication by a scalar; cells that are NaN stay NaN.
browser_df_2.mul(17)

Unnamed: 0,http_status,response_time
Safari,6868.0,1.19
Iceweasel,,
Comodo Dragon,,
IE10,6868.0,1.36
Chrome,3400.0,0.34
