#### Installing Pandas
pip install pandas 

### Reference Link for pandas
<a href="https://pandas.pydata.org/pandas-docs/stable/reference/index.html">API reference Pandas</a>

# Importing pandas into your application

In [1]:
# import numpy and pandas, and DataFrame / Series
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Set some pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# And some items for matplotlib
#%matplotlib inline 
#import matplotlib.pyplot as plt
#pd.options.display.mpl_style = 'default'

# Primary pandas objects

## The pandas Series

In [2]:
# create a four item Series
s = Series([5, 8, 3, 4])
s

0    5
1    8
2    3
3    4
dtype: int64

In [12]:
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [11]:
s[1:3]

b    8
c    3
dtype: int64

In [10]:
# return a Series with the row with labels 1 and 3
s[[1,3]]

b    8
d    4
dtype: int64

In [9]:
# create a series using an explicit index
s = Series([5, 8, 3, 4], index = ['a', 'b', 'c', 'd'])
s

a    5
b    8
c    3
d    4
dtype: int64

In [17]:
s['a':'d']

a    5
b    8
c    3
d    4
dtype: int64

In [63]:
# look up items the series having index 'a' and 'd'
s[['b', 'd']]

b    8
d    4
dtype: int64

In [19]:
# passing a list of integers to a Series that has
# non-integer index labels will look up based upon
# 0-based index like an array
s[[1, 3]]

b    8
d    4
dtype: int64

In [20]:
# get only the index of the Series
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [36]:
# create a Series who's index is a series of dates
# between the two specified dates (inclusive)
dates1 = pd.date_range('2019-07-01', '2019-07-06')
dates2 = pd.date_range('2019-07-02', '2019-07-07')
dates

# create a Series with values(representing temperatures)
# for each date in the index
temps1 = Series([80, 82, 85, 90, 83, 87], 
                index = dates1)
print(temps1)
print(temps1.index)


2019-07-01    80
2019-07-02    82
2019-07-03    85
2019-07-04    90
2019-07-05    83
2019-07-06    87
Freq: D, dtype: int64
DatetimeIndex(['2019-07-01', '2019-07-02', '2019-07-03', '2019-07-04',
               '2019-07-05', '2019-07-06'],
              dtype='datetime64[ns]', freq='D')


In [37]:
# calculate the mean of the values in the Series
temps1.mean()

84.5

In [38]:
temps1.max()

90

In [29]:
temps1.min()

80

In [30]:
temps1.sum()

507

In [31]:
temps1.cumsum()

2019-07-01     80
2019-07-02    162
2019-07-03    247
2019-07-04    337
2019-07-05    420
2019-07-06    507
Freq: D, dtype: int64

In [42]:
# create a second series of values using the same index
temps2 = Series([70, 75, 69, 83, 79, 77], 
                index = dates2)
print(temps2)
print(temps2.index)

2019-07-02    70
2019-07-03    75
2019-07-04    69
2019-07-05    83
2019-07-06    79
2019-07-07    77
Freq: D, dtype: int64
DatetimeIndex(['2019-07-02', '2019-07-03', '2019-07-04', '2019-07-05',
               '2019-07-06', '2019-07-07'],
              dtype='datetime64[ns]', freq='D')


In [43]:
print(temps1)

2019-07-01    80
2019-07-02    82
2019-07-03    85
2019-07-04    90
2019-07-05    83
2019-07-06    87
Freq: D, dtype: int64


In [44]:
print(temps2)

2019-07-02    70
2019-07-03    75
2019-07-04    69
2019-07-05    83
2019-07-06    79
2019-07-07    77
Freq: D, dtype: int64


In [47]:

# the following aligns the two by their index values
# and calculates the difference at those matching labels
temp_diffs = temps1 - temps2
print(type(temp_diffs))
temp_diffs


<class 'pandas.core.series.Series'>


2019-07-01     NaN
2019-07-02    12.0
2019-07-03    10.0
2019-07-04    21.0
2019-07-05     0.0
2019-07-06     8.0
2019-07-07     NaN
Freq: D, dtype: float64

In [48]:
# lookup a value by date using the index
temp_diffs['2019-07-03']

10.0

In [49]:
# and also possible by integer position as if the 
# series was an array
temp_diffs[3]

21.0

## The pandas DataFrame

In [52]:
# create a DataFrame from the two series objects temp1 and temp2
# and give them column names
temps_df = DataFrame(
            {'Missoula': temps1, 
             'Philadelphia': temps2})
print(type(temps_df))
temps_df

<class 'pandas.core.frame.DataFrame'>


            Missoula  Philadelphia
2019-07-01      80.0           NaN
2019-07-02      82.0          70.0
2019-07-03      85.0          75.0
2019-07-04      90.0          69.0
2019-07-05      83.0          83.0
2019-07-06      87.0          79.0
2019-07-07       NaN          77.0

In [53]:
# get the column with the name Missoula
temps_df['Missoula']

2019-07-01    80.0
2019-07-02    82.0
2019-07-03    85.0
2019-07-04    90.0
2019-07-05    83.0
2019-07-06    87.0
2019-07-07     NaN
Freq: D, Name: Missoula, dtype: float64

In [76]:
# likewise we can get just the Philadelphia column
temps_df['Philadelphia']

2014-07-01     NaN
2014-07-02    70.0
2014-07-03    75.0
2014-07-04    69.0
2014-07-05    83.0
2014-07-06    79.0
2014-07-07    77.0
Freq: D, Name: Philadelphia, dtype: float64

In [77]:
# return both columns in a different order
x = temps_df[['Philadelphia', 'Missoula']]
print(type(x))

<class 'pandas.core.frame.DataFrame'>


In [54]:
x = temps_df['Missoula']
print(type(x))

<class 'pandas.core.series.Series'>


In [56]:
print(temps_df.Missoula)

2019-07-01    80.0
2019-07-02    82.0
2019-07-03    85.0
2019-07-04    90.0
2019-07-05    83.0
2019-07-06    87.0
2019-07-07     NaN
Freq: D, Name: Missoula, dtype: float64


In [68]:
print(temps_df.Missoula['2019-07-03'])
print("###########################################")
print(temps_df.Missoula['2019-07-03':'2019-07-05'])
print("###########################################")
print(temps_df.Missoula.index)
print(type(temps_df.Missoula))
s = temps_df.Missoula
#print(s[['2019-07-03','2019-07-05'], dtype = 'datetime64[ns]'])

SyntaxError: invalid syntax (<ipython-input-68-57f18bae33a5>, line 8)

In [55]:
# retrieve the Missoula column through property syntax
print(type(temps_df.Missoula))
temps_df.Missoula

<class 'pandas.core.series.Series'>


2019-07-01    80.0
2019-07-02    82.0
2019-07-03    85.0
2019-07-04    90.0
2019-07-05    83.0
2019-07-06    87.0
2019-07-07     NaN
Freq: D, Name: Missoula, dtype: float64

In [80]:
# calculate the temperature difference between the two cities
temps_df.Missoula - temps_df.Philadelphia

2014-07-01     NaN
2014-07-02    12.0
2014-07-03    10.0
2014-07-04    21.0
2014-07-05     0.0
2014-07-06     8.0
2014-07-07     NaN
Freq: D, dtype: float64

In [78]:
# add a column to temp_df which contains the difference in temps
temps_df['Difference'] = temp_diffs
temps_df['Difference1'] = temps_df.Missoula - temps_df.Philadelphia
print(temps_df)
print(temps_df.index)

            Missoula  Philadelphia  Difference  Difference1
2019-07-01      80.0           NaN         NaN          NaN
2019-07-02      82.0          70.0        12.0         12.0
2019-07-03      85.0          75.0        10.0         10.0
2019-07-04      90.0          69.0        21.0         21.0
2019-07-05      83.0          83.0         0.0          0.0
2019-07-06      87.0          79.0         8.0          8.0
2019-07-07       NaN          77.0         NaN          NaN
DatetimeIndex(['2019-07-01', '2019-07-02', '2019-07-03', '2019-07-04',
               '2019-07-05', '2019-07-06', '2019-07-07'],
              dtype='datetime64[ns]', freq='D')


In [71]:
# get the columns, which is also an Index object
temps_df.columns

Index(['Missoula', 'Philadelphia', 'Difference', 'Difference1'], dtype='object')

In [73]:
# slice the temp differences column for the rows at 
# location 1 through 4 (as though it is an array)
temps_df.Difference[1:6]

2019-07-02    12.0
2019-07-03    10.0
2019-07-04    21.0
2019-07-05     0.0
2019-07-06     8.0
Freq: D, Name: Difference, dtype: float64

In [74]:
# get the row at array position 1
temps_df.iloc[6]

Missoula         NaN
Philadelphia    77.0
Difference       NaN
Difference1      NaN
Name: 2019-07-07 00:00:00, dtype: float64

In [85]:
# get the row at array position 1
temps_df.iloc[1:6]

            Missoula  Philadelphia  Difference  Difference1
2014-07-02      82.0          70.0        12.0         12.0
2014-07-03      85.0          75.0        10.0         10.0
2014-07-04      90.0          69.0        21.0         21.0
2014-07-05      83.0          83.0         0.0          0.0
2014-07-06      87.0          79.0         8.0          8.0

In [75]:
temps_df.iloc[1:5:2]

            Missoula  Philadelphia  Difference  Difference1
2019-07-02      82.0          70.0        12.0         12.0
2019-07-04      90.0          69.0        21.0         21.0

In [76]:
# the names of the columns have become the index
# they have been 'pivoted'
print(temps_df)
temps_df.ix[1:5:3]

            Missoula  Philadelphia  Difference  Difference1
2019-07-01      80.0           NaN         NaN          NaN
2019-07-02      82.0          70.0        12.0         12.0
2019-07-03      85.0          75.0        10.0         10.0
2019-07-04      90.0          69.0        21.0         21.0
2019-07-05      83.0          83.0         0.0          0.0
2019-07-06      87.0          79.0         8.0          8.0
2019-07-07       NaN          77.0         NaN          NaN


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


            Missoula  Philadelphia  Difference  Difference1
2019-07-02      82.0          70.0        12.0         12.0
2019-07-05      83.0          83.0         0.0          0.0

In [88]:
# retrieve row by index label using .loc
temps_df.loc['2014-07-03']

Missoula        85.0
Philadelphia    75.0
Difference      10.0
Difference1     10.0
Name: 2014-07-03 00:00:00, dtype: float64

In [89]:
temps_df

            Missoula  Philadelphia  Difference  Difference1
2014-07-01      80.0           NaN         NaN          NaN
2014-07-02      82.0          70.0        12.0         12.0
2014-07-03      85.0          75.0        10.0         10.0
2014-07-04      90.0          69.0        21.0         21.0
2014-07-05      83.0          83.0         0.0          0.0
2014-07-06      87.0          79.0         8.0          8.0
2014-07-07       NaN          77.0         NaN          NaN

In [90]:
temps_df.Difference[[1,3,5]]

2014-07-02    12.0
2014-07-04    21.0
2014-07-06     8.0
Name: Difference, dtype: float64

In [91]:
# get the values in the Differences column in tows 1, 3 and 5
# using 0-based location
temps_df.iloc[[1, 3, 5]].Difference

2014-07-02    12.0
2014-07-04    21.0
2014-07-06     8.0
Freq: 2D, Name: Difference, dtype: float64

In [92]:
temps_df>82

            Missoula  Philadelphia  Difference  Difference1
2014-07-01     False         False       False        False
2014-07-02     False         False       False        False
2014-07-03      True         False       False        False
2014-07-04      True         False       False        False
2014-07-05      True          True       False        False
2014-07-06      True         False       False        False
2014-07-07     False         False       False        False

In [93]:
temps_df[temps_df>5]

            Missoula  Philadelphia  Difference  Difference1
2014-07-01      80.0           NaN         NaN          NaN
2014-07-02      82.0          70.0        12.0         12.0
2014-07-03      85.0          75.0        10.0         10.0
2014-07-04      90.0          69.0        21.0         21.0
2014-07-05      83.0          83.0         NaN          NaN
2014-07-06      87.0          79.0         8.0          8.0
2014-07-07       NaN          77.0         NaN          NaN

In [94]:
# which values in the Missoula column are > 82?
temps_df.Missoula > 82

2014-07-01    False
2014-07-02    False
2014-07-03     True
2014-07-04     True
2014-07-05     True
2014-07-06     True
2014-07-07    False
Freq: D, Name: Missoula, dtype: bool

In [95]:
# return the rows where the temps for Missoula > 82
temps_df[temps_df.Missoula > 82]

            Missoula  Philadelphia  Difference  Difference1
2014-07-03      85.0          75.0        10.0         10.0
2014-07-04      90.0          69.0        21.0         21.0
2014-07-05      83.0          83.0         0.0          0.0
2014-07-06      87.0          79.0         8.0          8.0

In [96]:
###### provides default 5 values from top
temps_df.head()

            Missoula  Philadelphia  Difference  Difference1
2014-07-01      80.0           NaN         NaN          NaN
2014-07-02      82.0          70.0        12.0         12.0
2014-07-03      85.0          75.0        10.0         10.0
2014-07-04      90.0          69.0        21.0         21.0
2014-07-05      83.0          83.0         0.0          0.0

In [97]:
###### provides default 6 values from top
temps_df.head(3)

            Missoula  Philadelphia  Difference  Difference1
2014-07-01      80.0           NaN         NaN          NaN
2014-07-02      82.0          70.0        12.0         12.0
2014-07-03      85.0          75.0        10.0         10.0

In [98]:
###### provides default 6 values from top
temps_df.tail(2)

            Missoula  Philadelphia  Difference  Difference1
2014-07-06      87.0          79.0         8.0          8.0
2014-07-07       NaN          77.0         NaN          NaN

# Loading data from files and the web into a DataFrame

In [99]:
# display the contents of test1.csv
# which command to use depends on your OS
!cat data/test1.csv # on non-windows systems
#!type data/test1.csv # on windows systems

'cat' is not recognized as an internal or external command,
operable program or batch file.


###### Accessing Data

In [100]:
# read the contents of the file into a DataFrame
df = pd.read_csv('test1.csv')
df

                  date         0         1         2
0  2000-01-01 00:00:00  1.103763 -1.909979 -0.808956
1  2000-01-02 00:00:00  1.188917  0.581120  0.861597
2  2000-01-03 00:00:00 -0.964200  0.779764  1.829062
3  2000-01-04 00:00:00  0.782130 -1.720670 -1.108242
4  2000-01-05 00:00:00 -1.867017 -0.528368 -2.488309
5  2000-01-06 00:00:00  2.569280 -0.471901 -0.835033
6  2000-01-07 00:00:00 -0.399323 -0.676427 -0.011256
7  2000-01-08 00:00:00  1.642993  1.013420  1.435667
8  2000-01-09 00:00:00  1.147308  2.138000  0.554171
9  2000-01-10 00:00:00  0.933766  1.387155 -0.560143

In [101]:
eval(df.0)

SyntaxError: invalid syntax (<ipython-input-101-03e6522cb641>, line 1)

In [None]:
# the contents of the date column
df[["0","2"]]

In [None]:
# we can get the first value in the date column
df.date[0]

In [None]:
# it is a string
type(df.date[0])

In [None]:
type(df.date)

In [None]:
# read the data and tell pandas the date column should be 
# a date in the resulting DataFrame
df = pd.read_csv('test1.csv', parse_dates=['date'])
df

In [None]:
df.index

In [None]:
# verify the type now is date
# in pandas, this is actually a Timestamp
type(df.date[0])

In [None]:
# unfortunately the index is numeric which makes
# accessing data by date more complicated
df.index

In [None]:
# read in again, now specity the data column as being the 
# index of the resulting DataFrame
df = pd.read_csv('test1.csv', 
                 parse_dates=['date'], 
                 index_col='date')
df

In [None]:
# and the index is now a DatetimeIndex
df.index

In [None]:
# imports for reading data from Yahoo!
from pandas.io.data import DataReader
from datetime import date
from dateutil.relativedelta import relativedelta

# read the last three months of data for GOOG
goog = DataReader("GOOG",  "yahoo", 
                  date.today() + 
                  relativedelta(months=-3))

# the result is a DataFrame
#and this gives us the 5 most recent prices
goog.tail()

In [None]:
# use column 0 as the index
msft = pd.read_csv("msft.csv", index_col=1)
print(msft)

In [None]:
# examine the types of the columns in this DataFrame
msft.dtypes

In [None]:
# specify that the Volume column should be a float64
msft = pd.read_csv("msft.csv", 
                   dtype = { 'Volume' : np.float64})
msft.dtypes


In [None]:
msft = pd.read_csv("msft.csv", dtype = {'High': str ,'Volume' : np.float64})
msft.Low = msft.High.astype(int,errors="raise")# errors ="ignore" or "raise"
print(msft.dtypes)
msft.head(20)

## Specifying column names

In [None]:
# specify a new set of names for the columns
# all lower case, remove space in Adj Close
# also, header=0 skips the header row
df = pd.read_csv("msft.csv", 
                 header=3,
                 names=['open', 'high', 'low','close', 'volume', 'adjclose1'])
df.head(100)

In [None]:
# specify a new set of names for the columns
# all lower case, remove space in Adj Close
# also, header=0 skips the header row
df = pd.read_csv("msft.csv", 
                 header=4)
df.head(100)

### Specifying specific columns to load

In [102]:
# read in data only in the Date and Close columns
# and index by the Date column
df2 = pd.read_csv("msft.csv", 
                  usecols=['Date', 'Close','Low'], 
                  index_col=['Date'])
df2.head()

               Low  Close
Date                     
2014-07-21  a81.81  81.93
2014-07-18   82.52  83.35
2014-07-17   83.33  83.63
2014-07-16   83.66  84.91
2014-07-15   83.20  83.58

## Saving a DataFrame to a CSV

In [103]:
# save df2 to a new csv file
# also specify naming the index as date
df2.to_csv("msft_modified.csv", index_label='date')

In [None]:
# view the start of the file just saved
!head data/msft_modified.csv
#type data/msft_modified.csv # windows

# Working with missing data

## Setup

In [104]:
import pandas as pd
# create a DataFrame with 5 rows and 3 columns
df = pd.DataFrame(np.arange(0, 15).reshape(5, 3), 
               index=['a', 'b', 'c', 'd', 'e'], 
               columns=['c1', 'c2', 'c3'])
df

   c1  c2  c3
a   0   1   2
b   3   4   5
c   6   7   8
d   9  10  11
e  12  13  14

In [105]:
# add some columns and rows to the DataFrame
# column c4 with NaN values
df['c4'] = np.nan
df

   c1  c2  c3  c4
a   0   1   2 NaN
b   3   4   5 NaN
c   6   7   8 NaN
d   9  10  11 NaN
e  12  13  14 NaN

In [106]:
# row 'f' with 15 through 18 
df.loc['f'] = np.arange(15, 19) 
# row 'g' will all NaN
print(df)

   c1  c2  c3    c4
a   0   1   2   NaN
b   3   4   5   NaN
c   6   7   8   NaN
d   9  10  11   NaN
e  12  13  14   NaN
f  15  16  17  18.0


In [107]:
df.loc['g'] = np.nan
# column 'C5' with NaN's
df

     c1    c2    c3    c4
a   0.0   1.0   2.0   NaN
b   3.0   4.0   5.0   NaN
c   6.0   7.0   8.0   NaN
d   9.0  10.0  11.0   NaN
e  12.0  13.0  14.0   NaN
f  15.0  16.0  17.0  18.0
g   NaN   NaN   NaN   NaN

In [111]:
df['c5'] = np.nan
# change value in col 'c4' row 'a'


In [117]:
df['c4']['a'] = 20
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [118]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

## Determining NaN values in Series and DataFrame objects

In [119]:
# which items are NaN?
df.isnull()

      c1     c2     c3     c4    c5
a  False  False  False  False  True
b  False  False  False   True  True
c  False  False  False   True  True
d  False  False  False   True  True
e  False  False  False   True  True
f  False  False  False  False  True
g   True   True   True   True  True

In [120]:
# total count of NaN values
nullsumforcolumns = df.isnull().sum()
nullsumforcolumns

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

In [121]:
nullsumforcolumns.sum()

15

In [122]:
# number of non-NaN values in each column
df.count()

c1    6
c2    6
c3    6
c4    2
c5    0
dtype: int64

In [123]:
# which items are not null?
df.notnull()

      c1     c2     c3     c4     c5
a   True   True   True   True  False
b   True   True   True  False  False
c   True   True   True  False  False
d   True   True   True  False  False
e   True   True   True  False  False
f   True   True   True   True  False
g  False  False  False  False  False

In [124]:
# which items are not null?
df.notnull().sum()

c1    6
c2    6
c3    6
c4    2
c5    0
dtype: int64

In [125]:
# which items are not null?
df.notnull().sum().sum()

20

# Selecting out (dropping) missing data

In [126]:
df.notnull()

      c1     c2     c3     c4     c5
a   True   True   True   True  False
b   True   True   True  False  False
c   True   True   True  False  False
d   True   True   True  False  False
e   True   True   True  False  False
f   True   True   True   True  False
g  False  False  False  False  False

In [129]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [127]:
df[df.notnull()]

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [128]:
df[df.isnull()]

   c1  c2  c3  c4  c5
a NaN NaN NaN NaN NaN
b NaN NaN NaN NaN NaN
c NaN NaN NaN NaN NaN
d NaN NaN NaN NaN NaN
e NaN NaN NaN NaN NaN
f NaN NaN NaN NaN NaN
g NaN NaN NaN NaN NaN

In [131]:
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [132]:
df.c4.notnull()

a     True
b    False
c    False
d    False
e    False
f     True
g    False
Name: c4, dtype: bool

In [138]:
# select the non-NaN items in column c4
df.c4[df.c4.notnull()]

a    20.0
f    18.0
Name: c4, dtype: float64

In [135]:
df[df.c4.notnull()]

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
f  15.0  16.0  17.0  18.0 NaN

In [141]:
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [140]:
# .dropna will also return non NaN values
# this gets all non NaN items in column c4
df.c4.dropna()

a    20.0
f    18.0
Name: c4, dtype: float64

In [142]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [143]:
df['c5']['a'] = 1
df

     c1    c2    c3    c4   c5
a   0.0   1.0   2.0  20.0  1.0
b   3.0   4.0   5.0   NaN  NaN
c   6.0   7.0   8.0   NaN  NaN
d   9.0  10.0  11.0   NaN  NaN
e  12.0  13.0  14.0   NaN  NaN
f  15.0  16.0  17.0  18.0  NaN
g   NaN   NaN   NaN   NaN  NaN

In [144]:
df['c1']['g'] = 1

In [145]:
df

     c1    c2    c3    c4   c5
a   0.0   1.0   2.0  20.0  1.0
b   3.0   4.0   5.0   NaN  NaN
c   6.0   7.0   8.0   NaN  NaN
d   9.0  10.0  11.0   NaN  NaN
e  12.0  13.0  14.0   NaN  NaN
f  15.0  16.0  17.0  18.0  NaN
g   1.0   NaN   NaN   NaN  NaN

In [146]:
df.dropna()

### all the columns should be not null for a row 
## if one colum value is null or Nan for a row, dropna method will drop that row

    c1   c2   c3    c4   c5
a  0.0  1.0  2.0  20.0  1.0

In [147]:
# dropna returns a copy with the values dropped
# the source DataFrame / column is not changed
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [149]:
# using how='all', only rows that have all values
# as NaN will be dropped
a = df.dropna(how = 'any')
a 

    c1   c2   c3    c4   c5
a  0.0  1.0  2.0  20.0  1.0

In [150]:
# using how='all', only rows that have all values
# as NaN will be dropped
a = df.dropna(how = 'all')
a 

     c1    c2    c3    c4   c5
a   0.0   1.0   2.0  20.0  1.0
b   3.0   4.0   5.0   NaN  NaN
c   6.0   7.0   8.0   NaN  NaN
d   9.0  10.0  11.0   NaN  NaN
e  12.0  13.0  14.0   NaN  NaN
f  15.0  16.0  17.0  18.0  NaN
g   1.0   NaN   NaN   NaN  NaN

In [151]:
df['c5'][0] = np.nan

In [152]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   1.0   NaN   NaN   NaN NaN

In [155]:
# flip to drop columns instead of rows
df.dropna(how='all', axis=1) # say goodbye to c5, axis = 1 column

     c1    c2    c3    c4
a   0.0   1.0   2.0  20.0
b   3.0   4.0   5.0   NaN
c   6.0   7.0   8.0   NaN
d   9.0  10.0  11.0   NaN
e  12.0  13.0  14.0   NaN
f  15.0  16.0  17.0  18.0
g   1.0   NaN   NaN   NaN

In [156]:
df.loc["g"] = np.nan

In [157]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [161]:
# flip to drop columns instead of rows
df.dropna(how='all', axis=0) # say goodbye to c5

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN

In [162]:
# make a copy of df
df2 = df.copy()

# replace two NaN cells with values
df2.ix['g'].c1 = 0
df2.ix['g'].c3 = 0

df2

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   0.0   NaN   0.0   NaN NaN

In [163]:
# now drop columns with any NaN values
df2.dropna(how='any', axis=0) 

Empty DataFrame
Columns: [c1, c2, c3, c4, c5]
Index: []

In [164]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [173]:
# only drop columns with at least 5 NaN values
#threshold is minimum not nan values

df.dropna(thresh=6, axis=1)

     c1    c2    c3
a   0.0   1.0   2.0
b   3.0   4.0   5.0
c   6.0   7.0   8.0
d   9.0  10.0  11.0
e  12.0  13.0  14.0
f  15.0  16.0  17.0
g   NaN   NaN   NaN

In [174]:
df.dropna(thresh=4, axis=0)

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
f  15.0  16.0  17.0  18.0 NaN

In [175]:
df.dropna(thresh=2, axis=1)

     c1    c2    c3    c4
a   0.0   1.0   2.0  20.0
b   3.0   4.0   5.0   NaN
c   6.0   7.0   8.0   NaN
d   9.0  10.0  11.0   NaN
e  12.0  13.0  14.0   NaN
f  15.0  16.0  17.0  18.0
g   NaN   NaN   NaN   NaN

In [176]:
# only drop columns with at least 5 NaN values
#threshold is minimum not nan values

df.dropna(thresh=6, axis=0)

Empty DataFrame
Columns: [c1, c2, c3, c4, c5]
Index: []

In [177]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [None]:
# only drop columns with at least 5 NaN values
#threshold is minimum not nan values

df.dropna(thresh=3, axis=0)

In [178]:
# NaN's don't count as an item in calculating
# the means
df.mean()

c1     7.5
c2     8.5
c3     9.5
c4    19.0
c5     NaN
dtype: float64

In [179]:
df.mean().mean()

11.125

In [181]:
# return a new DataFrame with NaN's filled with 0
filled = df.fillna(1.0)
filled

     c1    c2    c3    c4   c5
a   0.0   1.0   2.0  20.0  1.0
b   3.0   4.0   5.0   1.0  1.0
c   6.0   7.0   8.0   1.0  1.0
d   9.0  10.0  11.0   1.0  1.0
e  12.0  13.0  14.0   1.0  1.0
f  15.0  16.0  17.0  18.0  1.0
g   1.0   1.0   1.0   1.0  1.0

In [182]:
# having replaced NaN with 0 can make
# operations such as mean have different results
filled.mean()

c1    6.571429
c2    7.428571
c3    8.285714
c4    6.142857
c5    1.000000
dtype: float64

In [186]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [187]:
# only fills the first two NaN's in each row with 0
fill_2 = df.fillna(1.0, limit=3, axis = 0 )
print(fill_2)

     c1    c2    c3    c4   c5
a   0.0   1.0   2.0  20.0  1.0
b   3.0   4.0   5.0   1.0  1.0
c   6.0   7.0   8.0   1.0  1.0
d   9.0  10.0  11.0   1.0  NaN
e  12.0  13.0  14.0   NaN  NaN
f  15.0  16.0  17.0  18.0  NaN
g   1.0   1.0   1.0   NaN  NaN


In [188]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [191]:
# only fills the first two NaN's in each row with 0
fill_2 = df.fillna(0, limit=3, axis = 1)
print(fill_2)

     c1    c2    c3    c4   c5
a   0.0   1.0   2.0  20.0  0.0
b   3.0   4.0   5.0   0.0  0.0
c   6.0   7.0   8.0   0.0  0.0
d   9.0  10.0  11.0   0.0  NaN
e  12.0  13.0  14.0   NaN  NaN
f  15.0  16.0  17.0  18.0  NaN
g   0.0   0.0   0.0   NaN  NaN


In [192]:
# only fills the first two NaN's in each row with 0
fill_2 = df.fillna("#", limit=5)
print(df)
fill_2


     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN


   c1  c2  c3  c4   c5
a   0   1   2  20    #
b   3   4   5   #    #
c   6   7   8   #    #
d   9  10  11   #    #
e  12  13  14   #    #
f  15  16  17  18  NaN
g   #   #   #   #  NaN

In [193]:
df3 = df2.copy()
df3
df3["c6"] = np.NaN
df3["c7"] = np.NaN
df3["c8"] = np.NaN
df3["c9"] = np.NaN
df3.c1 = np.NaN

In [194]:
df3

   c1    c2    c3    c4  c5  c6  c7  c8  c9
a NaN   1.0   2.0  20.0 NaN NaN NaN NaN NaN
b NaN   4.0   5.0   NaN NaN NaN NaN NaN NaN
c NaN   7.0   8.0   NaN NaN NaN NaN NaN NaN
d NaN  10.0  11.0   NaN NaN NaN NaN NaN NaN
e NaN  13.0  14.0   NaN NaN NaN NaN NaN NaN
f NaN  16.0  17.0  18.0 NaN NaN NaN NaN NaN
g NaN   NaN   0.0   NaN NaN NaN NaN NaN NaN

In [195]:
# only fills the first two NaN's in each row with 0
fill_2 = df3.fillna(0, limit=2)
fill_2

    c1    c2    c3    c4   c5   c6   c7   c8   c9
a  0.0   1.0   2.0  20.0  0.0  0.0  0.0  0.0  0.0
b  0.0   4.0   5.0   0.0  0.0  0.0  0.0  0.0  0.0
c  NaN   7.0   8.0   0.0  NaN  NaN  NaN  NaN  NaN
d  NaN  10.0  11.0   NaN  NaN  NaN  NaN  NaN  NaN
e  NaN  13.0  14.0   NaN  NaN  NaN  NaN  NaN  NaN
f  NaN  16.0  17.0  18.0  NaN  NaN  NaN  NaN  NaN
g  NaN   0.0   0.0   NaN  NaN  NaN  NaN  NaN  NaN

In [196]:
# only fills the first two NaN's in each row with 0
fill_3 = df3.fillna(1, limit=5)
print(fill_3)
fill_3


    c1    c2    c3    c4   c5   c6   c7   c8   c9
a  1.0   1.0   2.0  20.0  1.0  1.0  1.0  1.0  1.0
b  1.0   4.0   5.0   1.0  1.0  1.0  1.0  1.0  1.0
c  1.0   7.0   8.0   1.0  1.0  1.0  1.0  1.0  1.0
d  1.0  10.0  11.0   1.0  1.0  1.0  1.0  1.0  1.0
e  1.0  13.0  14.0   1.0  1.0  1.0  1.0  1.0  1.0
f  NaN  16.0  17.0  18.0  NaN  NaN  NaN  NaN  NaN
g  NaN   1.0   0.0   1.0  NaN  NaN  NaN  NaN  NaN


    c1    c2    c3    c4   c5   c6   c7   c8   c9
a  1.0   1.0   2.0  20.0  1.0  1.0  1.0  1.0  1.0
b  1.0   4.0   5.0   1.0  1.0  1.0  1.0  1.0  1.0
c  1.0   7.0   8.0   1.0  1.0  1.0  1.0  1.0  1.0
d  1.0  10.0  11.0   1.0  1.0  1.0  1.0  1.0  1.0
e  1.0  13.0  14.0   1.0  1.0  1.0  1.0  1.0  1.0
f  NaN  16.0  17.0  18.0  NaN  NaN  NaN  NaN  NaN
g  NaN   1.0   0.0   1.0  NaN  NaN  NaN  NaN  NaN

In [None]:
df3

## Filling in missing data

In [197]:
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [198]:
# extract the c4 column and fill NaNs forward
df.c4.fillna(method="ffill")

a    20.0
b    20.0
c    20.0
d    20.0
e    20.0
f    18.0
g    18.0
Name: c4, dtype: float64

In [200]:
# perform a backwards fill
df.c4.fillna(method="bfill")

a    20.0
b    18.0
c    18.0
d    18.0
e    18.0
f    18.0
g     NaN
Name: c4, dtype: float64

# Visualizing Data

#### Data Frame Plot
<a href="https://pandas.pydata.org/pandas-docs/stable/reference/frame.html#plotting">Generating Various charts directly from DataFrame</a>

In [201]:
# plot the Adj Close values we just read in
goog.plot(y='Adj Close', figsize=(12,8));
plt.savefig('5128OS_01_02.png', bbox_inches='tight', dpi=300)

NameError: name 'goog' is not defined