#### Installing Pandas
pip install pandas 

### Reference Link for pandas
<a href="https://pandas.pydata.org/pandas-docs/stable/reference/index.html">API reference Pandas</a>

# Importing pandas into your application

In [3]:
# import numpy and pandas, and DataFrame / Series
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Set some pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# And some items for matplotlib
#%matplotlib inline 
#import matplotlib.pyplot as plt
#pd.options.display.mpl_style = 'default'

# Primary pandas objects

## The pandas Series

In [2]:
# create a four item Series
s = Series([5, 8, 3, 4])
s

0    5
1    8
2    3
3    4
dtype: int64

In [3]:
s.index

RangeIndex(start=0, stop=4, step=1)

In [4]:
s[1:3]

1    8
2    3
dtype: int64

In [5]:
# return a Series with the row with labels 1 and 3
s[[1,3,0]]

1    8
3    4
0    5
dtype: int64

In [7]:
# create a series using an explicit index
s = Series([5, 8, 3, 4], index = ['a', 'b', 'c', 'd'])
s

a    5
b    8
c    3
d    4
dtype: int64

In [8]:
s['a':'d']
#s[0:3]

a    5
b    8
c    3
d    4
dtype: int64

In [9]:
# look up items the series having index 'a' and 'd'
s[['b', 'd']]

b    8
d    4
dtype: int64

In [10]:
# passing a list of integers to a Series that has
# non-integer index labels will look up based upon
# 0-based index like an array
s[[1, 3]]

b    8
d    4
dtype: int64

In [11]:
# get only the index of the Series
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [16]:
# create a Series who's index is a series of dates
# between the two specified dates (inclusive)
dates1 = pd.date_range('2021-03-01', '2021-03-06')
dates2 = pd.date_range('2021-03-02', '2021-03-07')
print(dates1)
print(dates2)

# create a Series with values(representing temperatures)
# for each date in the index
temps1 = Series([80, 82, 85, 90, 83, 87], 
                index = dates1)
print(temps1)
print(temps1.index)


DatetimeIndex(['2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06'],
              dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2021-03-02', '2021-03-03', '2021-03-04', '2021-03-05',
               '2021-03-06', '2021-03-07'],
              dtype='datetime64[ns]', freq='D')
2021-03-01    80
2021-03-02    82
2021-03-03    85
2021-03-04    90
2021-03-05    83
2021-03-06    87
Freq: D, dtype: int64
DatetimeIndex(['2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06'],
              dtype='datetime64[ns]', freq='D')


In [17]:
# calculate the mean of the values in the Series
temps1.mean()

84.5

In [18]:
temps1.max()

90

In [19]:
temps1.min()

80

In [20]:
temps1.sum()

507

In [21]:
temps1.cumsum()

2021-03-01     80
2021-03-02    162
2021-03-03    247
2021-03-04    337
2021-03-05    420
2021-03-06    507
Freq: D, dtype: int64

In [22]:
# create a second series of values using the same index
temps2 = Series([70, 75, 69, 83, 79, 77], 
                index = dates2)
print(temps2)
print(temps2.index)

2021-03-02    70
2021-03-03    75
2021-03-04    69
2021-03-05    83
2021-03-06    79
2021-03-07    77
Freq: D, dtype: int64
DatetimeIndex(['2021-03-02', '2021-03-03', '2021-03-04', '2021-03-05',
               '2021-03-06', '2021-03-07'],
              dtype='datetime64[ns]', freq='D')


In [23]:
print(temps1)

2021-03-01    80
2021-03-02    82
2021-03-03    85
2021-03-04    90
2021-03-05    83
2021-03-06    87
Freq: D, dtype: int64


In [24]:
print(temps2)

2021-03-02    70
2021-03-03    75
2021-03-04    69
2021-03-05    83
2021-03-06    79
2021-03-07    77
Freq: D, dtype: int64


In [29]:

# the following aligns the two by their index values
# and calculates the difference at those matching labels
temp_diffs = temps1 - temps2
print(type(temp_diffs))
temp_diffs
print(temp_diffs, type(temp_diffs))
print("###################################")
print(temp_diffs.index)

<class 'pandas.core.series.Series'>
2021-03-01     NaN
2021-03-02    12.0
2021-03-03    10.0
2021-03-04    21.0
2021-03-05     0.0
2021-03-06     8.0
2021-03-07     NaN
Freq: D, dtype: float64 <class 'pandas.core.series.Series'>
###################################
DatetimeIndex(['2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06', '2021-03-07'],
              dtype='datetime64[ns]', freq='D')


In [35]:
# lookup a value by date using the index
temp_diffs['2021-03-03']

10.0

In [36]:
# and also possible by integer position as if the 
# series was an array
temp_diffs[3]

21.0

## The pandas DataFrame

In [38]:
# create a DataFrame from the two series objects temp1 and temp2
# and give them column names
temps_df = DataFrame(
            {'Missoula': temps1, 
             'Philadelphia': temps2})
print(type(temps_df))
temps_df

<class 'pandas.core.frame.DataFrame'>


            Missoula  Philadelphia
2021-03-01      80.0           NaN
2021-03-02      82.0          70.0
2021-03-03      85.0          75.0
2021-03-04      90.0          69.0
2021-03-05      83.0          83.0
2021-03-06      87.0          79.0
2021-03-07       NaN          77.0

In [48]:
temps_df.describe()

        Missoula  Philadelphia  Difference         sum
count   6.000000      6.000000    5.000000    5.000000
mean   84.500000     75.500000   10.200000  160.600000
std     3.619392      5.357238    7.563068    5.813777
min    80.000000     69.000000    0.000000  152.000000
25%    82.250000     71.250000    8.000000  159.000000
50%    84.000000     76.000000   10.000000  160.000000
75%    86.500000     78.500000   12.000000  166.000000
max    90.000000     83.000000   21.000000  166.000000

In [39]:
# get the column with the name Missoula
temps_df['Missoula']

2021-03-01    80.0
2021-03-02    82.0
2021-03-03    85.0
2021-03-04    90.0
2021-03-05    83.0
2021-03-06    87.0
2021-03-07     NaN
Freq: D, Name: Missoula, dtype: float64

In [37]:
# likewise we can get just the Philadelphia column
print(type(temps_df['Philadelphia']))
temps_df['Philadelphia']

<class 'pandas.core.series.Series'>


2020-03-01     NaN
2020-03-02    70.0
2020-03-03    75.0
2020-03-04    69.0
2020-03-05    83.0
2020-03-06    79.0
2020-03-07    77.0
Freq: D, Name: Philadelphia, dtype: float64

In [40]:
# return both columns in a different order
x = temps_df[['Philadelphia']]
print(type(x))
print(x)

<class 'pandas.core.frame.DataFrame'>
            Philadelphia
2021-03-01           NaN
2021-03-02          70.0
2021-03-03          75.0
2021-03-04          69.0
2021-03-05          83.0
2021-03-06          79.0
2021-03-07          77.0


In [39]:
x = temps_df['Missoula']
print(type(x))
print(x)

<class 'pandas.core.series.Series'>
2020-03-01    80.0
2020-03-02    82.0
2020-03-03    85.0
2020-03-04    90.0
2020-03-05    83.0
2020-03-06    87.0
2020-03-07     NaN
Freq: D, Name: Missoula, dtype: float64


In [40]:
print(temps_df.Missoula)

2020-03-01    80.0
2020-03-02    82.0
2020-03-03    85.0
2020-03-04    90.0
2020-03-05    83.0
2020-03-06    87.0
2020-03-07     NaN
Freq: D, Name: Missoula, dtype: float64


In [42]:
print(temps_df.Missoula['2021-03-03'])
print("###########################################")
print(temps_df.Missoula['2021-03-03':'2021-03-05'])
print("###########################################")
print(temps_df.Missoula.index)
print(type(temps_df.Missoula))
s = temps_df.Missoula
#print(s[['2019-07-03','2019-07-05'], dtype = 'datetime64[ns]'])

85.0
###########################################
2021-03-03    85.0
2021-03-04    90.0
2021-03-05    83.0
Freq: D, Name: Missoula, dtype: float64
###########################################
DatetimeIndex(['2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06', '2021-03-07'],
              dtype='datetime64[ns]', freq='D')
<class 'pandas.core.series.Series'>


In [44]:
# retrieve the Missoula column through property syntax
print(type(temps_df.Missoula))
temps_df.Missoula

<class 'pandas.core.series.Series'>


2021-03-01    80.0
2021-03-02    82.0
2021-03-03    85.0
2021-03-04    90.0
2021-03-05    83.0
2021-03-06    87.0
2021-03-07     NaN
Freq: D, Name: Missoula, dtype: float64

In [43]:
# calculate the temperature difference between the two cities
temps_df.Missoula - temps_df.Philadelphia

2021-03-01     NaN
2021-03-02    12.0
2021-03-03    10.0
2021-03-04    21.0
2021-03-05     0.0
2021-03-06     8.0
2021-03-07     NaN
Freq: D, dtype: float64

In [45]:
# add a column to temp_df which contains the difference in temps
temps_df['Difference'] = temp_diffs
temps_df['sum'] = temps_df.Missoula + temps_df.Philadelphia
print(temps_df)
print(temps_df.index)
temps_df[2:7:2]

            Missoula  Philadelphia  Difference    sum
2021-03-01      80.0           NaN         NaN    NaN
2021-03-02      82.0          70.0        12.0  152.0
2021-03-03      85.0          75.0        10.0  160.0
2021-03-04      90.0          69.0        21.0  159.0
2021-03-05      83.0          83.0         0.0  166.0
2021-03-06      87.0          79.0         8.0  166.0
2021-03-07       NaN          77.0         NaN    NaN
DatetimeIndex(['2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06', '2021-03-07'],
              dtype='datetime64[ns]', freq='D')


            Missoula  Philadelphia  Difference    sum
2021-03-03      85.0          75.0        10.0  160.0
2021-03-05      83.0          83.0         0.0  166.0
2021-03-07       NaN          77.0         NaN    NaN

In [46]:
# get the columns, which is also an Index object
temps_df.columns
#temps_df.Difference1
#temps_df["Difference1"]

Index(['Missoula', 'Philadelphia', 'Difference', 'sum'], dtype='object')

In [57]:
# slice the temp differences column for the rows at 
# location 1 through 4 (as though it is an array)
temps_df.Difference[1:6]

2020-03-02    12.0
2020-03-03    10.0
2020-03-04    21.0
2020-03-05     0.0
2020-03-06     8.0
Freq: D, Name: Difference, dtype: float64

In [67]:
print(temps_df)
print("##############")
# get the row at array position 1
print((temps_df.iloc[6]))
print("##############")
temps_df.iloc[6] #particular row index
print("#######4:8#######")
print(temps_df.iloc[4:8]) # from to To Row
#include step
temps_df.iloc[2:7:2] #from to To with step

            Missoula  Philadelphia  Difference    sum
2020-03-01      80.0           NaN         NaN    NaN
2020-03-02      82.0          70.0        12.0  152.0
2020-03-03      85.0          75.0        10.0  160.0
2020-03-04      90.0          69.0        21.0  159.0
2020-03-05      83.0          83.0         0.0  166.0
2020-03-06      87.0          79.0         8.0  166.0
2020-03-07       NaN          77.0         NaN    NaN
##############
Missoula         NaN
Philadelphia    77.0
Difference       NaN
sum              NaN
Name: 2020-03-07 00:00:00, dtype: float64
##############
#######4:8#######
            Missoula  Philadelphia  Difference    sum
2020-03-05      83.0          83.0         0.0  166.0
2020-03-06      87.0          79.0         8.0  166.0
2020-03-07       NaN          77.0         NaN    NaN


            Missoula  Philadelphia  Difference    sum
2020-03-03      85.0          75.0        10.0  160.0
2020-03-05      83.0          83.0         0.0  166.0
2020-03-07       NaN          77.0         NaN    NaN

In [45]:
# get the row at array position 1
print(type(temps_df.iloc[1:6]))
temps_df.iloc[1:6]

<class 'pandas.core.frame.DataFrame'>


            Missoula  Philadelphia  Difference  Difference1
2019-07-02      82.0          70.0        12.0         12.0
2019-07-03      85.0          75.0        10.0         10.0
2019-07-04      90.0          69.0        21.0         21.0
2019-07-05      83.0          83.0         0.0          0.0
2019-07-06      87.0          79.0         8.0          8.0

In [46]:
temps_df.iloc[1:5:2]

            Missoula  Philadelphia  Difference  Difference1
2019-07-02      82.0          70.0        12.0         12.0
2019-07-04      90.0          69.0        21.0         21.0

In [68]:
temps_df
#temps_df.loc['2019-07-04':'2019-07-07':2]

            Missoula  Philadelphia  Difference    sum
2020-03-01      80.0           NaN         NaN    NaN
2020-03-02      82.0          70.0        12.0  152.0
2020-03-03      85.0          75.0        10.0  160.0
2020-03-04      90.0          69.0        21.0  159.0
2020-03-05      83.0          83.0         0.0  166.0
2020-03-06      87.0          79.0         8.0  166.0
2020-03-07       NaN          77.0         NaN    NaN

In [69]:
# the names of the columns have become the index
# they have been 'pivoted'
print(temps_df)
temps_df.ix[1:5:3]

            Missoula  Philadelphia  Difference    sum
2020-03-01      80.0           NaN         NaN    NaN
2020-03-02      82.0          70.0        12.0  152.0
2020-03-03      85.0          75.0        10.0  160.0
2020-03-04      90.0          69.0        21.0  159.0
2020-03-05      83.0          83.0         0.0  166.0
2020-03-06      87.0          79.0         8.0  166.0
2020-03-07       NaN          77.0         NaN    NaN


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


            Missoula  Philadelphia  Difference    sum
2020-03-02      82.0          70.0        12.0  152.0
2020-03-05      83.0          83.0         0.0  166.0

In [79]:
# retrieve row by index label using .loc
print(temps_df.loc['2020-03-03'])
temps_df.loc['2020-03-03','sum']

Missoula         85.0
Philadelphia     75.0
Difference       10.0
sum             160.0
Name: 2020-03-03 00:00:00, dtype: float64


160.0

In [80]:
temps_df

            Missoula  Philadelphia  Difference    sum
2020-03-01      80.0           NaN         NaN    NaN
2020-03-02      82.0          70.0        12.0  152.0
2020-03-03      85.0          75.0        10.0  160.0
2020-03-04      90.0          69.0        21.0  159.0
2020-03-05      83.0          83.0         0.0  166.0
2020-03-06      87.0          79.0         8.0  166.0
2020-03-07       NaN          77.0         NaN    NaN

In [84]:
temps_df.Difference[[1,3,5]]

2020-03-02    12.0
2020-03-04    21.0
2020-03-06     8.0
Name: Difference, dtype: float64

In [85]:
# get the values in the Differences column in tows 1, 3 and 5
# using 0-based location
temps_df.iloc[[1, 3, 5]].Difference

2020-03-02    12.0
2020-03-04    21.0
2020-03-06     8.0
Freq: 2D, Name: Difference, dtype: float64

In [86]:
temps_df>82

            Missoula  Philadelphia  Difference    sum
2020-03-01     False         False       False  False
2020-03-02     False         False       False   True
2020-03-03      True         False       False   True
2020-03-04      True         False       False   True
2020-03-05      True          True       False   True
2020-03-06      True         False       False   True
2020-03-07     False         False       False  False

In [88]:
temps_df[temps_df>82]

            Missoula  Philadelphia  Difference    sum
2020-03-01       NaN           NaN         NaN    NaN
2020-03-02       NaN           NaN         NaN  152.0
2020-03-03      85.0           NaN         NaN  160.0
2020-03-04      90.0           NaN         NaN  159.0
2020-03-05      83.0          83.0         NaN  166.0
2020-03-06      87.0           NaN         NaN  166.0
2020-03-07       NaN           NaN         NaN    NaN

In [89]:
# which values in the Missoula column are > 82?
temps_df.Missoula > 82

2020-03-01    False
2020-03-02    False
2020-03-03     True
2020-03-04     True
2020-03-05     True
2020-03-06     True
2020-03-07    False
Freq: D, Name: Missoula, dtype: bool

In [90]:
# return the rows where the temps for Missoula > 82
temps_df[temps_df.Missoula > 82]

            Missoula  Philadelphia  Difference    sum
2020-03-03      85.0          75.0        10.0  160.0
2020-03-04      90.0          69.0        21.0  159.0
2020-03-05      83.0          83.0         0.0  166.0
2020-03-06      87.0          79.0         8.0  166.0

In [91]:
###### provides default 5 values from top
temps_df.head()

            Missoula  Philadelphia  Difference    sum
2020-03-01      80.0           NaN         NaN    NaN
2020-03-02      82.0          70.0        12.0  152.0
2020-03-03      85.0          75.0        10.0  160.0
2020-03-04      90.0          69.0        21.0  159.0
2020-03-05      83.0          83.0         0.0  166.0

In [92]:
###### provides default 5 values from top
temps_df.head(2)

            Missoula  Philadelphia  Difference    sum
2020-03-01      80.0           NaN         NaN    NaN
2020-03-02      82.0          70.0        12.0  152.0

In [93]:
###### provides default 5 values from top
temps_df.tail(4)

            Missoula  Philadelphia  Difference    sum
2020-03-04      90.0          69.0        21.0  159.0
2020-03-05      83.0          83.0         0.0  166.0
2020-03-06      87.0          79.0         8.0  166.0
2020-03-07       NaN          77.0         NaN    NaN

# Loading data from files and the web into a DataFrame

In [229]:
# display the contents of test1.csv
# which command to use depends on your OS
!cat data/test1.csv # on non-windows systems
#!type data/test1.csv # on windows systems

'cat' is not recognized as an internal or external command,
operable program or batch file.


###### Accessing Data

### Reading Data From HTML

In [None]:
pandas.read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, thousands=', ', encoding=None, decimal='.', converters=None, na_values=None, keep_default_na=True, displayed_only=True)

In [102]:
import html5lib
url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
#pd.read_html()
dfs = pd.read_html(url)

ImportError: lxml not found, please install it

### Reading Data From CLIPBOARD

In [1]:
x = pd.read_clipboard()
print(x)

NameError: name 'pd' is not defined

In [60]:
# read the contents of the file into a DataFrame
df = pd.read_csv('test1.csv')
df

                  date         0         1         2
0  2000-01-01 00:00:00  1.103763 -1.909979 -0.808956
1  2000-01-02 00:00:00  1.188917  0.581120  0.861597
2  2000-01-03 00:00:00 -0.964200  0.779764  1.829062
3  2000-01-04 00:00:00  0.782130 -1.720670 -1.108242
4  2000-01-05 00:00:00 -1.867017 -0.528368 -2.488309
5  2000-01-06 00:00:00  2.569280 -0.471901 -0.835033
6  2000-01-07 00:00:00 -0.399323 -0.676427 -0.011256
7  2000-01-08 00:00:00  1.642993  1.013420  1.435667
8  2000-01-09 00:00:00  1.147308  2.138000  0.554171
9  2000-01-10 00:00:00  0.933766  1.387155 -0.560143

In [61]:
df.columns

Index(['date', '0', '1', '2'], dtype='object')

In [62]:
# the contents of the date column
df[["date","2"]]

                  date         2
0  2000-01-01 00:00:00 -0.808956
1  2000-01-02 00:00:00  0.861597
2  2000-01-03 00:00:00  1.829062
3  2000-01-04 00:00:00 -1.108242
4  2000-01-05 00:00:00 -2.488309
5  2000-01-06 00:00:00 -0.835033
6  2000-01-07 00:00:00 -0.011256
7  2000-01-08 00:00:00  1.435667
8  2000-01-09 00:00:00  0.554171
9  2000-01-10 00:00:00 -0.560143

In [63]:
# we can get the first value in the date column
print(type(df.date[0]))
df.date[0]

<class 'str'>


'2000-01-01 00:00:00'

In [64]:
# it is a string
type(df.date[0])

str

In [65]:
print(df.date)
type(df.date)


0    2000-01-01 00:00:00
1    2000-01-02 00:00:00
2    2000-01-03 00:00:00
3    2000-01-04 00:00:00
4    2000-01-05 00:00:00
5    2000-01-06 00:00:00
6    2000-01-07 00:00:00
7    2000-01-08 00:00:00
8    2000-01-09 00:00:00
9    2000-01-10 00:00:00
Name: date, dtype: object


pandas.core.series.Series

In [66]:
# read the data and tell pandas the date column should be 
# a date in the resulting DataFrame
df = pd.read_csv('test1.csv', parse_dates=['date'])
print(df.date[0])
type(df.date[0])

2000-01-01 00:00:00


pandas._libs.tslibs.timestamps.Timestamp

In [67]:
df.index

RangeIndex(start=0, stop=10, step=1)

In [68]:
# verify the type now is date
# in pandas, this is actually a Timestamp
type(df.date[0])

pandas._libs.tslibs.timestamps.Timestamp

In [69]:
# unfortunately the index is numeric which makes
# accessing data by date more complicated
df.index

RangeIndex(start=0, stop=10, step=1)

In [19]:
# read in again, now specity the data column as being the 
# index of the resulting DataFrame
df = pd.read_csv('test1.csv', parse_dates=['date'], index_col='date')
print(df)
print(df.index)

                   0         1         2
date                                    
2000-01-01  1.103763 -1.909979 -0.808956
2000-01-02  1.188917  0.581120  0.861597
2000-01-03 -0.964200  0.779764  1.829062
2000-01-04  0.782130 -1.720670 -1.108242
2000-01-05 -1.867017 -0.528368 -2.488309
2000-01-06  2.569280 -0.471901 -0.835033
2000-01-07 -0.399323 -0.676427 -0.011256
2000-01-08  1.642993  1.013420  1.435667
2000-01-09  1.147308  2.138000  0.554171
2000-01-10  0.933766  1.387155 -0.560143
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08',
               '2000-01-09', '2000-01-10'],
              dtype='datetime64[ns]', name='date', freq=None)


In [20]:
# and the index is now a DatetimeIndex
df.index



DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08',
               '2000-01-09', '2000-01-10'],
              dtype='datetime64[ns]', name='date', freq=None)

In [21]:
df.columns

Index(['0', '1', '2'], dtype='object')

In [75]:
# use column 0 as the index
msft = pd.read_csv("msft.csv")
print(msft.head())
msft.describe()

         Date   Open   High    Low  Close   Volume  Adj Close
0  2019-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2019-07-18  83.46  83.40  82.52  83.35  4020800      83.35
2  2019-07-17  84.35  84.63  83.33  83.63  1974000      83.63
3  2019-07-16  83.77  84.91  83.66  84.91  1755600      84.91
4  2019-07-15  84.30  84.38  83.20  83.58  1874700      83.58


              Open         High          Low        Close        Volume  \
count  3767.000000  3767.000000  3767.000000  3767.000000  3.767000e+03   
mean     55.208216    55.811001    54.562599    55.226161  4.936915e+06   
std      17.707426    17.780680    17.669009    17.722486  3.174734e+06   
min      20.690000    20.870000    17.850000    19.950000  0.000000e+00   
25%      42.405000    42.890000    41.850000    42.415000  3.066700e+06   
50%      49.990000    50.600000    49.310000    49.970000  4.482900e+06   
75%      65.995000    66.785000    65.405000    65.965000  6.072700e+06   
max     105.000000   105.740000   103.610000   104.820000  5.086050e+07   

         Adj Close  
count  3767.000000  
mean     33.259557  
std      19.001068  
min      11.070000  
25%      17.810000  
50%      25.020000  
75%      43.720000  
max      87.360000  

In [72]:
msft = pd.read_csv("msft.csv", index_col=2)
print(msft.head())

         Date   Open   High    Low  Close   Volume  Adj Close
0  2019-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2019-07-18  83.46  83.40  82.52  83.35  4020800      83.35
2  2019-07-17  84.35  84.63  83.33  83.63  1974000      83.63
3  2019-07-16  83.77  84.91  83.66  84.91  1755600      84.91
4  2019-07-15  84.30  84.38  83.20  83.58  1874700      83.58
             Date   Open    Low  Close   Volume  Adj Close
High                                                      
83.53  2019-07-21  83.46  81.81  81.93  2359300      81.93
83.40  2019-07-18  83.46  82.52  83.35  4020800      83.35
84.63  2019-07-17  84.35  83.33  83.63  1974000      83.63
84.91  2019-07-16  83.77  83.66  84.91  1755600      84.91
84.38  2019-07-15  84.30  83.20  83.58  1874700      83.58


In [76]:
# examine the types of the columns in this DataFrame
msft.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Volume         int64
Adj Close    float64
dtype: object

In [4]:
# specify that the Volume column should be a float64
msft = pd.read_csv("msft.csv", parse_dates = ['Date'], dtype = { 'Volume' : np.int32, 'Low': str}, index_col = 'Close')
print(msft.dtypes)
print(msft)


Date         datetime64[ns]
Open                float64
High                float64
Low                  object
Volume                int32
Adj Close           float64
dtype: object
            Date   Open   High    Low   Volume  Adj Close
Close                                                    
81.93 2019-07-21  83.46  83.53  81.81  2359300      81.93
83.35 2019-07-18  83.46  83.40  82.52  4020800      83.35
83.63 2019-07-17  84.35  84.63  83.33  1974000      83.63
84.91 2019-07-16  83.77  84.91  83.66  1755600      84.91
83.58 2019-07-15  84.30  84.38  83.20  1874700      83.58
...          ...    ...    ...    ...      ...        ...
50.00 2000-01-07  48.55  50.35  47.80  4621200      19.48
48.03 2000-01-06  46.78  48.35  46.28  3306100      18.72
46.75 2000-01-05  46.94  47.50  45.92  4809900      18.22
47.85 2000-01-04  49.80  49.80  47.72  4489500      18.65
49.75 2000-01-03  52.70  53.20  49.60  3137300      19.39

[3767 rows x 6 columns]


In [15]:
msft = pd.read_csv("msft.csv", dtype = {'High': str ,'Volume' : np.float64})
print(msft.dtypes)
print("################################################")
msft.Open= msft.Open.astype(np.int64,errors="ignore") # errors ="ignore" or "raise"
print(msft.Open.head(10))
print(msft.Open.dtypes)
print("################################################")
print(msft.dtypes)
print("################################################")
print(msft.head(20))

Date          object
Open         float64
High          object
Low          float64
Close        float64
Volume       float64
Adj Close    float64
dtype: object
################################################
0    83
1    83
2    84
3    83
4    84
5    83
6    83
7    85
8    84
9    86
Name: Open, dtype: int64
int64
################################################
Date          object
Open           int64
High          object
Low          float64
Close        float64
Volume       float64
Adj Close    float64
dtype: object
################################################
          Date  Open   High    Low  Close     Volume  Adj Close
0   2019-07-21    83  83.53  81.81  81.93  2359300.0      81.93
1   2019-07-18    83  83.40  82.52  83.35  4020800.0      83.35
2   2019-07-17    84  84.63  83.33  83.63  1974000.0      83.63
3   2019-07-16    83  84.91  83.66  84.91  1755600.0      84.91
4   2019-07-15    84  84.38  83.20  83.58  1874700.0      83.58
5   2019-07-14    83  84.64  83.11  

## Specifying column names

In [16]:
# specify a new set of names for the columns
# all lower case, remove space in Adj Close
# also, header=1 skips the header row
df = pd.read_csv("msft.csv", header=3)
print(df.head(10))
df.columns
#df.columns['2019-07-17']

   2019-07-17  84.35  84.63  83.33  83.63  1974000  83.63.1
0  2019-07-16  83.77  84.91  83.66  84.91  1755600    84.91
1  2019-07-15  84.30  84.38  83.20  83.58  1874700    83.58
2  2019-07-14  83.66  84.64  83.11  84.40  1432100    84.40
3  2019-07-11  83.55  83.98  82.85  83.35  2001400    83.35
4  2019-07-10  85.20  85.57  83.36  83.42  2713300    83.42
5  2019-07-09  84.83  85.79  84.76  85.50  1540700    85.50
6  2019-07-08  86.29  86.57  84.69  84.69  2164000    84.69
7  2019-07-07  86.97  87.13  85.82  86.07  1644600    86.07
8  2019-07-04  87.22  87.54  87.05  87.05  1392600    87.05
9  2019-07-03  86.02  87.42  85.88  87.14  1985800    87.14


Index(['2019-07-17', '84.35', '84.63', '83.33', '83.63', '1974000', '83.63.1'], dtype='object')

In [44]:
df = pd.read_csv("msft.csv",header=3,names=['A', 'B', 'C','D', 'E'])
print(df.head(10))
#print(df.columns)
#print(df.index)
#df.A
#df.A[['2019-07-16','83.77']]
df.A[['2019-07-16']]
#df.A['2019-07-16']


                      A      B      C        D      E
2019-07-16 83.77  84.91  83.66  84.91  1755600  84.91
2019-07-15 84.30  84.38  83.20  83.58  1874700  83.58
2019-07-14 83.66  84.64  83.11  84.40  1432100  84.40
2019-07-11 83.55  83.98  82.85  83.35  2001400  83.35
2019-07-10 85.20  85.57  83.36  83.42  2713300  83.42
2019-07-09 84.83  85.79  84.76  85.50  1540700  85.50
2019-07-08 86.29  86.57  84.69  84.69  2164000  84.69
2019-07-07 86.97  87.13  85.82  86.07  1644600  86.07
2019-07-04 87.22  87.54  87.05  87.05  1392600  87.05
2019-07-03 86.02  87.42  85.88  87.14  1985800  87.14


2019-07-16  83.77    84.91
Name: A, dtype: float64

In [86]:
# specify a new set of names for the columns
# all lower case, remove space in Adj Close
# also, header=0 skips the header row
df = pd.read_csv("msft.csv",header=4)
df.head(100)
df.index

RangeIndex(start=0, stop=3763, step=1)

### Specifying specific columns to load

In [18]:
# read in data only in the Date and Close columns
# and index by the Date column
df2 = pd.read_csv("msft.csv", parse_dates=['Date'],usecols=['Date', 'Close','Low'] , index_col=['Date'])
df2.head(5)

#names=['A', 'B', 'C','D', 'E']

              Low  Close
Date                    
2019-07-21  81.81  81.93
2019-07-18  82.52  83.35
2019-07-17  83.33  83.63
2019-07-16  83.66  84.91
2019-07-15  83.20  83.58

In [4]:
# read in data only in the Date and Close columns
# and index by the Date column
df2 = pd.read_csv("msft.csv", parse_dates=['A'],usecols=['A', 'B','C'] ,header =1, names=['A', 'B', 'C','D', 'E', 'F','G'], index_col=['A'])
df2.head(5)

#names=['A', 'B', 'C','D', 'E']

                B      C
A                       
2019-07-18  83.46  83.40
2019-07-17  84.35  84.63
2019-07-16  83.77  84.91
2019-07-15  84.30  84.38
2019-07-14  83.66  84.64

## Saving a DataFrame to a CSV

In [6]:
# save df2 to a new csv file
# also specify naming the index as date
df2.to_excel("myfile1.xls",  index_label='A')
#df2.to_excel("myfile2.xls", sheet_name='mysheet1', index_label='date')
df2.to_excel("myfile3.xls", sheet_name='mysheet2', index_label='A')

df2.to_csv("msft_modified.csv", index_label='A')

#### same file with different sheets

In [7]:
with pd.ExcelWriter('myfile.xls') as writer:  # doctest: +SKIP
    df2.to_excel(writer, sheet_name='Sheet_name_1')
    df2.to_excel(writer, sheet_name='Sheet_name_2')

In [7]:
# view the start of the file just saved
!head data/msft_modified.csv
#type data/msft_modified.csv # windows

'head' is not recognized as an internal or external command,
operable program or batch file.


# Working with missing data

## Setup

In [10]:
import pandas as pd
import numpy as np
print(np.arange(0, 15).reshape(5, 3))
# create a DataFrame with 5 rows and 3 columns
df = pd.DataFrame(np.arange(0, 15).reshape(5, 3), 
               index=['a', 'b', 'c', 'd', 'e'], 
               columns=['c1', 'c2', 'c3'])
df

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [12 13 14]]


   c1  c2  c3
a   0   1   2
b   3   4   5
c   6   7   8
d   9  10  11
e  12  13  14

In [11]:
# add some columns and rows to the DataFrame
# column c4 with NaN values
df['c4'] = np.nan
df

   c1  c2  c3  c4
a   0   1   2 NaN
b   3   4   5 NaN
c   6   7   8 NaN
d   9  10  11 NaN
e  12  13  14 NaN

In [12]:
# row 'f' with 15 through 18 
df.loc['f'] = np.arange(15, 19) 
# row 'g' will all NaN
print(df)

   c1  c2  c3    c4
a   0   1   2   NaN
b   3   4   5   NaN
c   6   7   8   NaN
d   9  10  11   NaN
e  12  13  14   NaN
f  15  16  17  18.0


In [13]:
df.loc['g'] = np.nan
# column 'C5' with NaN's
df

     c1    c2    c3    c4
a   0.0   1.0   2.0   NaN
b   3.0   4.0   5.0   NaN
c   6.0   7.0   8.0   NaN
d   9.0  10.0  11.0   NaN
e  12.0  13.0  14.0   NaN
f  15.0  16.0  17.0  18.0
g   NaN   NaN   NaN   NaN

In [14]:
df['c5'] = np.nan
# change value in col 'c4' row 'a'
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0   NaN NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [15]:
df['c4']['a'] = 20
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [17]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

## Determining NaN values in Series and DataFrame objects

In [18]:
# which items are NaN?
print(df)
df.isnull()

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN


      c1     c2     c3     c4    c5
a  False  False  False  False  True
b  False  False  False   True  True
c  False  False  False   True  True
d  False  False  False   True  True
e  False  False  False   True  True
f  False  False  False  False  True
g   True   True   True   True  True

In [19]:
# total count of NaN values
nullsumforcolumns = df.isnull().sum()#.sum()
nullsumforcolumns

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

In [20]:
nullsumforcolumns.sum()

15

In [21]:
# number of non-NaN values in each column
df.count()
#df.notnull().sum()


c1    6
c2    6
c3    6
c4    2
c5    0
dtype: int64

In [22]:
# which items are not null?
df.notnull()

      c1     c2     c3     c4     c5
a   True   True   True   True  False
b   True   True   True  False  False
c   True   True   True  False  False
d   True   True   True  False  False
e   True   True   True  False  False
f   True   True   True   True  False
g  False  False  False  False  False

In [24]:
# which items are not null?
df.notnull().sum()

c1    6
c2    6
c3    6
c4    2
c5    0
dtype: int64

In [27]:
# which items are not null?
df.notnull().sum().sum()

20

# Selecting out (dropping) missing data

In [26]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [30]:
df.notnull()

      c1     c2     c3     c4     c5
a   True   True   True   True  False
b   True   True   True  False  False
c   True   True   True  False  False
d   True   True   True  False  False
e   True   True   True  False  False
f   True   True   True   True  False
g  False  False  False  False  False

In [31]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [36]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [38]:
df.c3.notnull()

a     True
b     True
c     True
d     True
e     True
f     True
g    False
Name: c3, dtype: bool

In [40]:
df.c3[df.c3.notnull()]

a     2.0
b     5.0
c     8.0
d    11.0
e    14.0
f    17.0
Name: c3, dtype: float64

In [41]:
df[df.c3.notnull()]

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN

In [85]:
df.c3.isnull()

a    False
b    False
c    False
d    False
e    False
f    False
g     True
Name: c3, dtype: bool

In [86]:
df.c3[df.c3.isnull()]

g   NaN
Name: c3, dtype: float64

In [87]:
df[df.c3.isnull()]

   c1  c2  c3  c4  c5
g NaN NaN NaN NaN NaN

##### expression on dataframe applied to the dataframe itself

In [91]:
df.notnull()

      c1     c2     c3     c4     c5
a   True   True   True   True  False
b   True   True   True  False  False
c   True   True   True  False  False
d   True   True   True  False  False
e   True   True   True  False  False
f   True   True   True   True  False
g  False  False  False  False  False

In [90]:
df[df.notnull()]

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [94]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [95]:
df.isnull()

      c1     c2     c3     c4    c5
a  False  False  False  False  True
b  False  False  False   True  True
c  False  False  False   True  True
d  False  False  False   True  True
e  False  False  False   True  True
f  False  False  False  False  True
g   True   True   True   True  True

In [96]:
df[df.isnull()]

   c1  c2  c3  c4  c5
a NaN NaN NaN NaN NaN
b NaN NaN NaN NaN NaN
c NaN NaN NaN NaN NaN
d NaN NaN NaN NaN NaN
e NaN NaN NaN NaN NaN
f NaN NaN NaN NaN NaN
g NaN NaN NaN NaN NaN

In [97]:
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [98]:
df.c4.notnull()

a     True
b    False
c    False
d    False
e    False
f     True
g    False
Name: c4, dtype: bool

In [99]:
# select the non-NaN items in column c4
df.c4[df.c4.notnull()]

a    20.0
f    18.0
Name: c4, dtype: float64

In [100]:
df[df.c4.notnull()]

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
f  15.0  16.0  17.0  18.0 NaN

In [32]:
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [33]:
# .dropna will also return non NaN values
# this gets all non NaN items in column c4
df.c4.dropna()

a    20.0
f    18.0
Name: c4, dtype: float64

In [103]:
df

     c1    c2    c3    c4   c5
a   0.0   1.0   2.0  20.0  1.0
b   3.0   4.0   5.0   NaN  NaN
c   6.0   7.0   8.0   NaN  NaN
d   9.0  10.0  11.0   NaN  NaN
e  12.0  13.0  14.0   NaN  NaN
f  15.0  16.0  17.0  18.0  NaN
g   NaN   NaN   NaN   NaN  NaN

In [102]:
df['c5']['a'] = 1
df.dropna()

    c1   c2   c3    c4   c5
a  0.0  1.0  2.0  20.0  1.0

In [104]:
df['c1']['g'] = 1

In [105]:
df

     c1    c2    c3    c4   c5
a   0.0   1.0   2.0  20.0  1.0
b   3.0   4.0   5.0   NaN  NaN
c   6.0   7.0   8.0   NaN  NaN
d   9.0  10.0  11.0   NaN  NaN
e  12.0  13.0  14.0   NaN  NaN
f  15.0  16.0  17.0  18.0  NaN
g   1.0   NaN   NaN   NaN  NaN

In [106]:
df['c4']['a'] = 2

In [107]:
df.dropna()
### all the columns should be not null for a row 
## if one colum value is null or Nan for a row, dropna method will drop that row

    c1   c2   c3   c4   c5
a  0.0  1.0  2.0  2.0  1.0

In [108]:
# dropna returns a copy with the values dropped
# the source DataFrame / column is not changed
df.c4

a     2.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [110]:
# using how='all', only rows that have all values
# as NaN will be dropped
a = df.dropna(how = 'any')#default
print(a)
a = df.dropna(how = 'all')
print(a) 


    c1   c2   c3   c4   c5
a  0.0  1.0  2.0  2.0  1.0
     c1    c2    c3    c4   c5
a   0.0   1.0   2.0   2.0  1.0
b   3.0   4.0   5.0   NaN  NaN
c   6.0   7.0   8.0   NaN  NaN
d   9.0  10.0  11.0   NaN  NaN
e  12.0  13.0  14.0   NaN  NaN
f  15.0  16.0  17.0  18.0  NaN
g   1.0   NaN   NaN   NaN  NaN


In [111]:
df['c1']['g'] = np.nan

In [112]:
df

     c1    c2    c3    c4   c5
a   0.0   1.0   2.0   2.0  1.0
b   3.0   4.0   5.0   NaN  NaN
c   6.0   7.0   8.0   NaN  NaN
d   9.0  10.0  11.0   NaN  NaN
e  12.0  13.0  14.0   NaN  NaN
f  15.0  16.0  17.0  18.0  NaN
g   NaN   NaN   NaN   NaN  NaN

In [44]:
# using how='all', only rows that have all values
# as NaN will be dropped
a = df.dropna(how = 'all')
a 

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN

In [42]:
df['c5'][0] = np.nan

In [43]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [46]:
# flip to drop columns instead of rows
df.dropna(how='all', axis=1) # say goodbye to c5, axis = 1 column or "columns",
#axis = 0 for rows or "rows" 
#df.dropna(how='all', axis="rows") 

     c1    c2    c3    c4
a   0.0   1.0   2.0  20.0
b   3.0   4.0   5.0   NaN
c   6.0   7.0   8.0   NaN
d   9.0  10.0  11.0   NaN
e  12.0  13.0  14.0   NaN
f  15.0  16.0  17.0  18.0
g   NaN   NaN   NaN   NaN

In [47]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [48]:
# flip to drop columns instead of rows
df.dropna(how='all', axis=0) # say goodbye to g
#df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN

In [50]:
# make a copy of df
df2 = df.copy()

# replace two NaN cells with values
df2.loc['g'].c1 = 0
df2.loc['g'].c3 = 0

#.ix is deprecated. Please use
#.loc for label based indexing or loc -> label a, b, c, 
#.iloc for positional indexing iloc -> index of labe a -> 0, b -> 1 -- 0,1,2,3...
df2

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   0.0   NaN   0.0   NaN NaN

In [51]:
# now drop columns with any NaN values
df2.dropna(how='any', axis=0) 

Empty DataFrame
Columns: [c1, c2, c3, c4, c5]
Index: []

In [131]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0   2.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [52]:
#threshold is minimum not nan values
df.dropna(thresh=4, axis='rows')#'rows' or 'columns' can be given for axis
#df
#(df.dropna(thresh=4, axis='rows')).iloc[[0]].c4
#(df.dropna(thresh=4, axis='rows')).c4[1]

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
f  15.0  16.0  17.0  18.0 NaN

In [53]:
df.dropna(thresh=4, axis=0)

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
f  15.0  16.0  17.0  18.0 NaN

In [54]:
df.dropna(thresh=2, axis=1)

     c1    c2    c3    c4
a   0.0   1.0   2.0  20.0
b   3.0   4.0   5.0   NaN
c   6.0   7.0   8.0   NaN
d   9.0  10.0  11.0   NaN
e  12.0  13.0  14.0   NaN
f  15.0  16.0  17.0  18.0
g   NaN   NaN   NaN   NaN

In [55]:
# only drop columns with at least 5 NaN values
#threshold is minimum not nan values

df.dropna(thresh=6, axis=1)

     c1    c2    c3
a   0.0   1.0   2.0
b   3.0   4.0   5.0
c   6.0   7.0   8.0
d   9.0  10.0  11.0
e  12.0  13.0  14.0
f  15.0  16.0  17.0
g   NaN   NaN   NaN

In [56]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [57]:
# NaN's don't count as an item in calculating
# the means
df.mean(axis = 0)# 0 for rows, 1 for column
#df.mean(axis = "rows")# 0 for rows, 1 for column

c1     7.5
c2     8.5
c3     9.5
c4    19.0
c5     NaN
dtype: float64

In [59]:
df.mean()

c1     7.5
c2     8.5
c3     9.5
c4    19.0
c5     NaN
dtype: float64

In [60]:
df.mean().mean()

11.125

In [61]:
df.median()

  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


c1     7.5
c2     8.5
c3     9.5
c4    19.0
c5     NaN
dtype: float64

In [62]:
df.median()

c1     7.5
c2     8.5
c3     9.5
c4    19.0
c5     NaN
dtype: float64

In [63]:
df.median().median()

9.0

#### Filling Nan

In [64]:
# return a new DataFrame with NaN's filled with 0
filled = df.fillna(0.0)
filled


     c1    c2    c3    c4   c5
a   0.0   1.0   2.0  20.0  0.0
b   3.0   4.0   5.0   0.0  0.0
c   6.0   7.0   8.0   0.0  0.0
d   9.0  10.0  11.0   0.0  0.0
e  12.0  13.0  14.0   0.0  0.0
f  15.0  16.0  17.0  18.0  0.0
g   0.0   0.0   0.0   0.0  0.0

In [65]:
# having replaced NaN with 0 can make
# operations such as mean have different results
filled.mean(axis="rows")#consider all the rows for each column

c1    6.428571
c2    7.285714
c3    8.142857
c4    5.428571
c5    0.000000
dtype: float64

In [66]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [72]:
#axis not supported -- works only based on column
# only fills the first two NaN's in each row with 0
#print(df)
print("####################################")
fill_2 = df.fillna(1.0, limit=3, axis = 'rows')#0)rows or index 
print(fill_2)

####################################
     c1    c2    c3    c4   c5
a   0.0   1.0   2.0  20.0  1.0
b   3.0   4.0   5.0   1.0  1.0
c   6.0   7.0   8.0   1.0  1.0
d   9.0  10.0  11.0   1.0  NaN
e  12.0  13.0  14.0   NaN  NaN
f  15.0  16.0  17.0  18.0  NaN
g   1.0   1.0   1.0   NaN  NaN


In [68]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [73]:
#DataFrame.replace(self, to_replace=None, value=None, inplace=False, limit=None, regex=False, method='pad')
#df_regex = df.replace("[0-9\.]+",1111.0,limit=3, regex = True)
df3 = df.copy()
df3['c1'] = 0.0
df3['c4']['e'] = 0.0
df3['c4']['c'] = 0.0
df3['c4']['a'] = 0.0
print(df3)

    c1    c2    c3    c4  c5
a  0.0   1.0   2.0   0.0 NaN
b  0.0   4.0   5.0   NaN NaN
c  0.0   7.0   8.0   0.0 NaN
d  0.0  10.0  11.0   NaN NaN
e  0.0  13.0  14.0   0.0 NaN
f  0.0  16.0  17.0  18.0 NaN
g  0.0   NaN   NaN   NaN NaN


In [192]:
df_regex = df3.replace(0.0,11.0, regex = False)#replace works over rows
print(df_regex)
#DataFrame.replace(df2,1.0,10.0)

     c1    c2    c3    c4  c5
a  11.0   1.0   2.0  11.0 NaN
b  11.0   4.0   5.0   NaN NaN
c  11.0   7.0   8.0  11.0 NaN
d  11.0  10.0  11.0   NaN NaN
e  11.0  13.0  14.0  11.0 NaN
f  11.0  16.0  17.0  18.0 NaN
g  11.0   NaN   NaN   NaN NaN


In [77]:
df_new = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
                   'B': ['abc', 'bar', 'xyz']})
df_new.replace(to_replace=r'^ba.*$', value='new', regex=True)


     A    B
0  new  abc
1  foo  new
2  new  xyz

In [78]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [79]:
# only fills the first two NaN's in each row with 0
df.loc['e'] = 1
df['c1'][['c','d','e','f']] = np.nan
df['c6'] = np.nan

df.loc['d'] = np.nan
print(df)


    c1    c2    c3    c4   c5  c6
a  0.0   1.0   2.0  20.0  NaN NaN
b  3.0   4.0   5.0   NaN  NaN NaN
c  NaN   7.0   8.0   NaN  NaN NaN
d  NaN   NaN   NaN   NaN  NaN NaN
e  NaN   1.0   1.0   1.0  1.0 NaN
f  NaN  16.0  17.0  18.0  NaN NaN
g  NaN   NaN   NaN   NaN  NaN NaN


In [80]:
print("#######################################")#axis not influence
fill_2 = df.fillna(10, limit=3, axis = 'columns') # index or 0, # columns or 1
print(fill_2)


#######################################
     c1    c2    c3    c4    c5    c6
a   0.0   1.0   2.0  20.0  10.0  10.0
b   3.0   4.0   5.0  10.0  10.0  10.0
c  10.0   7.0   8.0  10.0  10.0  10.0
d  10.0  10.0  10.0  10.0   NaN   NaN
e  10.0   1.0   1.0   1.0   1.0   NaN
f   NaN  16.0  17.0  18.0   NaN   NaN
g   NaN  10.0  10.0   NaN   NaN   NaN


In [82]:
# only fills the first two NaN's in each row with 0
fill_2 = df.fillna("#", limit=5)
print(df)
fill_2


    c1    c2    c3    c4   c5  c6
a  0.0   1.0   2.0  20.0  NaN NaN
b  3.0   4.0   5.0   NaN  NaN NaN
c  NaN   7.0   8.0   NaN  NaN NaN
d  NaN   NaN   NaN   NaN  NaN NaN
e  NaN   1.0   1.0   1.0  1.0 NaN
f  NaN  16.0  17.0  18.0  NaN NaN
g  NaN   NaN   NaN   NaN  NaN NaN


  c1  c2  c3  c4   c5   c6
a  0   1   2  20    #    #
b  3   4   5   #    #    #
c  #   7   8   #    #    #
d  #   #   #   #    #    #
e  #   1   1   1    1    #
f  #  16  17  18    #  NaN
g  #   #   #   #  NaN  NaN

In [83]:
df3 = df2.copy()
df3
df3["c6"] = np.NaN
df3["c7"] = np.NaN
df3["c8"] = np.NaN
df3["c9"] = np.NaN
df3.c1 = np.NaN

In [85]:
df3

   c1    c2    c3    c4  c5  c6  c7  c8  c9
a NaN   1.0   2.0  20.0 NaN NaN NaN NaN NaN
b NaN   4.0   5.0   NaN NaN NaN NaN NaN NaN
c NaN   7.0   8.0   NaN NaN NaN NaN NaN NaN
d NaN  10.0  11.0   NaN NaN NaN NaN NaN NaN
e NaN  13.0  14.0   NaN NaN NaN NaN NaN NaN
f NaN  16.0  17.0  18.0 NaN NaN NaN NaN NaN
g NaN   NaN   0.0   NaN NaN NaN NaN NaN NaN

In [84]:
# only fills the first two NaN's in each row with 0
fill_2 = df3.fillna(0, limit=2, axis = 'index')
fill_2

    c1    c2    c3    c4   c5   c6   c7   c8   c9
a  0.0   1.0   2.0  20.0  0.0  0.0  0.0  0.0  0.0
b  0.0   4.0   5.0   0.0  0.0  0.0  0.0  0.0  0.0
c  NaN   7.0   8.0   0.0  NaN  NaN  NaN  NaN  NaN
d  NaN  10.0  11.0   NaN  NaN  NaN  NaN  NaN  NaN
e  NaN  13.0  14.0   NaN  NaN  NaN  NaN  NaN  NaN
f  NaN  16.0  17.0  18.0  NaN  NaN  NaN  NaN  NaN
g  NaN   0.0   0.0   NaN  NaN  NaN  NaN  NaN  NaN

In [203]:
# only fills the first two NaN's in each row with 0
fill_3 = df3.fillna(1, limit=5)
print(fill_3)
fill_3


    c1    c2    c3    c4   c5   c6   c7   c8   c9
a  1.0   1.0   2.0   2.0  1.0  1.0  1.0  1.0  1.0
b  1.0   4.0   5.0   1.0  1.0  1.0  1.0  1.0  1.0
c  1.0   7.0   8.0   1.0  1.0  1.0  1.0  1.0  1.0
d  1.0  10.0  11.0   1.0  1.0  1.0  1.0  1.0  1.0
e  1.0  13.0  14.0   1.0  1.0  1.0  1.0  1.0  1.0
f  NaN  16.0  17.0  18.0  NaN  NaN  NaN  NaN  NaN
g  NaN   1.0   0.0   1.0  NaN  NaN  NaN  NaN  NaN


    c1    c2    c3    c4   c5   c6   c7   c8   c9
a  1.0   1.0   2.0   2.0  1.0  1.0  1.0  1.0  1.0
b  1.0   4.0   5.0   1.0  1.0  1.0  1.0  1.0  1.0
c  1.0   7.0   8.0   1.0  1.0  1.0  1.0  1.0  1.0
d  1.0  10.0  11.0   1.0  1.0  1.0  1.0  1.0  1.0
e  1.0  13.0  14.0   1.0  1.0  1.0  1.0  1.0  1.0
f  NaN  16.0  17.0  18.0  NaN  NaN  NaN  NaN  NaN
g  NaN   1.0   0.0   1.0  NaN  NaN  NaN  NaN  NaN

In [None]:
df3

## Filling in missing data

In [86]:
df.c4['f'] = np.nan

In [87]:
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     1.0
f     NaN
g     NaN
Name: c4, dtype: float64

In [88]:
df

    c1    c2    c3    c4   c5  c6
a  0.0   1.0   2.0  20.0  NaN NaN
b  3.0   4.0   5.0   NaN  NaN NaN
c  NaN   7.0   8.0   NaN  NaN NaN
d  NaN   NaN   NaN   NaN  NaN NaN
e  NaN   1.0   1.0   1.0  1.0 NaN
f  NaN  16.0  17.0   NaN  NaN NaN
g  NaN   NaN   NaN   NaN  NaN NaN

In [89]:
df.fillna(method="ffill")

    c1    c2    c3    c4   c5  c6
a  0.0   1.0   2.0  20.0  NaN NaN
b  3.0   4.0   5.0  20.0  NaN NaN
c  3.0   7.0   8.0  20.0  NaN NaN
d  3.0   7.0   8.0  20.0  NaN NaN
e  3.0   1.0   1.0   1.0  1.0 NaN
f  3.0  16.0  17.0   1.0  1.0 NaN
g  3.0  16.0  17.0   1.0  1.0 NaN

In [91]:
# extract the c4 column and fill NaNs forward
df.c5['a'] =3.0
print(df)
df.c4.fillna(method="ffill")

    c1    c2    c3    c4   c5  c6
a  0.0   1.0   2.0  20.0  3.0 NaN
b  3.0   4.0   5.0   NaN  NaN NaN
c  NaN   7.0   8.0   NaN  NaN NaN
d  NaN   NaN   NaN   NaN  NaN NaN
e  NaN   1.0   1.0   1.0  1.0 NaN
f  NaN  16.0  17.0   NaN  NaN NaN
g  NaN   NaN   NaN   NaN  NaN NaN


a    20.0
b    20.0
c    20.0
d    20.0
e     1.0
f     1.0
g     1.0
Name: c4, dtype: float64

In [93]:
df.fillna(method="bfill")

    c1    c2    c3    c4   c5  c6
a  0.0   1.0   2.0  20.0  3.0 NaN
b  3.0   4.0   5.0   1.0  1.0 NaN
c  NaN   7.0   8.0   1.0  1.0 NaN
d  NaN   1.0   1.0   1.0  1.0 NaN
e  NaN   1.0   1.0   1.0  1.0 NaN
f  NaN  16.0  17.0   NaN  NaN NaN
g  NaN   NaN   NaN   NaN  NaN NaN

In [92]:
# perform a backwards fill
df.c4.fillna(method="bfill")

a    20.0
b     1.0
c     1.0
d     1.0
e     1.0
f     NaN
g     NaN
Name: c4, dtype: float64

# Visualizing Data

#### Data Frame Plot
<a href="https://pandas.pydata.org/pandas-docs/stable/reference/frame.html#plotting">Generating Various charts directly from DataFrame</a>