#### Installing Pandas
pip install pandas 

### Reference Link for pandas
<a href="https://pandas.pydata.org/pandas-docs/stable/reference/index.html">API reference Pandas</a>

# Importing pandas into your application

In [331]:
# import numpy and pandas, and DataFrame / Series
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Set some pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# And some items for matplotlib
#%matplotlib inline 
#import matplotlib.pyplot as plt
#pd.options.display.mpl_style = 'default'

# Primary pandas objects

## The pandas Series

One-dimensional ndarray with axis labels (including time series).

In [332]:
# create a four item Series
s = Series([5, 8, 3, 4])
s

0    5
1    8
2    3
3    4
dtype: int64

In [333]:
print(s.array,type(s.array))

<PandasArray>
[5, 8, 3, 4]
Length: 4, dtype: int64 <class 'pandas.core.arrays.numpy_.PandasArray'>


In [334]:
print("Index : ", s.index)

Index :  RangeIndex(start=0, stop=4, step=1)


In [335]:
s.values
print(s.values, type(s.values))

[5 8 3 4] <class 'numpy.ndarray'>


In [336]:
s[1:3]

1    8
2    3
dtype: int64

In [337]:
# return a Series with the row with labels 1 and 3
s[[1,3,0]]

1    8
3    4
0    5
dtype: int64

In [338]:
s = pd.Series([1, 2, 3])
print(s)
s.update(pd.Series([4, 5, 6]))
s

0    1
1    2
2    3
dtype: int64


0    4
1    5
2    6
dtype: int64

In [339]:
# create a series using an explicit index
s = Series([5, 8, 3, 4], index = ['a', 'b', 'c', 'd'])
s

a    5
b    8
c    3
d    4
dtype: int64

In [340]:
s.update(Series([0, 2], index=['d', 'e']))
s

a    5
b    8
c    3
d    0
dtype: int64

In [341]:
a= s['a':'d']
print(a)
a['a'] = 20
print(s)

a    5
b    8
c    3
d    0
dtype: int64
a    20
b     8
c     3
d     0
dtype: int64


In [342]:
# look up items the series having index 'a' and 'd'
a = s[['b', 'd']]
print(a)
a['b'] = 1
s

b    8
d    0
dtype: int64


a    20
b     8
c     3
d     0
dtype: int64

In [343]:
# passing a list of integers to a Series that has
# non-integer index labels will look up based upon
# 0-based index like an array
s[[1, 3]]

b    8
d    0
dtype: int64

In [344]:
s.memory_usage()

224

In [345]:
# get only the index of the Series
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [346]:
a = s[['b', 'd']]
a['d'] =np.nan
a.hasnans

True

In [347]:
# create a Series who's index is a series of dates
# between the two specified dates (inclusive)
dates1 = pd.date_range('2021-03-01', '2021-03-06')
dates2 = pd.date_range('2021-03-02', '2021-03-07')
print(dates1)
print(dates2)

# create a Series with values(representing temperatures)
# for each date in the index
temps1 = Series([80, 82, 85, 90, 83, 87], 
                index = dates1)
print(temps1)
print(temps1.index)


DatetimeIndex(['2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06'],
              dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2021-03-02', '2021-03-03', '2021-03-04', '2021-03-05',
               '2021-03-06', '2021-03-07'],
              dtype='datetime64[ns]', freq='D')
2021-03-01    80
2021-03-02    82
2021-03-03    85
2021-03-04    90
2021-03-05    83
2021-03-06    87
Freq: D, dtype: int64
DatetimeIndex(['2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06'],
              dtype='datetime64[ns]', freq='D')


In [348]:
# calculate the mean of the values in the Series
temps1.mean()

84.5

In [349]:
temps1.max()

90

In [350]:
temps1.min()

80

In [351]:
temps1.sum()

507

In [352]:
temps1.cumsum()

2021-03-01     80
2021-03-02    162
2021-03-03    247
2021-03-04    337
2021-03-05    420
2021-03-06    507
Freq: D, dtype: int64

In [353]:
# create a second series of values using the same index
temps2 = Series([70, 75, 69, 83, 79, 77], 
                index = dates2)
print(temps2)
print(temps2.index)

2021-03-02    70
2021-03-03    75
2021-03-04    69
2021-03-05    83
2021-03-06    79
2021-03-07    77
Freq: D, dtype: int64
DatetimeIndex(['2021-03-02', '2021-03-03', '2021-03-04', '2021-03-05',
               '2021-03-06', '2021-03-07'],
              dtype='datetime64[ns]', freq='D')


In [354]:
print(temps1)

2021-03-01    80
2021-03-02    82
2021-03-03    85
2021-03-04    90
2021-03-05    83
2021-03-06    87
Freq: D, dtype: int64


In [355]:
print(temps2)

2021-03-02    70
2021-03-03    75
2021-03-04    69
2021-03-05    83
2021-03-06    79
2021-03-07    77
Freq: D, dtype: int64


In [356]:

# the following aligns the two by their index values
# and calculates the difference at those matching labels
temp_diffs = temps1 - temps2
print(type(temp_diffs))
temp_diffs
print(temp_diffs, type(temp_diffs))
print("###################################")
print(temp_diffs.index)

<class 'pandas.core.series.Series'>
2021-03-01     NaN
2021-03-02    12.0
2021-03-03    10.0
2021-03-04    21.0
2021-03-05     0.0
2021-03-06     8.0
2021-03-07     NaN
Freq: D, dtype: float64 <class 'pandas.core.series.Series'>
###################################
DatetimeIndex(['2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06', '2021-03-07'],
              dtype='datetime64[ns]', freq='D')


In [357]:
temp_diffs.hasnans

True

In [358]:
# lookup a value by date using the index
temp_diffs['2021-03-03']

10.0

In [359]:
# and also possible by integer position as if the 
# series was an array
temp_diffs[3]

21.0

In [360]:
dates = pd.Series(pd.date_range("2022-03-14", periods=10))
print(dates)

0   2022-03-14
1   2022-03-15
2   2022-03-16
3   2022-03-17
4   2022-03-18
5   2022-03-19
6   2022-03-20
7   2022-03-21
8   2022-03-22
9   2022-03-23
dtype: datetime64[ns]


In [361]:
#dates = pd.Series(pd.date_range("2022-03-14", periods=10, freq='Y'))
date_range = pd.date_range("2022-03-14", "2030-03-24", freq='Y')
dates = Series(date_range)
print(dates.index)
print("#####################################")
print("dates :", dates)
print("#####################################")
print(date_range)
print(dates.index)
print("#####################################")
print(dates.dt.is_leap_year)
dates.dt.is_year_end

RangeIndex(start=0, stop=8, step=1)
#####################################
dates : 0   2022-12-31
1   2023-12-31
2   2024-12-31
3   2025-12-31
4   2026-12-31
5   2027-12-31
6   2028-12-31
7   2029-12-31
dtype: datetime64[ns]
#####################################
DatetimeIndex(['2022-12-31', '2023-12-31', '2024-12-31', '2025-12-31',
               '2026-12-31', '2027-12-31', '2028-12-31', '2029-12-31'],
              dtype='datetime64[ns]', freq='A-DEC')
RangeIndex(start=0, stop=8, step=1)
#####################################
0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool


0    True
1    True
2    True
3    True
4    True
5    True
6    True
7    True
dtype: bool

In [362]:
dates = Series(pd.date_range('2022-12-28',periods = 10))
print(dates.dt.is_year_start)
print(dates.dt.is_year_end)
print(dates.dt.is_quarter_start)
print(dates.dt.is_quarter_end)

0    False
1    False
2    False
3    False
4     True
5    False
6    False
7    False
8    False
9    False
dtype: bool
0    False
1    False
2    False
3     True
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool
0    False
1    False
2    False
3    False
4     True
5    False
6    False
7    False
8    False
9    False
dtype: bool
0    False
1    False
2    False
3     True
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool


In [363]:
s = pd.date_range('2022-03-06', '2022-03-15', freq='D').to_series()
print(s.dt.weekday)
print("###############################")
print(s.dt.day_name())
print("###############################")
print(s.dt.dayofweek)

2022-03-06    6
2022-03-07    0
2022-03-08    1
2022-03-09    2
2022-03-10    3
2022-03-11    4
2022-03-12    5
2022-03-13    6
2022-03-14    0
2022-03-15    1
Freq: D, dtype: int64
###############################
2022-03-06       Sunday
2022-03-07       Monday
2022-03-08      Tuesday
2022-03-09    Wednesday
2022-03-10     Thursday
2022-03-11       Friday
2022-03-12     Saturday
2022-03-13       Sunday
2022-03-14       Monday
2022-03-15      Tuesday
Freq: D, dtype: object
###############################
2022-03-06    6
2022-03-07    0
2022-03-08    1
2022-03-09    2
2022-03-10    3
2022-03-11    4
2022-03-12    5
2022-03-13    6
2022-03-14    0
2022-03-15    1
Freq: D, dtype: int64


In [364]:
temps1
temps1.index
temps1.index.dt.is_leap_year

AttributeError: 'DatetimeIndex' object has no attribute 'dt'

In [None]:
s = pd.Series(range(5))
print(s)
print(s.where(s>2))
s>2


In [None]:
s = pd.Series(range(1,6))
print(s)
print(s.mask(s>2))
print(s.mask(s > 1, 10))

In [None]:
df = pd.DataFrame([('falcon', 'bird', 389.0),
                   ('parrot', 'bird', 24.0),
                   ('lion', 'mammal', 80.5),
                   ('monkey', 'mammal', np.nan)],
                  columns=['name', 'class', 'max_speed'],
                  index=[0, 2, 3, 1])

In [None]:
print(df.take([0, 2],axis=1))

In [None]:
copy_data = df.take([0,2],is_copy=False,axis=0)
print(df.index)
print(df)
print(copy_data.index)
print("#####################################")


#['name', 'class', 'max_speed']
print(copy_data)

In [None]:
copy_data = df.take([0,1,2],is_copy=True,axis=1)
print(df.index)
print(df)
print(copy_data.index)
print("#####################################")


In [None]:
copy_data['species']

In [None]:
print(copy_data[['name', 'species']])

In [None]:
print(copy_data[['name','species']][0:3])
print(copy_data['max_speed'][[0,2]])
print(copy_data['name'][0:3])

In [None]:
print(df.name)
print(df.classname)
print(df.max_speed)
print("###########################")
print(df.name,df.classname,df.max_speed)

In [None]:
copy_data['max_speed'][0:4] 

In [None]:
copy_data['max_speed'][0:3] = [100,200,200]

In [None]:
copy_data['max_speed'][[2,3,1]] = Series([200,300,500])

In [None]:
df

## The pandas DataFrame

In [None]:
# create a DataFrame from the two series objects temp1 and temp2
# and give them column names
temps_df = DataFrame(
            {'Missoula': temps1, 
             'Philadelphia': temps2})
print(type(temps_df))
temps_df

In [None]:
temps_df.describe()

In [None]:
temps_df.info()

In [None]:
# get the column with the name Missoula
temps_df['Missoula']


In [None]:
# likewise we can get just the Philadelphia column
print(type(temps_df['Philadelphia']))
temps_df['Philadelphia']

In [None]:
# return both columns in a different order
x = temps_df[['Philadelphia']]
print(type(x))
print(x)

In [None]:
x = temps_df[['Philadelphia','Missoula']]
print(x)

In [None]:
x = temps_df['Missoula']
print(type(x))
print(x)

In [None]:
print(temps_df.Missoula)

In [None]:
print(temps_df.Missoula['2021-03-03'])
print("###########################################")
print(temps_df.Missoula['2021-03-03':'2021-03-05'])
print("###########################################")
print(temps_df.Missoula.index)
print(type(temps_df.Missoula))
s = temps_df.Missoula
#print(s[['2019-07-03','2019-07-05'], dtype = 'datetime64[ns]'])

In [None]:
# retrieve the Missoula column through property syntax
print(type(temps_df.Missoula))
temps_df.Missoula

In [None]:
# calculate the temperature difference between the two cities
temps_df.Missoula - temps_df.Philadelphia

In [None]:
# add a column to temp_df which contains the difference in temps
temps_df['Difference'] = temp_diffs
temps_df['sum'] = temps_df.Missoula + temps_df.Philadelphia
print(temps_df)
print(temps_df.index)
temps_df[2:7:2]

In [None]:
# get the columns, which is also an Index object
temps_df.columns
#temps_df.Difference1
#temps_df["Difference1"]

In [None]:
for x in temps_df.columns:
    print(x)
    for vals in temps_df[x]:
        print(vals)

In [None]:
# slice the temp differences column for the rows at 
# location 1 through 4 (as though it is an array)
temps_df.Difference[1:6]

In [None]:
print(temps_df)
print("##############")
# get the row at array position 1
print((temps_df.iloc[6]))
print("##############")
temps_df.iloc[6] #particular row index
print("#######4:8#######")
print(temps_df.iloc[4:8]) # from to To Row
#include step
temps_df.iloc[2:7:2] #from to To with step

In [None]:
# get the row at array position 1
print(type(temps_df.iloc[1:6]))
temps_df.iloc[1:6]

In [None]:
temps_df.iloc[1:5:2]

In [None]:
temps_df
#temps_df.loc['2019-07-04':'2019-07-07':2]

In [None]:
# retrieve row by index label using .loc
print(temps_df.loc['2021-03-03'])
temps_df.loc['2021-03-03','sum']

In [None]:
temps_df

In [None]:
temps_df.Difference[[1,3,5]]

In [None]:
# get the values in the Differences column in tows 1, 3 and 5
# using 0-based location
temps_df.iloc[[1, 3, 5]].Difference

In [None]:
temps_df>82

In [None]:
temps_df[temps_df>82]

In [None]:
# which values in the Missoula column are > 82?
temps_df.Missoula > 82

In [None]:
# return the rows where the temps for Missoula > 82
temps_df[temps_df.Missoula > 82]

In [None]:
###### provides default 5 values from top
temps_df.head()

In [None]:
###### provides default 5 values from top
temps_df.head(2)

In [None]:
###### provides default 5 values from top
temps_df.tail(4)

# Loading data from files and the web into a DataFrame

In [None]:
# display the contents of test1.csv
# which command to use depends on your OS
!cat data/test1.csv # on non-windows systems
#!type data/test1.csv # on windows systems

###### Accessing Data

### Reading Data From HTML

In [None]:
import pandas as pd
import io

In [None]:
pd.read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, thousands=', ', encoding=None, decimal='.', converters=None, na_values=None, keep_default_na=True, displayed_only=True)

In [None]:
import html5lib
url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
#pd.read_html()
dfs = pd.read_html(url, match="Bank Name")

In [None]:
table_MN = pd.read_html('https://en.wikipedia.org/wiki/Minnesota', match='Election results from statewide races')
print(table_MN)

table_MN[0].head(15)

### Reading Data From CLIPBOARD

In [None]:
x = pd.read_clipboard()
print(x)
print(type(x))
#print(x.columns)



In [None]:
# read the contents of the file into a DataFrame
import os
if(os.path.exists('../Titanic.csv')):
    df = pd.read_csv('../Titanic.csv')
    print(df)
else:
    print("File not exists")

In [None]:
# read the contents of the file into a DataFrame
import os
if(os.path.exists('../NLP/test1.csv')):
    df = pd.read_csv('../NLP/test1.csv')
    print(df)
else:
    print("File not exists")

In [None]:
df.columns

In [None]:
# the contents of the date column
df[["date","2"]]

In [None]:
# we can get the first value in the date column
print(type(df.date[0]))
df.date[0]

In [None]:
# it is a string
type(df.date[0])

In [None]:
print(df.date)
type(df.date)


In [None]:
# read the data and tell pandas the date column should be 
# a date in the resulting DataFrame
df = pd.read_csv('../test1.csv', parse_dates=['date'])
print(df.date[0])
type(df.date[0])

In [None]:
df.index

In [None]:
# verify the type now is date
# in pandas, this is actually a Timestamp
type(df.date[0])

In [None]:
# unfortunately the index is numeric which makes
# accessing data by date more complicated
df.index

In [None]:
# read in again, now specity the data column as being the 
# index of the resulting DataFrame
df = pd.read_csv('../test1.csv', parse_dates=['date'], index_col='date')
print(df)
print(df.index)

In [None]:
# read in again, now specity the data column as being the 
# index of the resulting DataFrame
df = pd.read_csv('../test1.csv', parse_dates=['date'], index_col=['date','0'])
print(df)
print(df.index)

In [None]:
df.columns

In [None]:
# use column 0 as the index
msft = pd.read_csv("../msft.csv")
print(msft.head())
print(msft.describe())
print(msft.memory_usage())

In [None]:
msft = pd.read_csv("../msft.csv", index_col=2)
print(msft.head())

In [None]:
msft = pd.read_csv("../msft.csv", index_col=[2,'Low',6])
print(msft.head())

In [None]:
# examine the types of the columns in this DataFrame
msft.dtypes

In [None]:
import numpy as np
# specify that the Volume column should be a float64
msft = pd.read_csv("../msft.csv", parse_dates = ['Date'], dtype = { 'Volume' : np.int32, 'Low': str}, index_col = 'Close')
print(msft.dtypes)
print(msft)


In [None]:
import numpy as np
# specify that the Volume column should be a float64
msft = pd.read_csv("../msft.csv", parse_dates = ['Date'], dtype = { 'Volume' : np.int32, 'Low': str}, index_col = 'Close')
msft_High = msft.High.astype('int32', copy=True, errors='raise')
print(msft)
msft_High

In [None]:
msft = pd.read_csv("../msft.csv", dtype = {'High': str ,'Volume' : np.float64})
print(msft.dtypes)
print("################################################")
msft.Open= msft.Open.astype(np.int64,errors="ignore") # errors ="ignore" or "raise"
msft.Open= msft.Open.astype('int64',errors="ignore") # errors ="ignore" or "raise"
print(msft.Open.head(10))
print(msft.Open.dtypes)
print("################################################")
print(msft.dtypes)
print("################################################")
print(msft.head(20))

In [None]:
msft = msft.convert_dtypes()
msft.dtypes

In [None]:
#to_datetime
#Convert argument to datetime.

#to_timedelta
#Convert argument to timedelta.

#to_numeric
#Convert argument to a numeric type.
print(msft.Low.dtype)
Msft_Low = pd.to_numeric(msft.Low, errors='coerce')
print(Msft_Low)
print(Msft_Low.dtype)
to_timedelta

In [None]:
First_Dates = pd.date_range('2022-03-06', '2022-03-15', freq='D').to_series()
Second_Dates = pd.date_range('2022-03-10', '2022-03-19', freq='D').to_series()
print(First_Dates-Second_Dates)

pd.to_timedelta(First_Dates-Second_Dates)

## Specifying column names

In [None]:
# specify a new set of names for the columns
# all lower case, remove space in Adj Close
# also, header=1 skips the header row
df = pd.read_csv("../msft.csv", header=3)
print(df.head(10))
df.columns
#df.columns['2019-07-17']

In [None]:
df = pd.read_csv("../msft.csv",header=3,names=['A', 'B', 'C','D', 'E'])
print(df.head(10))
#print(df.columns)
print(df.index)

print("single index retrieve", df.A['2019-07-16'])
print("single index retrieve", df.A[['2019-07-16','2019-07-14']])
print("multiindex retrieve", df.A[[('2019-07-16', 83.77),('2019-07-14', 83.66)]])



In [None]:
# specify a new set of names for the columns
# all lower case, remove space in Adj Close
# also, header=0 skips the header row
df = pd.read_csv("../msft.csv",header=4)
df.head(100)
df

### Specifying specific columns to load

In [None]:
# read in data only in the Date and Close columns
# and index by the Date column
df2 = pd.read_csv("../msft.csv", parse_dates=['Date'],usecols=['Date', 'Close','Low'] , index_col=['Date'])
#df2 = pd.read_csv("../msft.csv", parse_dates=['Date'],usecols=[ 'Close','Low'] , index_col=['Close'])
#df2 = pd.read_csv("../msft.csv",usecols=[ 'Close','Low'] , index_col=['Close'])
df2.head(5)

#names=['A', 'B', 'C','D', 'E']

In [None]:
# read in data only in the Date and Close columns
# and index by the Date column
df2 = pd.read_csv("../msft.csv", parse_dates=['A'],usecols=['A', 'B','C'] ,header =1, names=['A', 'B', 'C','D', 'E', 'F','G'], index_col=['A'])
df2.head(5)

#names=['A', 'B', 'C','D', 'E']

In [None]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37

# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

## Saving a DataFrame to a CSV

In [None]:
# save df2 to a new csv file
# also specify naming the index as date
df2.to_excel("myfile1.xls",  index_label='A')
#df2.to_excel("myfile2.xls", sheet_name='mysheet1', index_label='date')
df2.to_excel("myfile3.xls", sheet_name='mysheet2', index_label='A')

df2.to_csv("msft_modified.csv", index_label='A')

#### same file with different sheets

In [None]:
with pd.ExcelWriter('myfile.xls') as writer:  # doctest: +SKIP
    df2.to_excel(writer, sheet_name='Sheet_name_1')
    df2.to_excel(writer, sheet_name='Sheet_name_2')

In [None]:
# view the start of the file just saved
!head data/msft_modified.csv
#type data/msft_modified.csv # windows

# Working with missing data

## Setup

In [366]:
import pandas as pd
import numpy as np
print(np.arange(0, 15).reshape(5, 3))
# create a DataFrame with 5 rows and 3 columns
df = pd.DataFrame(np.arange(0, 15).reshape(5, 3), 
               index=['a', 'b', 'c', 'd', 'e'], 
               columns=['c1', 'c2', 'c3'])
df

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [12 13 14]]


   c1  c2  c3
a   0   1   2
b   3   4   5
c   6   7   8
d   9  10  11
e  12  13  14

In [367]:
# add some columns and rows to the DataFrame
# column c4 with NaN values
df['c4'] = np.nan
df

   c1  c2  c3  c4
a   0   1   2 NaN
b   3   4   5 NaN
c   6   7   8 NaN
d   9  10  11 NaN
e  12  13  14 NaN

In [368]:
# row 'f' with 15 through 18 
df.loc['f'] = np.arange(15, 19) 
# row 'g' will all NaN
df.loc['g'] = np.arange(19, 23)
df.iloc[6] = np.nan
print(df)

     c1    c2    c3    c4
a   0.0   1.0   2.0   NaN
b   3.0   4.0   5.0   NaN
c   6.0   7.0   8.0   NaN
d   9.0  10.0  11.0   NaN
e  12.0  13.0  14.0   NaN
f  15.0  16.0  17.0  18.0
g   NaN   NaN   NaN   NaN


In [None]:
df.loc['g'] = np.nan
# column 'C5' with NaN's
df

In [369]:
df['c5'] = np.nan
# change value in col 'c4' row 'a'
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0   NaN NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [370]:
df['c4']['a'] = 20
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [371]:
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

## Determining NaN values in Series and DataFrame objects

In [None]:
# which items are NaN?
print(df)
df.isnull()

In [None]:
# total count of NaN values
nullsumforcolumns = df.isnull().sum()#.sum()
nullsumforcolumns

In [None]:
nullsumforcolumns.sum()

In [None]:
# number of non-NaN values in each column
#df.count()
df.notnull()

In [None]:
# which items are not null?
df.notnull().sum()

In [None]:
# which items are not null?
df.notnull().sum().sum()

In [None]:
# which items are not null?
df.count().sum()

# Selecting out (dropping) missing data

In [None]:
df

In [None]:
df.notnull()

In [None]:
df.c3.notnull()

In [None]:
df.c3[df.c3.notnull()]

In [None]:
df[df.c3.notnull()]

In [None]:
df.c3.isnull()

In [None]:
df.c3[df.c3.isnull()]

In [None]:
df[df.c3.isnull()]

##### expression on dataframe applied to the dataframe itself

In [None]:
df.notnull()

In [None]:
df[df.notnull()]

In [None]:
df

In [None]:
df.isnull()

In [None]:
df[df.isnull()]

In [None]:
df.c4

In [None]:
df.c4.notnull()

In [None]:
# select the non-NaN items in column c4
df.c4[df.c4.notnull()]

In [None]:
df[df.c4.notnull()]

In [None]:
df.c4

In [None]:
# .dropna will also return non NaN values
# this gets all non NaN items in column c4
df.c4.dropna()

In [None]:
df

In [None]:
df['c5']['a'] = 1
df.dropna()

In [None]:
df['c1']['g'] = 1

In [None]:
df

In [None]:
df['c4']['a'] = 2

In [None]:
df.dropna()
### all the columns should be not null for a row 
## if one colum value is null or Nan for a row, dropna method will drop that row

In [None]:
# dropna returns a copy with the values dropped
# the source DataFrame / column is not changed
df.c4

In [None]:
# using how='all', only rows that have all values
# as NaN will be dropped
a = df.dropna(how = 'any')#default
print(a)
a = df.dropna(how = 'all')
print(a) 


In [None]:
df['c5'][0] = np.nan

In [None]:
df

In [None]:
# flip to drop columns instead of rows
df.dropna(how='all', axis=1) # say goodbye to c5, axis = 1 column or "columns",
#axis = 0 for rows or "rows" 
#df.dropna(how='all', axis="rows") 

In [None]:
df

In [None]:
# flip to drop columns instead of rows
df.dropna(how='all', axis=0) # say goodbye to g
#df

In [None]:
# flip to drop columns instead of rows
df.dropna(how='all', axis=1) # say goodbye to g
#df

In [None]:
# flip to drop columns instead of rows
df.dropna(how='any', axis=1) # say goodbye to g
#df

In [None]:
# make a copy of df
df2 = df.copy()

# replace two NaN cells with values
df2.loc['g'].c1 = 0
df2.loc['g'].c3 = 0

#.ix is deprecated. Please use
#.loc for label based indexing or loc -> label a, b, c, 
#.iloc for positional indexing iloc -> index of labe a -> 0, b -> 1 -- 0,1,2,3...
df2

In [None]:
# now drop columns with any NaN values
df2.dropna(how='any', axis=0) 

In [None]:
df

In [None]:
#threshold is minimum not nan values
df.dropna(thresh=4, axis='rows')#'rows' or 'columns' can be given for axis
#df
#(df.dropna(thresh=4, axis='rows')).iloc[[0]].c4
#(df.dropna(thresh=4, axis='rows')).c4[1]

In [None]:
df.dropna(thresh=4, axis=0)

In [None]:
df.dropna(thresh=3, axis=1)

In [None]:
# only drop columns with at least 5 NaN values
#threshold is minimum not nan values

df.dropna(thresh=6, axis=1)

In [None]:
df

In [None]:
# NaN's don't count as an item in calculating
# the means
df.mean(axis = 0)# 0 for rows, 1 for column
#df.mean(axis = "rows")# 0 for rows, 1 for column

In [None]:
df.mean(axis = 1)# 0 for rows, 1 for column

In [None]:
df.mean()

In [None]:
df.mean().mean()

In [None]:
df

In [None]:
df.median()

In [None]:
median1 =df.median(axis=0)
print(median1,type(median1))
median1.median()

#### Filling Nan

In [None]:
# return a new DataFrame with NaN's filled with 0
filled = df.fillna(0.0)
filled


In [None]:
# having replaced NaN with 0 can make
# operations such as mean have different results
filled.mean(axis="rows")#consider all the rows for each column

In [None]:
df

In [None]:
#axis not supported -- works only based on column
# only fills the first two NaN's in each row with 0
print(df)
print("####################################")
fill_2 = df.fillna(1.0, limit=1, axis =0)#0)rows or index 
print(fill_2)

In [None]:
df

In [None]:
#DataFrame.replace(self, to_replace=None, value=None, inplace=False, limit=None, regex=False, method='pad')
#df_regex = df.replace("[0-9\.]+",1111.0,limit=3, regex = True)
df3 = df.copy()
df3['c1'] = 0.0
df3['c4']['e'] = 0.0
df3['c4']['c'] = 0.0
df3['c4']['a'] = 0.0
print(df3)

In [None]:
df3

In [None]:
df_regex = df3.replace(0.0,11.0, regex = False)#replace works over rows
print(df_regex)
#DataFrame.replace(df2,1.0,10.0)

In [None]:
df_new = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
                   'B': ['abc', 'bar', 'xyz']})
print(df_new.replace(to_replace=r'^ba.*$', value='new', regex=True))
print(df_new.replace(to_replace=r'^ba.*t$', value='new', regex=True))

In [None]:
df

In [None]:
# only fills the first two NaN's in each row with 0
df.loc['e'] = 1
df['c1'][['c','d','e','f']] = np.nan
df['c6'] = np.nan

df.loc['d'] = np.nan
print(df)


In [None]:
print("#######################################")#axis not influence
fill_2 = df.fillna(10, limit=3, axis = 'columns') # index or 0, # columns or 1
print(fill_2)


In [None]:
# only fills the first two NaN's in each row with 0
fill_2 = df.fillna("#", limit=5)
print(df)
fill_2


In [None]:
df3 = df2.copy()
df3
df3["c6"] = np.NaN
df3["c7"] = np.NaN
df3["c8"] = np.NaN
df3["c9"] = np.NaN
df3.c1 = np.NaN

In [None]:
df3

In [None]:
# only fills the first two NaN's in each row with 0
fill_2 = df3.fillna(0, limit=2, axis = 'index')
fill_2

In [None]:
# only fills the first two NaN's in each row with 0
fill_3 = df3.fillna(1, limit=5)
print(fill_3)
fill_3


In [None]:
df3

## Filling in missing data

In [None]:
df.c4['f'] = np.nan

In [None]:
df.c4

In [None]:
df

In [None]:
df.fillna(method="ffill")

In [None]:
# extract the c4 column and fill NaNs forward
df.c5['a'] =3.0
print(df)
df.c4.fillna(method="ffill")

In [None]:
df.fillna(method="ffill")

In [365]:
#df.loc['g']['c6'] = 16
#df.iloc[6]['c6'] = 16
#df['c6']['g'] = 16
#df[['c6','c5']]['g'] = [16,18]
df[('c6','g')]
df

    c1    c2    c3   c4   c5    c6  (c6, g)
a  0.0   1.0   2.0  2.0  3.0   NaN       18
b  3.0   4.0   5.0  NaN  NaN   NaN       18
c  NaN   7.0   8.0  NaN  NaN   NaN       18
d  NaN   NaN   NaN  NaN  NaN   NaN       18
e  NaN   1.0   1.0  1.0  1.0   NaN       18
f  NaN  16.0  17.0  NaN  NaN   NaN       18
g  NaN   NaN   NaN  NaN  NaN  16.0       18

In [373]:
df.fillna(method="bfill")

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  20.0 NaN
b   3.0   4.0   5.0  18.0 NaN
c   6.0   7.0   8.0  18.0 NaN
d   9.0  10.0  11.0  18.0 NaN
e  12.0  13.0  14.0  18.0 NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [374]:
# perform a backwards fill
df.c4.fillna(method="bfill")

a    20.0
b    18.0
c    18.0
d    18.0
e    18.0
f    18.0
g     NaN
Name: c4, dtype: float64

# Visualizing Data

#### Data Frame Plot
<a href="https://pandas.pydata.org/pandas-docs/stable/reference/frame.html#plotting">Generating Various charts directly from DataFrame</a>