In [1]:
import numpy as np
import pandas as pd
pd.__version__

'1.2.4'

<p class="lead display-3 alert alert-info">1. Pandas <strong>objects</strong> can be thought of as NumPy arrays with custom structured dtypes.</p>

<h1>Three kinds of Pandas data structures:</h1>

<h3 class="display-4">Series Objects</h3>
<blockquote>A Pandas Series is a one-dimensional array of indexed data. It can be created from a list or array</blockquote>

<h3 class="display-4">DataFrame Objects</h3>
<blockquote>DataFrame is an analog of a two-dimensional array with both flexible row indices and flexible column names</blockquote>

<h3 class="display-4">Index Objects</h3>
<blockquote>A Pandas Index is an immutable, array-like sequence of labels. It is the object that holds the row labels of a Series, and the row and column labels of a DataFrame.</blockquote>

<h2>Series Objects</h2>

In [2]:
# pd.Series builds a Series object; `data` supplies the values and
# `name` attaches an optional label to the whole Series.
evenly_spaced_values = np.linspace(1, 2, 5)
my_series = pd.Series(data=evenly_spaced_values, name="Values b/w 1 to 2")

In [3]:
# Take a closer look at the series created above.

# Printing the Series shows index, values, name and dtype together.
print("The series as it looks:")
print()
print(my_series)
print()

# .values returns just the data, without the index.
print("Values property of the object: All values will be shown as a list")
print(my_series.values)
print()

# .index returns the index object on its own.
print("Index of this series:")
print(my_series.index)
print()

# Integer slice [1:3]: the result keeps the index labels of the selected items.
print("Let's get the first few values using slicing")
print(my_series[1:3])

The series as it looks:

0    1.00
1    1.25
2    1.50
3    1.75
4    2.00
Name: Values b/w 1 to 2, dtype: float64

Values property of the object: All values will be shown as a list
[1.   1.25 1.5  1.75 2.  ]

Index of this series:
RangeIndex(start=0, stop=5, step=1)

Let's get the first few values using slicing
1    1.25
2    1.50
Name: Values b/w 1 to 2, dtype: float64


<h3>Above we can note a few things</h3>
<ul class="list-group">
    <li class="list-group-item">If you provide a name too, it shows up. Else it's None.</li>
    <li class="list-group-item">Index by default, starts from 0.</li>
    <li class="list-group-item">We can get just the values (without the index) with the values property.</li>
    <li class="list-group-item">Sliced series will also show indexes</li>
</ul>

<p class="lead display-3 alert alert-info">2. We can have our own index range</p>

<ul class="list-group">
    <li class="list-group-item">It can start with anything like a letter, or a number</li>
    <li class="list-group-item">Unlike python indexing, you can then use these custom indexes</li>
</ul>

In [4]:
# Build a Series whose explicit index is made of string labels.
# NOTE: np.random is not seeded here, so the values change on every run.
series_with_str_index = pd.Series(np.random.random(5), index=list('abcde'))
print(series_with_str_index.index)

# Slicing with labels
print(series_with_str_index['b':'d'])
print()

# Pick individual items by passing a list of explicit labels.
print('using explicit index')
print(series_with_str_index[['b', 'e']])
print()

print('Get only specific index items with explicit index')
print(series_with_str_index[['a', 'e']])
print()

# Integer slices use the implicit, Python-style positions instead.
print('using implicit python index')
print(series_with_str_index[1:3])
print()

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
b    0.098687
c    0.086036
d    0.295852
dtype: float64

using explicit index
b    0.098687
e    0.802540
dtype: float64

Get only specific index items with explicit index
a    0.773091
e    0.802540
dtype: float64

using implicit python index
b    0.098687
c    0.086036
dtype: float64



<p class="lead display-3 alert alert-info">3. Series as specialized dictionary.</p>

In [5]:
# Let's create a pandas Series object out of a Python dictionary:
# the dict keys become the index labels and the dict values become the data.

person_dict = {'name':'Jaddu', 'age':40, 'weigth': 68.5}

print("The Series:", end='\n\n')
dict_series = pd.Series(person_dict)
print(dict_series, end='\n\n')

print("Index of this series")
print(dict_series.index, end='\n\n')

# Label-based slicing walks the index in its stored order and includes
# both endpoints ('name' through 'age').
print("Slicing:")
print(dict_series['name':'age'], end='\n\n')

# The index is NOT sorted: it keeps the dictionary's insertion order, which is
# why 'name':'age' selects two items here instead of an empty range.
print("Note: The index keeps the dictionary's insertion order (it is not sorted), and label slicing follows that order.")

The Series:

name      Jaddu
age          40
weigth     68.5
dtype: object

Index of this series
Index(['name', 'age', 'weigth'], dtype='object')

Slicing:
name    Jaddu
age        40
dtype: object

Note: The index keys are behind-the-scene sorted and used while slicing.


<h3>We can create Series from one value too</h3>
<p>If we provide an index list too, value will be the same for every index</p>

In [6]:
# A single scalar produces a one-element Series with the default integer index.
pd.Series('a')

0    a
dtype: object

In [7]:
# With an explicit index, the scalar is repeated for every index label.
pd.Series('a', index=[0,1,2])

0    a
1    a
2    a
dtype: object

<h3>We can get values of only selected indexes (with a dictionary as data)</h3>


In [8]:
# Passing `index` together with a dict keeps only the listed keys
# (the 'weigth' entry is left out of the result).
pd.Series(person_dict, index=['name', 'age'])

name    Jaddu
age        40
dtype: object

In [9]:
# Boolean masking: keep only the elements whose value is greater than 1.5.
greater_than_mid = my_series > 1.5
my_series[greater_than_mid]

3    1.75
4    2.00
Name: Values b/w 1 to 2, dtype: float64

<h3>Indexers: loc and iloc (ix was removed in pandas 1.0)</h3>

- **loc** attribute allows indexing and slicing that always references the explicit index
- **iloc** attribute allows indexing and slicing that always references the implicit Python-style index:

In [10]:
# A full slice through .loc returns every element of the series.
series_with_str_index.loc[:]

a    0.773091
b    0.098687
c    0.086036
d    0.295852
e    0.802540
dtype: float64

In [11]:
# Proof that while using loc, you can ONLY use explicit indexes
# (as defined while creating the series): integer positions are rejected
# on a string-labelled index.
try:
    series_with_str_index.loc[0:2]
except TypeError as em:
    # Check a stable fragment of the message rather than the full text,
    # which may be worded differently in other pandas versions.
    assert 'cannot do slice indexing' in str(em)
else:
    # Without this branch the "proof" would pass silently if no error occurred.
    raise AssertionError(".loc[0:2] unexpectedly succeeded on a string-labelled index")

In [12]:
# So this will work: .loc slices by label, and the end label ('e') is included.
series_with_str_index.loc['b':'e']

b    0.098687
c    0.086036
d    0.295852
e    0.802540
dtype: float64

In [13]:
# Proof that iloc only works with python implicit indexes (0,1,2,.....):
# string labels are rejected.
try:
    series_with_str_index.iloc['b':'e']
except TypeError as em:
    # Check a stable fragment of the message rather than the full text,
    # which may be worded differently in other pandas versions.
    assert 'cannot do positional indexing' in str(em)
else:
    # Without this branch the "proof" would pass silently if no error occurred.
    raise AssertionError(".iloc['b':'e'] unexpectedly succeeded with string labels")

# Positional slice: positions 1, 2 and 3 (the end position 4 is excluded).
series_with_str_index.iloc[1:4]

b    0.098687
c    0.086036
d    0.295852
dtype: float64

<h1 class="alert alert-info">Dataframes</h1>

<p> You can say that it is a collection of series with a generalised index for easy access to the data.</p>

In [14]:
# Building a DataFrame out of several Series that share the same labels.

# Raw per-item data; the keys are the grocery item names.
price_grocery = dict(rice=12, eggs=5.5, milk=3, coke=2.99)
category_grocery = dict(rice='Staple', eggs='Poultry', coke='Beverages', milk='Milk Product')
stock_grocery = dict(rice=100, eggs=200, coke=50, milk=10)

# One Series per attribute, each indexed by item name.
price_series, category_series, stock_series = (
    pd.Series(item_data)
    for item_data in (price_grocery, category_grocery, stock_grocery)
)

# A dict of Series becomes a DataFrame: every key is a column name and the
# rows are matched up by item label.
grocery_columns = {
    'unit_price': price_series,
    'category': category_series,
    'stock': stock_series,
}
grocery_df = pd.DataFrame(grocery_columns)
grocery_df

Unnamed: 0,unit_price,category,stock
coke,2.99,Beverages,50
eggs,5.5,Poultry,200
milk,3.0,Milk Product,10
rice,12.0,Staple,100


<blockquote>Thus DataFrame can be thought of as a generalization of a two-dimensional NumPy array, where both the rows and columns have a generalized index for accessing the data.</blockquote>

In [15]:
# Exploring a DataFrame: its row labels (index) first, then its column labels.
print(grocery_df.index, grocery_df.columns, sep="\n")

Index(['coke', 'eggs', 'milk', 'rice'], dtype='object')
Index(['unit_price', 'category', 'stock'], dtype='object')


<blockquote><mark>TIP:</mark> We can also think of a DataFrame as a specialization of a dictionary. Where a dictionary maps a key to a value, a DataFrame maps a column name to a Series of column data.</blockquote>

<h3>Accessing Data</h3>

In [16]:
# Dictionary style: index the frame with a key (your columns are the keys)
# to get that column back as a Series.
grocery_df['stock']

coke     50
eggs    200
milk     10
rice    100
Name: stock, dtype: int64

In [28]:
# As properties: a column can also be read as an attribute of the frame.
grocery_df.stock

# NOTE: For this to work, the column name must be a string that is a valid
# Python identifier, and it must not conflict with an existing DataFrame
# attribute or method name.

coke     50
eggs    200
milk     10
rice    100
Name: stock, dtype: int64

<h2 style="color:darkblue">Constructing Dataframes objects</h2>

In [17]:
# We can create a DataFrame from a single Series;
# `columns` supplies the label of the resulting column.

pd.DataFrame(price_series, columns=['price'])

Unnamed: 0,price
rice,12.0
eggs,5.5
milk,3.0
coke,2.99


In [18]:
# We can create from a list of dicts: each dict becomes one row,
# and the dict keys become the column names.
pd.DataFrame([price_grocery, category_grocery, stock_grocery])

Unnamed: 0,rice,eggs,milk,coke
0,12,5.5,3,2.99
1,Staple,Poultry,Milk Product,Beverages
2,100,200,10,50


In [19]:
# From a two-dimensional NumPy array, with explicit row (index) and column labels.
# NOTE: the random values are not seeded, so they change on every run.

pd.DataFrame(np.random.random((3, 2)), columns=['Home', 'Away'], index=['Dev1', 'Dev2', 'Dev3'])

Unnamed: 0,Home,Away
Dev1,0.672154,0.020464
Dev2,0.294023,0.979542
Dev3,0.489253,0.534273


In [20]:
# From a NumPy structured array (remember custom dtype):
# the field names ('height', 'count') become the column names.

pd.DataFrame(np.ones(3, dtype=[('height','f8'), ('count', 'int8')]))

Unnamed: 0,height,count
0,1.0,1
1,1.0,1
2,1.0,1


<h3>Finally, you can create an index explicitly</h3>


In [21]:
# Build an Index directly from a list of labels.
my_index = pd.Index([2, 5, 7, 9, 11])

# Index objects are immutable: item assignment raises TypeError.
try:
    my_index[2] = 123
except TypeError:
    print("Yup...cant do it")

# Like a NumPy array, an Index exposes ndim, shape, dtype and size.
index_summary = (my_index.ndim, my_index.shape, my_index.dtype, my_index.size)
index_summary


Yup...cant do it


(1, (5,), dtype('int64'), 5)

<h2 style="color:darkgreen">Lets see what we can do with a dataframe</h2>

In [22]:
# max() method is available on df: it returns the maximum of each column
# (the string column 'category' is compared lexicographically).
grocery_df.max()

unit_price      12.0
category      Staple
stock            200
dtype: object

In [23]:
# min() works the same way, returning the minimum of each column.
grocery_df.min()

unit_price         2.99
category      Beverages
stock                10
dtype: object

In [24]:
# mean() is only defined for numeric data. Ask for the numeric columns
# explicitly instead of relying on pandas silently dropping the string
# column 'category' (that implicit drop is deprecated in later pandas
# versions and eventually raises a TypeError).
grocery_df.mean(numeric_only=True)

unit_price     5.8725
stock         90.0000
dtype: float64

<h2 style="color:darkred">Dataframe as a dictionary</h2>

In [25]:
# Like a dict, a DataFrame has keys(): they are the column labels.
grocery_df.keys()

Index(['unit_price', 'category', 'stock'], dtype='object')

In [26]:
# items() yields (column name, column Series) pairs, much like dict.items().
list(grocery_df.items())

[('unit_price',
  coke     2.99
  eggs     5.50
  milk     3.00
  rice    12.00
  Name: unit_price, dtype: float64),
 ('category',
  coke       Beverages
  eggs         Poultry
  milk    Milk Product
  rice          Staple
  Name: category, dtype: object),
 ('stock',
  coke     50
  eggs    200
  milk     10
  rice    100
  Name: stock, dtype: int64)]

In [31]:
# Derive a new column from existing ones.
# Example: total value of the stock held for every item
# (units in stock multiplied by the unit price).

units_in_stock = grocery_df['stock']
grocery_df['Stock_Price'] = units_in_stock.mul(grocery_df['unit_price'])
grocery_df

# We will explore this in detail later.

Unnamed: 0,unit_price,category,stock,Stock_Price
coke,2.99,Beverages,50,149.5
eggs,5.5,Poultry,200,1100.0
milk,3.0,Milk Product,10,30.0
rice,12.0,Staple,100,1200.0


<h2 style="color:darkorange">DataFrame as two-dimensional array</h2>

In [34]:
# .values exposes the raw data of the frame as a 2d array. Bind it to a name
# once: a bare `grocery_df.values` in the middle of a cell is evaluated and
# thrown away, because only the last expression of a cell is displayed.
raw_values = grocery_df.values

# You can use indexing on values: a single integer selects one row of the
# array (position 1 is the 'eggs' row).
raw_values[1]

array([5.5, 'Poultry', 200, 1100.0], dtype=object)

In [35]:
# We can transpose it since it is a 2d matrix: rows and columns swap places.
grocery_df.T

Unnamed: 0,coke,eggs,milk,rice
unit_price,2.99,5.5,3.0,12.0
category,Beverages,Poultry,Milk Product,Staple
stock,50,200,10,100
Stock_Price,149.5,1100.0,30.0,1200.0


In [44]:
# For array style indexing / slicing.
# While we can do grocery_df[0:3], this is confusing. So once again, we will use loc and iloc.
# .loc slices by label on both axes, and the end labels are included.

grocery_df.loc[:'milk', :'category']

Unnamed: 0,unit_price,category
coke,2.99,Beverages
eggs,5.5,Poultry
milk,3.0,Milk Product


In [47]:
# Same data using iloc: slices are positional, and the end position is excluded.

grocery_df.iloc[:3, :2]

Unnamed: 0,unit_price,category
coke,2.99,Beverages
eggs,5.5,Poultry
milk,3.0,Milk Product


In [58]:
# We can run some queries too, using boolean masks:
# keep only the rows whose stock is at least 100.
well_stocked = grocery_df.stock >= 100
grocery_df.loc[well_stocked]

Unnamed: 0,unit_price,category,stock,Stock_Price
eggs,5.5,Poultry,200,1100.0
rice,12.0,Staple,100,1200.0


In [57]:
# We can limit the columns we want to see too: the second argument to .loc
# is the list of column labels to keep.
stock_at_least_100 = grocery_df.stock >= 100
grocery_df.loc[stock_at_least_100, ['stock']]

Unnamed: 0,stock
eggs,200
rice,100
