In [1]:
import pandas as pd
import pandera as pa

# Pandas core components
* Series types
* DataFrame types

In [2]:
l1 : list[int] = [1,2,3,4,5,6,7,8,9]
l1

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [3]:
# list we can use for creating series

s1 : pd.Series = pd.Series([1,2,3,4,5])
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
# Set we can use for creating series

s1 : pd.Series = pd.Series({1,2,3,4,5})
s1

TypeError: 'set' type is unordered

In [5]:
# Tuple we can use for creating series

s1 : pd.Series = pd.Series((1,2,3,4,5))
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
# Dictionary we can use for creating series

s1 : pd.Series = pd.Series({"a": 10,
                            "b":20,
                            "c":30,
                            "d":40,
                            "e":50,
                            "f":60,
                            "g":70,})
s1

a    10
b    20
c    30
d    40
e    50
f    60
g    70
dtype: int64

In [7]:
values : list[int] = [1, 2, 3, 4, 5]
index1 : list[str] = ['a', 'b', 'c', 'd', 'e']

s1 : pd.Series = pd.Series(values, index=index1)
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [8]:
values : list[int] = [1, 2, 3, 4, 5]

index1 : list[list[str]] = [['a1', 'a1', 'a1', 'b1', 'b1'],
                            ['a', 'b', 'c', 'd', 'e']]

s1 : pd.Series = pd.Series(values, index=index1)
s1

a1  a    1
    b    2
    c    3
b1  d    4
    e    5
dtype: int64

In [9]:
values : list[int] = [1, 2, 3, 4, 5]

index1 : list[list[str]] = [['a1', 'a1', 'a1', 'b1', 'b1'],
                            ['a', 'b', 'c', 'd', 'e']]

s1 : pd.Series = pd.Series(values, index=index1, name="Student_Data")
s1

a1  a    1
    b    2
    c    3
b1  d    4
    e    5
Name: Student_Data, dtype: int64

In [10]:
import numpy as np
values : list[int] = [1, 2, 3, 4, 5]

index1 : list[list[str]] = [['a1', 'a1', 'a1', 'b1', 'b1'],
                            ['a', 'b', 'c', 'd', 'e']]

s1 : pd.Series = pd.Series(values, 
                           index=index1, 
                           name="Student_Data",
                           dtype=np.int32)
s1

a1  a    1
    b    2
    c    3
b1  d    4
    e    5
Name: Student_Data, dtype: int32

In [11]:
from nptyping import DataFrame, Structure as S

s1 : S["Str"] = pd.Series(['a', 'b', 'c'])

InvalidStructureError: 'Str' is not a valid structure expression.

In [12]:
import pandas as pd
import pandera as pa

# data to validate
df = pd.DataFrame({
    "column1": [1, 4, 0, 10, 9],
    "column2": [-1.3, -1.4, -2.9, -10.1, -20.4],
    "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
})

# define schema
schema = pa.DataFrameSchema({
    "column1": pa.Column(int, checks=pa.Check.le(10)),
    "column2": pa.Column(float, checks=pa.Check.lt(-1.2)),
    "column3": pa.Column(str, checks=[
        pa.Check.str_startswith("value_"),
        # define custom checks as functions that take a series as input and
        # outputs a boolean or boolean Series
        pa.Check(lambda s: s.str.split("_", expand=True).shape[1] == 2)
    ]),
})

validated_df = schema(df)
print(validated_df)

   column1  column2  column3
0        1     -1.3  value_1
1        4     -1.4  value_2
2        0     -2.9  value_3
3       10    -10.1  value_2
4        9    -20.4  value_1


## DataFrame


In [13]:
s1 : pd.Series = pd.Series([1,2,3,4], name="student id")
s2 : pd.Series = pd.Series([10,20,30,40], name="score")
s3 : pd.Series = pd.Series(["John","Robert","David","Micheal"], name="student name")


df1 : pd.DataFrame = pd.DataFrame({"student id":s1, "score":s2, "student name":s3})
df1

Unnamed: 0,student id,score,student name
0,1,10,John
1,2,20,Robert
2,3,30,David
3,4,40,Micheal


In [15]:
s1 : pd.Series = pd.Series([1,2,3,4], name="student id")
s2 : pd.Series = pd.Series([10,20,30,40], name="score")
s3 : pd.Series = pd.Series(["John","Robert","David","Micheal"], name="student name")

# key
# value: iterable
# lenght should be same as lenght of others series
df1 : pd.DataFrame = pd.DataFrame({"student id":s1, "score":s2, "student name":s3})
df1

Unnamed: 0,student id,score,student name
0,1,10,John
1,2,20,Robert
2,3,30,David
3,4,40,Micheal


In [16]:
s1 : pd.Series = pd.Series([1,2,3,4], name="student id")
s2 : pd.Series = pd.Series([10,20,30,40], name="score")
s3 : pd.Series = pd.Series(["John","Robert","David","Micheal"], name="student name")



df1 : pd.DataFrame = pd.DataFrame([s1, s2, s3])
df1

Unnamed: 0,0,1,2,3
student id,1,2,3,4
score,10,20,30,40
student name,John,Robert,David,Micheal


In [17]:
s1 : pd.Series = pd.Series([1,2,3,4], name="student id")
s2 : pd.Series = pd.Series([10,20,30,40], name="score")
s3 : pd.Series = pd.Series(["John","Robert","David","Micheal"], name="student name")


df1 : pd.DataFrame = pd.concat([s1, s2, s3], axis=1)
df1

Unnamed: 0,student id,score,student name
0,1,10,John
1,2,20,Robert
2,3,30,David
3,4,40,Micheal


In [18]:
data : list[list[int]] = [[1,2,3],
                          [4,5,6],
                          [7,8,9]]

df : pd.DataFrame = pd.DataFrame(data)

df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [19]:
data : list[list[int]] = [[1,2,3],
                          [4,5,6],
                          [7,8,9]]

df : pd.DataFrame = pd.DataFrame(data, columns=['A',"B","C"])

df

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


In [20]:
data : list[list[int]] = [[1,2,3],
                          [4,5,6],
                          [7,8,9]]

df : pd.DataFrame = pd.DataFrame(data, columns=['A',"B","C"], index=['x','y','z'])

df

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,7,8,9


In [21]:
df.columns


Index(['A', 'B', 'C'], dtype='object')

In [22]:
df.index


Index(['x', 'y', 'z'], dtype='object')

In [23]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

https://pandera.readthedocs.io/en/stable/index.html

In [28]:
from nptyping import NDArray, Shape, UInt64
from typing import Any


data : NDArray[Shape["10, 10"], Any] = np.arange(10*10).reshape(10, 10)
data

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [29]:
from nptyping import NDArray, Shape, UInt64
from typing import Any


data : NDArray[Shape["10, 10"], Any] = np.arange(10*10).reshape(10, 10)

df : pd.DataFrame = pd.DataFrame(data, columns=list("ABCDEFGHIJ"))
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [30]:
dfl : list[pd.DataFrame] = pd.read_html("https://www.w3schools.com/python/python_operators.asp")
dfl

[  Operator            Name Example    Try it
 0        +        Addition   x + y  Try it »
 1        -     Subtraction   x - y  Try it »
 2        *  Multiplication   x * y  Try it »
 3        /        Division   x / y  Try it »
 4        %         Modulus   x % y  Try it »
 5       **  Exponentiation  x ** y  Try it »
 6       //  Floor division  x // y  Try it »,
    Operator  Example     Same As    Try it
 0         =    x = 5       x = 5  Try it »
 1        +=   x += 3   x = x + 3  Try it »
 2        -=   x -= 3   x = x - 3  Try it »
 3        *=   x *= 3   x = x * 3  Try it »
 4        /=   x /= 3   x = x / 3  Try it »
 5        %=   x %= 3   x = x % 3  Try it »
 6       //=  x //= 3  x = x // 3  Try it »
 7       **=  x **= 3  x = x ** 3  Try it »
 8        &=   x &= 3   x = x & 3  Try it »
 9        |=   x |= 3   x = x | 3  Try it »
 10       ^=   x ^= 3   x = x ^ 3  Try it »
 11      >>=  x >>= 3  x = x >> 3  Try it »
 12      <<=  x <<= 3  x = x << 3  Try it »,
   Operator   

In [31]:
dfl[0]


Unnamed: 0,Operator,Name,Example,Try it
0,+,Addition,x + y,Try it »
1,-,Subtraction,x - y,Try it »
2,*,Multiplication,x * y,Try it »
3,/,Division,x / y,Try it »
4,%,Modulus,x % y,Try it »
5,**,Exponentiation,x ** y,Try it »
6,//,Floor division,x // y,Try it »


In [32]:
df : pd.DataFrame = pd.read_json("https://www.w3schools.com/python/pandas/data.js")
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.4
166,60,115,145,310.2
167,75,120,150,320.4


In [33]:
import pandas as pd
import pandera as pa

# data to validate
df = pd.DataFrame({
    "column1": [1, 4, 0, 10, 9],
    "column2": [-1.3, -1.4, -2.9, -10.1, -20.4],
    "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
})

# define schema
schema = pa.DataFrameSchema({
    "column1": pa.Column(int, checks=pa.Check.le(10)),
    "column2": pa.Column(float, checks=pa.Check.lt(-1.2)),
    "column3": pa.Column(str, checks=[
        pa.Check.str_startswith("value_"),
        # define custom checks as functions that take a series as input and
        # outputs a boolean or boolean Series
        pa.Check(lambda s: s.str.split("_", expand=True).shape[1] == 2)
    ]),
})

validated_df = schema(df)
print(validated_df)


   column1  column2  column3
0        1     -1.3  value_1
1        4     -1.4  value_2
2        0     -2.9  value_3
3       10    -10.1  value_2
4        9    -20.4  value_1


In [34]:
dir(pa.Check)


['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_get_check_fn_code',
 'between',
 'eq',
 'equal_to',
 'equal_to',
 'from_builtin_check_name',
 'ge',
 'get_backend',
 'get_builtin_check_fn',
 'greater_than',
 'greater_than',
 'greater_than_or_equal_to',
 'greater_than_or_equal_to',
 'gt',
 'in_range',
 'in_range',
 'isin',
 'isin',
 'le',
 'less_than',
 'less_than',
 'less_than_or_equal_to',
 'less_than_or_equal_to',
 'lt',
 'ne',
 'not_equal_to',
 'not_equal_to',
 'notin',
 'notin',
 'one_sample_ttest',
 'register_backend',
 'register_builtin_check_fn',
 'str_contains',
 'str_contains',
 'str_endswith',
 'str_endswith',


# Sicling and indexing
* series_variable[index]
* dataFrame
    * loc
    * iloc
    * at
    * iat

In [35]:
s1 : pd.Series = pd.Series([1,2,3,4,5])
display(s1)
print("Applying sliding")
display(s1[1])# index

0    1
1    2
2    3
3    4
4    5
dtype: int64

Applying sliding


2

In [36]:
s1 : pd.Series = pd.Series([1,2,3,4,5])
display(s1)
print("Applying slicing")
display(s1[1:4])# index

0    1
1    2
2    3
3    4
4    5
dtype: int64

Applying sliding


1    2
2    3
3    4
dtype: int64

In [39]:
s1 : pd.Series = pd.Series([1,2,3,4,5])
display(s1)
print("Applying slicing")
display(s1.iloc[1:4])# index location (numbers) same as numpy silcing

0    1
1    2
2    3
3    4
4    5
dtype: int64

Applying slicing


1    2
2    3
3    4
dtype: int64

In [40]:
s1 : pd.Series = pd.Series([1,2,3,4,5], index=['a', 'b', 'c', 'd','e'])
display(s1)
print("Applying slicing")
display(s1.iloc[1:4])# index location (numbers) same as numpy silcing

a    1
b    2
c    3
d    4
e    5
dtype: int64

Applying slicing


b    2
c    3
d    4
dtype: int64

In [41]:
s1 : pd.Series = pd.Series([1,2,3,4,5], index=['a', 'b', 'c', 'd','e'])
display(s1)
print("Applying slicing")
display(s1.loc["a":"d"])# index location (label) end included

a    1
b    2
c    3
d    4
e    5
dtype: int64

Applying slicing


a    1
b    2
c    3
d    4
dtype: int64

In [43]:
s1 : pd.Series = pd.Series([1,2,3,4,5], index=['a', 'b', 'c', 'd','e'])
display(s1)
print("Applying slicing")
display(s1.iat[1])# index location (number) extract one cell value and you can update it

a    1
b    2
c    3
d    4
e    5
dtype: int64

Applying slicing


2

In [44]:
s1 : pd.Series = pd.Series([1,2,3,4,5], index=['a', 'b', 'c', 'd','e'])
display(s1)
print("Applying slicing")
display(s1.at["d"])# index location (label) extract one cell value and you can update it 

a    1
b    2
c    3
d    4
e    5
dtype: int64

Applying slicing


4

https://docs.python.org/3/library/re.html