In [4]:
import pandas as pd
import pandera as pa

# pandas core components
* Series Types
* DataFrames Types

In [10]:
l1 : list[int] = [1,2,3,4,5,6,7,8,9]
l1

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [5]:
# we can use list for creating series

s1 : pd.Series = pd.Series([1, 2, 3, 4, 5])
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [7]:
# we cannot use set  for creating series

s1 : pd.Series = pd.Series({1, 2, 3, 4, 5})
s1

TypeError: 'set' type is unordered

In [9]:
# we can use set  for creating series

s1 : pd.Series = pd.Series((1, 2, 3, 4, 5))
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [11]:
# we can use Dict for creating series

s1 : pd.Series = pd.Series({"a":10,
                            "b":20,
                            "c":30,
                            "d":40,
                            "e":50,
                            "f":60,
                            "g":70,
                            "h":80,
                            "i":90,
                            "j":100,})
s1

a     10
b     20
c     30
d     40
e     50
f     60
g     70
h     80
i     90
j    100
dtype: int64

In [12]:
values : list[int] = [1, 2, 3, 4, 5]
index1 : list[str] = ['a','b','c','d','e']

s1 : pd.Series = pd.Series(values, index=index1)
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [13]:
values : list[int] = [1, 2, 3, 4, 5]
index1 : list[list[str]] = [['a1','a1','a1','b1','b1'],
                            ['a','b','c','d','e']]

s1 : pd.Series = pd.Series(values, index=index1)
s1

a1  a    1
    b    2
    c    3
b1  d    4
    e    5
dtype: int64

In [14]:
values : list[int] = [1, 2, 3, 4, 5]
index1 : list[list[str]] = [['a1','a1','a1','b1','b1'],
                            ['a','b','c','d','e']]

s1 : pd.Series = pd.Series(values, index=index1,name="Student_Data")
s1

a1  a    1
    b    2
    c    3
b1  d    4
    e    5
Name: Student_Data, dtype: int64

In [17]:
import numpy as np
values : list[int] = [1, 2, 3, 4, 5]

index1 : list[list[str]] = [['a1','a1','a1','b1','b1'],
                            ['a','b','c','d','e']]

s1 : pd.Series = pd.Series(values,
                           index=index1,
                           name="Student_Data",
                           dtype=np.Int32)
s1

AttributeError: module 'numpy' has no attribute 'Int32'

In [18]:
import pandas as pd
import pandera as pa

# data to validate
df = pd.DataFrame({
    "column1": [1, 4, 0, 10, 9],
    "column2": [-1.3, -1.4, -2.9, -10.1, -20.4],
    "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
})

# define schema
schema = pa.DataFrameSchema({
    "column1": pa.Column(int, checks=pa.Check.le(10)),
    "column2": pa.Column(float, checks=pa.Check.lt(-1.2)),
    "column3": pa.Column(str, checks=[
        pa.Check.str_startswith("value_"),
        # define custom checks as functions that take a series as input and
        # outputs a boolean or boolean Series
        pa.Check(lambda s: s.str.split("_", expand=True).shape[1] == 2)
    ]),
})

validated_df = schema(df)
print(validated_df)

   column1  column2  column3
0        1     -1.3  value_1
1        4     -1.4  value_2
2        0     -2.9  value_3
3       10    -10.1  value_2
4        9    -20.4  value_1


# Dataframe

In [20]:
s1 : pd.Series = pd.Series([1,2,3,4,5],name="student id")
s2 : pd.Series = pd.Series([10,20,30,40,50],name="score")
s3 : pd.Series = pd.Series(["Hamza","Ali","Junaid","Rashid","Konain"],name="student name")

df1 : pd.DataFrame = pd.DataFrame({"student id":s1,"score":s2,"student name":s3})
df1

Unnamed: 0,student id,score,student name
0,1,10,Hamza
1,2,20,Ali
2,3,30,Junaid
3,4,40,Rashid
4,5,50,Konain


In [4]:
s1 : pd.Series = pd.Series([1,2,3,4,5],name="student id")
s2 : pd.Series = pd.Series([10,20,30,40,50],name="score")
s3 : pd.Series = pd.Series(["Hamza","Ali","Junaid","Rashid","Konain"],name="student name")

df1 : pd.DataFrame = pd.concat([s1,s2,s3],axis=1)
df1

Unnamed: 0,student id,score,student name
0,1,10,Hamza
1,2,20,Ali
2,3,30,Junaid
3,4,40,Rashid
4,5,50,Konain


In [5]:

data : list[list[int]] = [[1,2,3],
                          [4,5,6],
                          [7,8,9]]
df : pd.DataFrame = pd.DataFrame(data)
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [7]:

data : list[list[int]] = [[1,2,3],
                          [4,5,6],
                          [7,8,9]]
df : pd.DataFrame = pd.DataFrame(data, columns=["A","B","C"], index=["x","y","z"])
df

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,7,8,9


In [8]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [9]:
df.index

Index(['x', 'y', 'z'], dtype='object')

In [10]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

In [20]:
from nptyping import NDArray, Shape, UInt32

data : NDArray[Shape["10,10"],UInt32] = np.arange(10*10).reshape(10,10)
data

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [24]:
from nptyping import NDArray, Shape, UInt32

data : NDArray[Shape["10,10"],UInt32] = np.arange(10*10).reshape(10,10)
df : pd.DataFrame = pd.DataFrame(data,columns=list("ABCDEFGHIJ"))
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [26]:
%pip install lxml

dfl : list[pd.DataFrame] = pd.read_html("https://www.w3schools.com/python/python_operators.asp")
dfl

Collecting lxml
  Downloading lxml-5.1.0-cp312-cp312-win_amd64.whl.metadata (3.6 kB)
Downloading lxml-5.1.0-cp312-cp312-win_amd64.whl (3.9 MB)
   ---------------------------------------- 0.0/3.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.9 MB 330.3 kB/s eta 0:00:12
    --------------------------------------- 0.1/3.9 MB 409.6 kB/s eta 0:00:10
    --------------------------------------- 0.1/3.9 MB 438.1 kB/s eta 0:00:09
    --------------------------------------- 0.1/3.9 MB 438.1 kB/s eta 0:00:09
   - -------------------------------------- 0.1/3.9 MB 450.6 kB/s eta 0:00:09
   - -------------------------------------- 0.1/3.9 MB 405.9 kB/s eta 0:00:10
   - -------------------------------------- 0.1/3.9 MB 405.9 kB/s eta 0:00:10
   - -------------------------------------- 0.2/3.9 MB 393.8 kB/s eta 0:00:10
   - ---------------------

[  Operator            Name Example    Try it
 0        +        Addition   x + y  Try it »
 1        -     Subtraction   x - y  Try it »
 2        *  Multiplication   x * y  Try it »
 3        /        Division   x / y  Try it »
 4        %         Modulus   x % y  Try it »
 5       **  Exponentiation  x ** y  Try it »
 6       //  Floor division  x // y  Try it »,
    Operator  Example     Same As    Try it
 0         =    x = 5       x = 5  Try it »
 1        +=   x += 3   x = x + 3  Try it »
 2        -=   x -= 3   x = x - 3  Try it »
 3        *=   x *= 3   x = x * 3  Try it »
 4        /=   x /= 3   x = x / 3  Try it »
 5        %=   x %= 3   x = x % 3  Try it »
 6       //=  x //= 3  x = x // 3  Try it »
 7       **=  x **= 3  x = x ** 3  Try it »
 8        &=   x &= 3   x = x & 3  Try it »
 9        |=   x |= 3   x = x | 3  Try it »
 10       ^=   x ^= 3   x = x ^ 3  Try it »
 11      >>=  x >>= 3  x = x >> 3  Try it »
 12      <<=  x <<= 3  x = x << 3  Try it »,
   Operator   

In [27]:
dfl[0]

Unnamed: 0,Operator,Name,Example,Try it
0,+,Addition,x + y,Try it »
1,-,Subtraction,x - y,Try it »
2,*,Multiplication,x * y,Try it »
3,/,Division,x / y,Try it »
4,%,Modulus,x % y,Try it »
5,**,Exponentiation,x ** y,Try it »
6,//,Floor division,x // y,Try it »


In [28]:
df : pd.DataFrame = pd.read_json("https://www.w3schools.com/python/pandas/data.js")
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.4
166,60,115,145,310.2
167,75,120,150,320.4


In [29]:
import pandas as pd
import pandera as pa

# data to validate
df = pd.DataFrame({
    "column1": [1, 4, 0, 10, 9],
    "column2": [-1.3, -1.4, -2.9, -10.1, -20.4],
    "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
})

# define schema
schema = pa.DataFrameSchema({
    "column1": pa.Column(int, checks=pa.Check.le(10)),
    "column2": pa.Column(float, checks=pa.Check.lt(-1.2)),
    "column3": pa.Column(str, checks=[
        pa.Check.str_startswith("value_"),
        # define custom checks as functions that take a series as input and
        # outputs a boolean or boolean Series
        pa.Check(lambda s: s.str.split("_", expand=True).shape[1] == 2)
    ]),
})

validated_df = schema(df)
print(validated_df)

   column1  column2  column3
0        1     -1.3  value_1
1        4     -1.4  value_2
2        0     -2.9  value_3
3       10    -10.1  value_2
4        9    -20.4  value_1


# Slicing and Indexing
* Series_variable[index]
* dataframe
    * loc
    * iloc
    * at
    * iat

In [31]:
s1 : pd.Series = pd.Series([1,2,3,4,5])
display(s1)
print("Applying Slicing")
display(s1[1]) # index

0    1
1    2
2    3
3    4
4    5
dtype: int64

Applying Slicing


2

In [32]:
s1 : pd.Series = pd.Series([1,2,3,4,5])
display(s1)
print("Applying Slicing")
display(s1[1:4]) # index

0    1
1    2
2    3
3    4
4    5
dtype: int64

Applying Slicing


1    2
2    3
3    4
dtype: int64

In [34]:
s1 : pd.Series = pd.Series([1,2,3,4,5])
display(s1)
print("Applying Slicing")
display(s1.iloc[1:4]) # index location always pass numbers same as numpy slicing

0    1
1    2
2    3
3    4
4    5
dtype: int64

Applying Slicing


1    2
2    3
3    4
dtype: int64

In [35]:
s1 : pd.Series = pd.Series([1,2,3,4,5], index=['a','b','c','d','e'])
display(s1)
print("Applying Slicing")
display(s1.iloc[1:4]) # index location always pass numbers same as numpy slicing

a    1
b    2
c    3
d    4
e    5
dtype: int64

Applying Slicing


b    2
c    3
d    4
dtype: int64

In [36]:
s1 : pd.Series = pd.Series([1,2,3,4,5], index=['a','b','c','d','e'])
display(s1)
print("Applying Slicing")
display(s1.loc["a":"d"]) # index location (label) end included

a    1
b    2
c    3
d    4
e    5
dtype: int64

Applying Slicing


a    1
b    2
c    3
d    4
dtype: int64

In [37]:
s1 : pd.Series = pd.Series([1,2,3,4,5], index=['a','b','c','d','e'])
display(s1)
print("Applying Slicing")
display(s1.iat[1]) # index location (number) extract one cell value and you can update it

a    1
b    2
c    3
d    4
e    5
dtype: int64

Applying Slicing


2

In [38]:
s1 : pd.Series = pd.Series([1,2,3,4,5], index=['a','b','c','d','e'])
display(s1)
print("Applying Slicing")
display(s1.at["d"]) # index location (label) extract one cell value and you can update it

a    1
b    2
c    3
d    4
e    5
dtype: int64

Applying Slicing


4

# Regex
                 time <-    regex       
* Time example 12:45:32   \d{2}: \d{2}: \d{2}
* name example zain         (.*) # any character other than backslash n
* space and minus sign optional     ? ?
* number example 1234   \d{5,6} min length 5 and max length 6

In [43]:
x : str = """
PIAIC	zain	1233
PIAIC	hsge	3455
PIAIC	sjhe	6677
PIAIC	lskje	87665
PIAIC	sjhehe	4433
PIAIC	siugda	676879
PIAIC	ndbdaq	854
PIAIC	pljs	324556
PIAIC	magdguw	4335

"""

import re

patterns : str = """
PIAIC	(.*)  ?	\d{5,6}
"""

data : list[list[str]] = re.findall(patterns, x)
data

  patterns : str = """


[]