# Pulock Das
## Software Engineer @ TallyKhata
### Backend Django Developer

#### Colab Session Link: https://tinyurl.com/pds-09-edge

# What is Pandas?
- Pandas is a Python library used for data manipulation and analysis.
- Core data structures:
  - **Series**: A one-dimensional array with labels.
  - **DataFrame**: A two-dimensional labeled table.
- Key Features:
  - Import/export data (CSV, Excel, JSON, SQL).
  - Data cleaning, filtering, and transformation.
  - Grouping and aggregation.
  - High-performance operations on data.

In [2]:
import numpy as np
import pandas as pd

## Pandas Series

In [3]:
import pandas as pd

labels = ['alif', 'hasan', 'jahid', 'manik']
marks =    [70, 33, 65, 92]
series = pd.Series(marks, index=labels)

print(series)

alif     70
hasan    33
jahid    65
manik    92
dtype: int64


In [4]:
# Accessing and modifying the values in a series

print("\nValue at label 'alif':", series['alif'])

series["manik"] +=5
print("\nUpdated Series:")
print(series)

print("\nValues greater than or equal to 30:\n", series[series>=40])


Value at label 'alif': 70

Updated Series:
alif     70
hasan    33
jahid    65
manik    97
dtype: int64

Values greater than or equal to 30:
 alif     70
jahid    65
manik    97
dtype: int64


### Task 1: Create a `Series` marks for your exam(ban,eng,math) And print the series

In [5]:
# Your code here

subjects = ['Bangla', 'English', 'Mathematics']
marks = [90, 85, 88]

# Create a series with the given subjects and marks
marks_series = pd.Series(marks, index=subjects)

# Print the series
print("\nMarks Series:")
print(marks_series)


Marks Series:
Bangla         90
English        85
Mathematics    88
dtype: int64


## Pandas DataFrame

In [6]:
# Creating a Dataframe from a dictionary
data = {
    "Name": ["Alice","Bob","Charlie"],
    "Age": [25,30, 40],
    "City": ["Dhaka","London","Berlin"]
}
rows = ["row1","row2","row3"]
df = pd.DataFrame(data, index=rows)
# print("DataFrame From Dictionary:\n\n", df)
df

Unnamed: 0,Name,Age,City
row1,Alice,25,Dhaka
row2,Bob,30,London
row3,Charlie,40,Berlin


In [7]:
# Creating a DataFrame from a list of lists:
data = [["Alice", 25, "Dhaka", 80.0],
        ["Bob", 15, "London", 60],
        ["Charlie", 35, "Dhaka", 65]]

labels = ["Name", "Age", "City", "Marks"]

rows = ["row1","row2","row3"]

df = pd.DataFrame(data, columns= labels, index=rows)

# print("\nDataFrame from List\n", df)
df

Unnamed: 0,Name,Age,City,Marks
row1,Alice,25,Dhaka,80.0
row2,Bob,15,London,60.0
row3,Charlie,35,Dhaka,65.0


In [8]:
## Check the data types
# print("Data Types:\n", df.dtypes)
df.dtypes

Name      object
Age        int64
City      object
Marks    float64
dtype: object

### Dealing with rows and columns

In [9]:
# Adding a new column
df["Salary"]=[5000,30000,500000]
# print("After Adding Salary column:\n", df)
df

Unnamed: 0,Name,Age,City,Marks,Salary
row1,Alice,25,Dhaka,80.0,5000
row2,Bob,15,London,60.0,30000
row3,Charlie,35,Dhaka,65.0,500000


In [10]:
df.loc['row1', 'Salary'] = 10000
print(df)

         Name  Age    City  Marks  Salary
row1    Alice   25   Dhaka   80.0   10000
row2      Bob   15  London   60.0   30000
row3  Charlie   35   Dhaka   65.0  500000


In [11]:
df.loc['row2', 'Salary'] -= 2000
df

Unnamed: 0,Name,Age,City,Marks,Salary
row1,Alice,25,Dhaka,80.0,10000
row2,Bob,15,London,60.0,28000
row3,Charlie,35,Dhaka,65.0,500000


In [12]:
# Renaming a column
df.rename(columns={"Salary":"Income"}, inplace=True)
print("\nAfter Renaming Colum:\n",df)


After Renaming Colum:
          Name  Age    City  Marks  Income
row1    Alice   25   Dhaka   80.0   10000
row2      Bob   15  London   60.0   28000
row3  Charlie   35   Dhaka   65.0  500000


In [13]:
# Use without inplace=True
new_df = df.rename(columns={"Income":"Salary"})

print(new_df)

print('\n')

print(df)

         Name  Age    City  Marks  Salary
row1    Alice   25   Dhaka   80.0   10000
row2      Bob   15  London   60.0   28000
row3  Charlie   35   Dhaka   65.0  500000


         Name  Age    City  Marks  Income
row1    Alice   25   Dhaka   80.0   10000
row2      Bob   15  London   60.0   28000
row3  Charlie   35   Dhaka   65.0  500000


In [14]:
df.loc['row1', 'Name'] = 'Noyon'
print(df)

         Name  Age    City  Marks  Income
row1    Noyon   25   Dhaka   80.0   10000
row2      Bob   15  London   60.0   28000
row3  Charlie   35   Dhaka   65.0  500000


In [15]:
var = df[df['Name'] == 'Noyon']
print(var)

       Name  Age   City  Marks  Income
row1  Noyon   25  Dhaka   80.0   10000


In [16]:
# Dropping a row
df_dropped = df.drop(index='row2')
print("\nAfter Dropping Row:\n", df_dropped)

print("\n", df)


After Dropping Row:
          Name  Age   City  Marks  Income
row1    Noyon   25  Dhaka   80.0   10000
row3  Charlie   35  Dhaka   65.0  500000

          Name  Age    City  Marks  Income
row1    Noyon   25   Dhaka   80.0   10000
row2      Bob   15  London   60.0   28000
row3  Charlie   35   Dhaka   65.0  500000


In [17]:
# Dropping a column Marks
df_dropped_column = df_dropped.drop(columns=["Marks"])
print("\nAfter Dropping a column\n", df_dropped_column)


After Dropping a column
          Name  Age   City  Income
row1    Noyon   25  Dhaka   10000
row3  Charlie   35  Dhaka  500000


### Indexing and Selecting data

In [18]:
# Source dataframe
print("Source DataFrame:\n\n",df)

# Select multiple rows iloc => i -> integer , loc= location
print("\nSelecting Rows by Index:\n\n",df.iloc[:2])

# # Select rows with labels
print("\nSelecting Rows by Label:\n\n",df.loc[:,["Name","Income"]])

Source DataFrame:

          Name  Age    City  Marks  Income
row1    Noyon   25   Dhaka   80.0   10000
row2      Bob   15  London   60.0   28000
row3  Charlie   35   Dhaka   65.0  500000

Selecting Rows by Index:

        Name  Age    City  Marks  Income
row1  Noyon   25   Dhaka   80.0   10000
row2    Bob   15  London   60.0   28000

Selecting Rows by Label:

          Name  Income
row1    Noyon   10000
row2      Bob   28000
row3  Charlie  500000


In [19]:
print(df["Age"])

row1    25
row2    15
row3    35
Name: Age, dtype: int64


In [20]:
# Filter rows where Age > 25
filtered = df[df["Age"] >= 25]
print("Filtered Rows:\n", filtered)

Filtered Rows:
          Name  Age   City  Marks  Income
row1    Noyon   25  Dhaka   80.0   10000
row3  Charlie   35  Dhaka   65.0  500000


In [21]:
# Multiple conditions
filtered = df[(df["Age"] >= 25) | (df["City"] == "Chittagong")]
print("\nFiltered Rows with Multiple Conditions:\n", filtered)


Filtered Rows with Multiple Conditions:
          Name  Age   City  Marks  Income
row1    Noyon   25  Dhaka   80.0   10000
row3  Charlie   35  Dhaka   65.0  500000


### Task 2

1. Create a python dictionary which has columns; Name, Math Marks, Physics Marks with random values.
  * Example: { 'Name': ['No 1 Shakib Khan', 'Dhakar Pola Jalil', 'Bad boy Maruf'], 'Math Marks': [70, 50, 35], 'Physics Marks': [80, 56, 90] }


2.   Make a dataframe from the dictionary

3.   Print all the students' data who passed in Math

4.   Print all the students' data who got gpa 4.00 (>= 80) in Physics

In [27]:
# Your code here
result = { 'Name': ['No 1 Shakib Khan', 'Dhakar Pola Jalil', 'Bad boy Maruf'],
          'Math Marks': [70, 50, 35],
          'Physics Marks': [80, 56, 90]}

my_df = pd.DataFrame(result)

passed_in_math = my_df[my_df['Math Marks'] >= 40]
# passed_in_math

gpa_4 = my_df[(my_df['Physics Marks'] >= 80)]
gpa_4

Unnamed: 0,Name,Math Marks,Physics Marks
0,No 1 Shakib Khan,70,80
2,Bad boy Maruf,35,90


### Working with real data

In [28]:
! wget https://raw.githubusercontent.com/PulockDas/pd-12-resources/refs/heads/master/data.csv

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [44]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


In [30]:
df.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0


In [31]:
df.tail()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4
168,75,125,150,330.4


In [39]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB
None


In [40]:
import pandas as pd

df.dropna(inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 164 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  164 non-null    int64  
 1   Pulse     164 non-null    int64  
 2   Maxpulse  164 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 6.4 KB


In [34]:
df = pd.read_csv('data.csv')

df.fillna(130, inplace = True)
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 164 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  164 non-null    int64  
 1   Pulse     164 non-null    int64  
 2   Maxpulse  164 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 6.4 KB


In [42]:
df[df['Calories'] == 130]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories


In [43]:
# Show duplicates
print(df[df.duplicated()])

     Duration  Pulse  Maxpulse  Calories
36         60    102       127     300.0
37         60    100       120     300.0
38         60    100       120     300.0
40         45     90       112     180.1
71         60    109       153     387.6
113        45    100       120     225.3
155        60    111       151     368.5


In [None]:
# Remove duplicates
df.drop_duplicates(inplace = True)
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


## Python Regular Expression

In [None]:
import re

Key Functions:
1. **findall**(): Returns a list containing all matches
2. **search**(): Returns a Match object if there is a match anywhere in the string
3. **sub**(): Replaces one or many matches with a string

In [None]:
txt = "The rain in Spain"

#Find all lower case characters alphabetically between "a" and "m":

x = re.findall("[a-m]", txt)
print(x)

['h', 'e', 'a', 'i', 'i', 'a', 'i']


In [None]:
txt = "That will be 59 dollars"

#Find all digit characters:

x = re.findall("\d", txt)
print(x)

['5', '9']


In [None]:
txt = "hello planet"

#Search for a sequence that starts with "he", followed by two (any) characters, and an "o":

x = re.findall("he..o", txt)
print(x)

['hello']


In [None]:
txt = "hello planet"

#Check if the string starts with 'hello':

x = re.search("^hello", txt)
print(x)

<re.Match object; span=(0, 5), match='hello'>


In [None]:
txt = "hello planet"

#Check if the string ends with 'planet':

x = re.findall("planet$", txt)
if x:
  print("Yes, the string ends with 'planet'")
else:
  print("No match")

Yes, the string ends with 'planet'


In [None]:
txt = "The rain in Spain"
x = re.sub("\s", "9", txt, 2)
print(x)

The9rain9in Spain


### Task 3
#### Replace any string 'man' of the following text with 'guy'
* Text: "Hey man! What's up?"