# Data Science Process
1. Define the Problem
2. Data Collection
3. EDA (Exploratory Data Analysis)
4. Data Cleaning and Pre-Processing
5. Feature Engineering
6. Modelling
7. Evaluation
8. Deployment

# Exploratory Data Analysis 
1. Load and Understand the structure of the data
2. Clean the data
3. Get the statistical summaries
4. Data Visualization

<b>Pandas is used for EDA.</b>
<b>Matplotlib and Seaborn are used for Data Visualization.</b>

<b>Pandas</b>: Data analysis and manipulation tool.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# pandas provides two core containers:
#   Series    -> one-dimensional labelled array
#   DataFrame -> two-dimensional labelled array (like a table)

# Each dictionary key becomes a column header; each list supplies that column's values.
info = dict(
    Name=["Gojo", "Toji", "Adi"],
    Marks=[95, 78, 99],
)

# No index is passed, so the rows get the default labels 0, 1 and 2.
df = pd.DataFrame(data=info)
print(df)

   Name  Marks
0  Gojo     95
1  Toji     78
2   Adi     99


In [3]:
# The last expression of a cell is displayed automatically, so a bare `df` shows the table without print().
df

Unnamed: 0,Name,Marks
0,Gojo,95
1,Toji,78
2,Adi,99


In [4]:
# Series: a one-dimensional labelled array
s = pd.Series(data=[1, 2, 3, 4, 5])
print(s, type(s), sep="\n")

# The default labels are 0..4, so s[0] and s[2] fetch the first and third values.
print(s[0], s[2], sep="\n")

# Labels do not have to be these default integers; custom labels can be supplied too.

0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>
1
3


In [5]:
# Custom labels: pass them through `index`, one label per value
s = pd.Series(
    data=[22, 23, 24, 25, 26],
    index=["Adi", "Gojo", "Toji", "Nanami", "Sukuna"],
)
print(s)

# Values are looked up by their labels
print(s["Adi"], s["Gojo"], sep="\n")

# All the labels are available on .index
print(s.index)

Adi       22
Gojo      23
Toji      24
Nanami    25
Sukuna    26
dtype: int64
22
23
Index(['Adi', 'Gojo', 'Toji', 'Nanami', 'Sukuna'], dtype='object')


# Properties of Series:
- Homogeneous
- Vectorized Operations
- Handle Missing values with NaN
- Mutable values, immutable size

In [6]:
s1 = pd.Series(data=[1, 2, 3, 4, 5])
s2 = pd.Series(data=[10, 20, 30, 40, 50])

# Vectorized operation: values are added label by label without an explicit loop.
print(s1 + s2)

# Mutable values: the entry labelled 0 is overwritten in place.
s1[0] = 100
# Immutable size: drop() returns a new, shorter Series; s1 keeps all five entries.
changed_s1 = s1.drop(labels=0)

print(s1, changed_s1, sep="\n")

0    11
1    22
2    33
3    44
4    55
dtype: int64
0    100
1      2
2      3
3      4
4      5
dtype: int64
1    2
2    3
3    4
4    5
dtype: int64


In [7]:
# DataFrame: a two-dimensional labelled array, built here from a dict of columns
info = dict(
    Name=["Adam", "Eve", "Bob"],
    Age=[23, 24, 25],
    GPA=[9.5, 8.6, 7.2],
)

df = pd.DataFrame(data=info)
# Show the table first, then its row labels.
print(df, df.index, sep="\n")

   Name  Age  GPA
0  Adam   23  9.5
1   Eve   24  8.6
2   Bob   25  7.2
RangeIndex(start=0, stop=3, step=1)


In [8]:
# DataFrame from a list of lists: each inner list is one row, `columns` names the fields
df = pd.DataFrame(
    data=[
        ["Adam", 23],
        ["Bob", 23],
        ["Eve", 23],
    ],
    columns=["Name", "Age"],
)
print(df)

   Name  Age
0  Adam   23
1   Bob   23
2   Eve   23


In [9]:
# Using NumPy arrays to create a DataFrame
# (NumPy is imported as `np` in the imports cell at the top of the notebook, so every
# dependency is visible in one place and the notebook survives Restart & Run All.)

np_arr = np.array([[1, 2, 3], [4, 5, 6]])

# Each row of the 2-D array becomes one row of the frame; `columns` names the three fields.
df = pd.DataFrame(np_arr, columns=["A", "B", "C"])
print(df)

   A  B  C
0  1  2  3
1  4  5  6


In [10]:
# Reading a CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="employee_data.csv")
print(df, type(df))

   ID     Name  Age Department  Salary
0   1    Alice   25         HR   55000
1   2      Bob   32         IT   72000
2   3  Charlie   28    Finance   48000
3   4    David   45  Marketing   91000
4   5      Eva   38         IT   65000
5   6    Frank   29    Finance   50000
6   7    Grace   41         HR   82000
7   8   Hannah   26  Marketing   47000
8   9      Ian   35         IT   75000
9  10    Julia   30    Finance   60000 <class 'pandas.core.frame.DataFrame'>


In [11]:
# Reading a JSON file into a DataFrame
df = pd.read_json(path_or_buf="employee_data.json")
print(df, type(df))

   ID     Name  Age Department  Salary
0   1    Alice   25         HR   55000
1   2      Bob   32         IT   72000
2   3  Charlie   28    Finance   48000
3   4    David   45  Marketing   91000
4   5      Eva   38         IT   65000
5   6    Frank   29    Finance   50000
6   7    Grace   41         HR   82000
7   8   Hannah   26  Marketing   47000
8   9      Ian   35         IT   75000
9  10    Julia   30    Finance   60000 <class 'pandas.core.frame.DataFrame'>


# Pandas (DataFrame Methods)
- df.head()
- df.tail()
- df.sample()
- df.info()
- df.shape
- df.describe()
- df.columns
- df.nunique()

In [12]:
# Load the employee data used by the DataFrame-method examples
df = pd.read_json(path_or_buf="employee_data.json")
print(df, type(df))

   ID     Name  Age Department  Salary
0   1    Alice   25         HR   55000
1   2      Bob   32         IT   72000
2   3  Charlie   28    Finance   48000
3   4    David   45  Marketing   91000
4   5      Eva   38         IT   65000
5   6    Frank   29    Finance   50000
6   7    Grace   41         HR   82000
7   8   Hannah   26  Marketing   47000
8   9      Ian   35         IT   75000
9  10    Julia   30    Finance   60000 <class 'pandas.core.frame.DataFrame'>


In [13]:
# Methods
# NOTE: only the last expression of a cell is displayed automatically, so the calls below
# do not show their results here. The exception is df.info(), which prints its summary itself.

df.head() # First 5 rows
df.tail() # Last 5 rows

df.sample() # One randomly chosen row from the table

df.info() # Prints a summary of the data: index range, columns, non-null counts, dtypes, memory usage

df.shape # Tuple of (number of rows, number of columns)

df.describe() # Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns

df.columns # All the column labels of the table

df.nunique() # Number of distinct values in each column (not unique rows)
# Sample result of df.nunique(), kept as a string literal. Being the cell's last
# expression, this string is what the cell actually displays.
'''
ID            10
Name          10
Age           10
Department     4
Salary        10
dtype: int64
'''

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          10 non-null     int64 
 1   Name        10 non-null     object
 2   Age         10 non-null     int64 
 3   Department  10 non-null     object
 4   Salary      10 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 532.0+ bytes


'\nID            10\nName          10\nAge           10\nDepartment     4\nSalary        10\ndtype: int64\n'

In [14]:
# Display the employee frame loaded from employee_data.json
df

Unnamed: 0,ID,Name,Age,Department,Salary
0,1,Alice,25,HR,55000
1,2,Bob,32,IT,72000
2,3,Charlie,28,Finance,48000
3,4,David,45,Marketing,91000
4,5,Eva,38,IT,65000
5,6,Frank,29,Finance,50000
6,7,Grace,41,HR,82000
7,8,Hannah,26,Marketing,47000
8,9,Ian,35,IT,75000
9,10,Julia,30,Finance,60000


In [15]:
# Air-quality (AQI) dataset: load the CSV, then preview the frame
df = pd.read_csv(filepath_or_buffer="globalAirQuality.csv")
df

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
0,2025-11-04 18:25:17.554219,US,New York,40.713,-74.006,50.295,108.938,27.998,6.539,52.568,1.096,108,18.504,70.168,3.725
1,2025-11-04 19:25:17.554219,US,New York,40.713,-74.006,32.083,63.043,36.120,4.021,43.536,1.075,90,5.838,80.088,8.969
2,2025-11-04 20:25:17.554219,US,New York,40.713,-74.006,42.250,82.553,26.935,9.538,23.320,0.977,84,31.833,62.783,9.650
3,2025-11-04 21:25:17.554219,US,New York,40.713,-74.006,30.403,79.951,63.536,7.609,31.369,0.230,158,23.140,89.153,8.956
4,2025-11-04 22:25:17.554219,US,New York,40.713,-74.006,21.083,66.423,38.997,6.919,45.615,1.085,97,13.632,76.499,4.017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,2025-11-19 13:25:17.554219,CH,Zurich,47.377,8.542,27.899,74.179,41.474,6.677,50.869,1.028,103,7.079,52.443,7.452
17996,2025-11-19 14:25:17.554219,CH,Zurich,47.377,8.542,2.950,47.988,42.235,2.821,35.551,0.644,105,28.734,85.678,4.496
17997,2025-11-19 15:25:17.554219,CH,Zurich,47.377,8.542,61.347,72.908,46.976,5.763,66.492,0.947,122,21.951,72.311,9.660
17998,2025-11-19 16:25:17.554219,CH,Zurich,47.377,8.542,40.722,95.152,32.957,5.524,53.193,0.868,95,24.042,31.880,2.642


In [16]:
# First five rows of the air-quality data
df.head()

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
0,2025-11-04 18:25:17.554219,US,New York,40.713,-74.006,50.295,108.938,27.998,6.539,52.568,1.096,108,18.504,70.168,3.725
1,2025-11-04 19:25:17.554219,US,New York,40.713,-74.006,32.083,63.043,36.12,4.021,43.536,1.075,90,5.838,80.088,8.969
2,2025-11-04 20:25:17.554219,US,New York,40.713,-74.006,42.25,82.553,26.935,9.538,23.32,0.977,84,31.833,62.783,9.65
3,2025-11-04 21:25:17.554219,US,New York,40.713,-74.006,30.403,79.951,63.536,7.609,31.369,0.23,158,23.14,89.153,8.956
4,2025-11-04 22:25:17.554219,US,New York,40.713,-74.006,21.083,66.423,38.997,6.919,45.615,1.085,97,13.632,76.499,4.017


In [17]:
# Last five rows of the air-quality data
df.tail()

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
17995,2025-11-19 13:25:17.554219,CH,Zurich,47.377,8.542,27.899,74.179,41.474,6.677,50.869,1.028,103,7.079,52.443,7.452
17996,2025-11-19 14:25:17.554219,CH,Zurich,47.377,8.542,2.95,47.988,42.235,2.821,35.551,0.644,105,28.734,85.678,4.496
17997,2025-11-19 15:25:17.554219,CH,Zurich,47.377,8.542,61.347,72.908,46.976,5.763,66.492,0.947,122,21.951,72.311,9.66
17998,2025-11-19 16:25:17.554219,CH,Zurich,47.377,8.542,40.722,95.152,32.957,5.524,53.193,0.868,95,24.042,31.88,2.642
17999,2025-11-19 17:25:17.554219,CH,Zurich,47.377,8.542,25.83,30.411,35.317,4.336,66.246,0.848,88,8.529,59.104,4.403


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
count,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0
mean,23.06598,37.65556,40.369131,70.152228,32.055176,6.035508,48.0651,0.800595,104.645556,21.510251,57.714351,5.28391
std,26.156536,78.600701,17.64745,24.99944,13.82068,2.45479,14.950849,0.250254,25.61607,9.509444,18.844908,2.741712
min,-37.814,-123.121,0.025,0.061,0.013,0.003,0.114,0.0,16.0,5.0,25.002,0.5
25%,12.972,2.352,27.9045,53.1255,22.3625,4.36075,38.0285,0.633,87.0,13.35775,41.32,2.937
50%,29.232,42.146,40.2865,69.961,32.0195,6.026,48.142,0.8005,103.0,21.4555,57.847,5.297
75%,41.008,103.82,52.43625,87.2565,41.36425,7.71525,58.2585,0.969,121.0,29.68825,74.23475,7.662
max,60.17,174.763,115.683,161.81,90.019,16.559,103.016,1.832,231.0,37.998,89.997,9.999


In [19]:
# Selecting Data

# Column-Wise Selection

# Single Column: one label inside the brackets returns that column as a Series
# (not displayed here, because it is not the last expression of the cell)
df["city"]

# Multiple Columns: a list of labels inside the brackets returns a DataFrame with just those columns
df[["city", "aqi"]]


Unnamed: 0,city,aqi
0,New York,108
1,New York,90
2,New York,84
3,New York,158
4,New York,97
...,...,...
17995,Zurich,103
17996,Zurich,105
17997,Zurich,122
17998,Zurich,95


# Selecting Data 
- Select Columns
- Select Rows (Label and Index Based) - loc & iloc
- Select Rows and Columns
- Select single scalar value - at & iat

In [20]:
# Getting the first row: loc looks up the row whose index label is 0 and returns it as a Series
df.loc[0]

timestamp      2025-11-04 18:25:17.554219
country                                US
city                             New York
latitude                           40.713
longitude                         -74.006
pm25                               50.295
pm10                              108.938
no2                                27.998
so2                                 6.539
o3                                 52.568
co                                  1.096
aqi                                   108
temperature                        18.504
humidity                           70.168
wind_speed                          3.725
Name: 0, dtype: object

In [21]:
# Label slice with loc: the end label is included, so rows 0, 1 and 2 are returned
df.loc[0:2] # Starting label : Ending label (inclusive)

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
0,2025-11-04 18:25:17.554219,US,New York,40.713,-74.006,50.295,108.938,27.998,6.539,52.568,1.096,108,18.504,70.168,3.725
1,2025-11-04 19:25:17.554219,US,New York,40.713,-74.006,32.083,63.043,36.12,4.021,43.536,1.075,90,5.838,80.088,8.969
2,2025-11-04 20:25:17.554219,US,New York,40.713,-74.006,42.25,82.553,26.935,9.538,23.32,0.977,84,31.833,62.783,9.65


In [22]:
# loc selects data by label
# iloc selects by integer position
df.loc[2] # Row labelled 2 (not displayed; only the last expression of the cell is shown)
df.iloc[0:2] # Position slice: end index is exclusive, so only positions 0 and 1 are returned

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
0,2025-11-04 18:25:17.554219,US,New York,40.713,-74.006,50.295,108.938,27.998,6.539,52.568,1.096,108,18.504,70.168,3.725
1,2025-11-04 19:25:17.554219,US,New York,40.713,-74.006,32.083,63.043,36.12,4.021,43.536,1.075,90,5.838,80.088,8.969


In [23]:
# Individual Cells
df.loc[0, "aqi"] # Single cell: row label 0, column "aqi" (not displayed; not the last expression)
# Rows and columns together: row labels 0 to 2 (inclusive) and the listed column names.
# iloc cannot take column names like these, because it only accepts integer positions.
df.loc[0:2, ["country", "city", "latitude", "longitude"]] # Cannot use this for iloc as it requires a numeric index
# iloc version (row positions 0-2 and column positions 1-4; both end bounds are exclusive)
# df.iloc[0:3, 1:5] 

Unnamed: 0,country,city,latitude,longitude
0,US,New York,40.713,-74.006
1,US,New York,40.713,-74.006
2,US,New York,40.713,-74.006


In [24]:
# Select single scalar value - at & iat
df.at[0, "city"] # Label-based: row label 0, column "city" (not displayed; not the last expression)
df.iat[0, 2] # Position-based: row position 0, column position 2 (the "city" column in this dataset)

'New York'

In [25]:
# Filtering of Data

# Boolean-mask filtering: keep only the rows where both conditions are True.
# Each comparison is wrapped in parentheses because & binds more tightly than > in Python.
df[(df["aqi"] > 100) & (df["temperature"] > 30)]

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
6,2025-11-05 00:25:17.554219,US,New York,40.713,-74.006,77.690,65.198,20.302,7.641,62.687,0.734,155,36.729,47.651,4.542
7,2025-11-05 01:25:17.554219,US,New York,40.713,-74.006,57.816,111.709,34.533,6.945,41.304,0.771,115,37.891,53.314,7.605
14,2025-11-05 08:25:17.554219,US,New York,40.713,-74.006,38.815,121.394,25.969,4.112,44.730,1.124,121,34.481,81.790,3.576
17,2025-11-05 11:25:17.554219,US,New York,40.713,-74.006,63.552,57.838,13.754,4.427,42.251,0.997,127,36.193,62.906,7.097
25,2025-11-05 19:25:17.554219,US,New York,40.713,-74.006,56.954,75.045,29.837,9.541,54.198,1.170,113,31.289,43.279,5.264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17947,2025-11-17 13:25:17.554219,CH,Zurich,47.377,8.542,53.938,75.183,16.255,7.263,67.244,0.457,107,32.584,81.544,7.101
17950,2025-11-17 16:25:17.554219,CH,Zurich,47.377,8.542,70.255,60.620,32.081,2.065,47.803,0.852,140,36.885,34.173,1.748
17975,2025-11-18 17:25:17.554219,CH,Zurich,47.377,8.542,33.406,98.160,44.654,0.807,45.160,0.737,111,30.761,76.073,3.570
17976,2025-11-18 18:25:17.554219,CH,Zurich,47.377,8.542,73.493,89.558,9.313,4.768,51.505,0.745,146,30.909,81.573,1.024


In [26]:
# Filter the rows with aqi above 100, then keep only the "city" and "aqi" columns
df[ df["aqi"] > 100][["city", "aqi"]]

Unnamed: 0,city,aqi
0,New York,108
3,New York,158
6,New York,155
7,New York,115
8,New York,121
...,...,...
17991,Zurich,104
17993,Zurich,153
17995,Zurich,103
17996,Zurich,105


In [27]:
# Rows with aqi above 100, restricted to the city and aqi columns (one .loc call: row mask, column list)
aqi_data = df.loc[df["aqi"] > 100, ["city", "aqi"]]

# First row of the filtered result, selected by position
aqi_data.iloc[0]

city    New York
aqi          108
Name: 0, dtype: object

# Query Method
- Expression as a String
- Column based reference
- Backticks for col_name with space or special chars
- Operators (&, |, -, >=, >, <=, <, ==, etc)
- @ to reference Python vars
- Chained Comparisons

In [28]:
# Query (Returns a copy and not a view)
# The filter is written as a string and column names are used directly inside it.
# Inside query(), & is evaluated with the precedence of `and`, so the comparisons
# do not need the parentheses that boolean-mask filtering requires.
df.query("aqi > 100 & temperature > 30")

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
6,2025-11-05 00:25:17.554219,US,New York,40.713,-74.006,77.690,65.198,20.302,7.641,62.687,0.734,155,36.729,47.651,4.542
7,2025-11-05 01:25:17.554219,US,New York,40.713,-74.006,57.816,111.709,34.533,6.945,41.304,0.771,115,37.891,53.314,7.605
14,2025-11-05 08:25:17.554219,US,New York,40.713,-74.006,38.815,121.394,25.969,4.112,44.730,1.124,121,34.481,81.790,3.576
17,2025-11-05 11:25:17.554219,US,New York,40.713,-74.006,63.552,57.838,13.754,4.427,42.251,0.997,127,36.193,62.906,7.097
25,2025-11-05 19:25:17.554219,US,New York,40.713,-74.006,56.954,75.045,29.837,9.541,54.198,1.170,113,31.289,43.279,5.264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17947,2025-11-17 13:25:17.554219,CH,Zurich,47.377,8.542,53.938,75.183,16.255,7.263,67.244,0.457,107,32.584,81.544,7.101
17950,2025-11-17 16:25:17.554219,CH,Zurich,47.377,8.542,70.255,60.620,32.081,2.065,47.803,0.852,140,36.885,34.173,1.748
17975,2025-11-18 17:25:17.554219,CH,Zurich,47.377,8.542,33.406,98.160,44.654,0.807,45.160,0.737,111,30.761,76.073,3.570
17976,2025-11-18 18:25:17.554219,CH,Zurich,47.377,8.542,73.493,89.558,9.313,4.768,51.505,0.745,146,30.909,81.573,1.024


In [29]:
# With Chaining: filter the rows with query(), then select the "city" and "aqi" columns from the result
df.query("aqi > 100 & temperature > 30")[["city", "aqi"]]

Unnamed: 0,city,aqi
6,New York,155
7,New York,115
14,New York,121
17,New York,127
25,New York,113
...,...,...
17947,Zurich,107
17950,Zurich,140
17975,Zurich,111
17976,Zurich,146


In [30]:
# Using Variables 
# Prefixing a name with @ inside the query string refers to the Python variable of that name.
aqi_value = 100
df.query("aqi > @aqi_value & temperature > 30")[["city", "aqi"]]

Unnamed: 0,city,aqi
6,New York,155
7,New York,115
14,New York,121
17,New York,127
25,New York,113
...,...,...
17947,Zurich,107
17950,Zurich,140
17975,Zurich,111
17976,Zurich,146


# Cleaning Data (Handle Missing Values)
- isnull() / isna()
- isnull().sum()
- dropna()
- fillna(value)
- ffill()
- bfill()

In [31]:
# Clean Data - Missing Values: load the raw data first
df = pd.read_csv(filepath_or_buffer="raw_data.csv")

# isnull() gives a frame of booleans with the same shape, True wherever a value is missing
df.isnull()

Unnamed: 0,id,name,age,country,gender,income
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,True,False,False,False
4,False,False,False,False,False,True
5,False,False,False,False,False,False
6,False,True,False,False,False,False
7,False,False,False,True,False,False
8,False,False,False,False,False,False
9,False,False,True,False,False,False


In [32]:
# Count the number of missing values per column by chaining:
# isnull() marks the missing cells as True, and sum() adds up the True values column by column
df.isnull().sum()

id         0
name       1
age        3
country    1
gender     1
income     1
dtype: int64

In [33]:
# dropna() removes every row that contains at least one missing value.
# It returns a new frame and leaves df unchanged; the result is not displayed here
# because it is not the last expression of the cell.
df.dropna()

# To drop columns instead (change the axis): every column containing a missing value is removed
df.dropna(axis = 1)


Unnamed: 0,id
0,1
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [34]:
# fillna(value) returns a copy with the missing values replaced; df itself is left untouched.
df.fillna(value=0)  # every missing value in the frame becomes 0

# The same, restricted to a single column
df["age"].fillna(value=0)

# Mean imputation: replace the missing ages with the mean of the known ages
age_mean = df["age"].mean()
cleaned_data = df.assign(age=df["age"].fillna(value=age_mean))

cleaned_data

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,32.75,Canada,Female,62000.0
3,3,Alex,32.75,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,32.75,Mexico,Male,45000.0


In [35]:
# ffill() --> Forward Fill: each missing value takes the last valid value above it in the same column
df.ffill()

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,29.0,Canada,Female,62000.0
3,3,Alex,29.0,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,47000.0
5,5,Li Wei,27.0,China,Male,51000.0
6,6,Li Wei,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,India,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,29.0,Mexico,Male,45000.0


In [36]:
# bfill() --> Backward Fill: each missing value takes the next valid value below it in the same column
df.bfill()

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,34.0,Canada,Female,62000.0
3,3,Alex,34.0,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,51000.0
5,5,Li Wei,27.0,China,Male,51000.0
6,6,Ahmed Khan,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,USA,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,31.0,Mexico,Male,45000.0


# Handle Duplicate Values
- duplicated()
- drop_duplicates()

In [39]:
# Handle Duplicates

# Check duplicate values: True marks a row that repeats an earlier row
# (not displayed here, because it is not the last expression of the cell).
df.duplicated()

# Drop duplicates: the first occurrence of each row is kept.
# drop_duplicates() returns a new frame, so the result is assigned back to df.
# This replaces inplace=True, which modifies the frame silently and returns None.
df = df.drop_duplicates()

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,,Canada,Female,62000.0
3,3,Alex,,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0
10,10,Emily Davis,31.0,USA,,58000.0


In [40]:
# Display the frame after the duplicate rows were dropped
df

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,,Canada,Female,62000.0
3,3,Alex,,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0
10,10,Emily Davis,31.0,USA,,58000.0


# Handle Data Types and date-time
- dtypes
- astype(newType)
- to_datetime()

In [41]:
# Data Types

# dtypes lists the data type of every column; text columns are reported as "object" here
df.dtypes  # Datatypes of the columns

id           int64
name        object
age        float64
country     object
gender      object
income     float64
dtype: object

In [47]:
# Changing a column's data type with astype()

# An int64 column cannot hold NaN, so the missing ages are replaced with 0 first;
# astype() then returns the converted values as a new Series.
df2 = df["age"].fillna(0).astype("int64")
df2.dtypes

dtype('int64')

In [51]:
# A date written as text is stored as an ordinary string value
date_str = pd.Series(data="2026-12-31")
type(date_str.dtypes)

# pd.to_datetime() parses the text into a timestamp, which gives the Series a datetime dtype
date_str = pd.Series(data=[pd.to_datetime("2026-12-31")])
type(date_str.dtypes)

numpy.dtypes.DateTime64DType

# Handle Strings 
- .str.lower(), .str.upper() & .str.capitalize()
- .str.strip()
- .str.strip(chars)
- .str.contains(val, case = False)