## Create a Sample Data Set

In [2]:
import pandas as pd
import numpy as np

data = {
    "Name": ["Alice ", "Bob", "Charlie", "Alice ", "Eve", None],
    "Age": [25, 30, np.nan, 25, 22, 29],
    "City": ["New-York", "Paris", None, "New-York", "Paris", "Paris"],
    "Date": ["2023-01-01", "2023-02-15", "2023-03-10", "2023-01-01", "2023-04-12", None]
}

df = pd.DataFrame(data)
print(df)

      Name   Age      City        Date
0   Alice   25.0  New-York  2023-01-01
1      Bob  30.0     Paris  2023-02-15
2  Charlie   NaN      None  2023-03-10
3   Alice   25.0  New-York  2023-01-01
4      Eve  22.0     Paris  2023-04-12
5     None  29.0     Paris        None


## Dataset Overview

In [3]:
df.shape

(6, 4)

In [4]:
df.head()

Unnamed: 0,Name,Age,City,Date
0,Alice,25.0,New-York,2023-01-01
1,Bob,30.0,Paris,2023-02-15
2,Charlie,,,2023-03-10
3,Alice,25.0,New-York,2023-01-01
4,Eve,22.0,Paris,2023-04-12


In [5]:
df.tail()

Unnamed: 0,Name,Age,City,Date
1,Bob,30.0,Paris,2023-02-15
2,Charlie,,,2023-03-10
3,Alice,25.0,New-York,2023-01-01
4,Eve,22.0,Paris,2023-04-12
5,,29.0,Paris,


In [6]:
df.sample(3,random_state=42)

Unnamed: 0,Name,Age,City,Date
0,Alice,25.0,New-York,2023-01-01
1,Bob,30.0,Paris,2023-02-15
5,,29.0,Paris,


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     5 non-null      float64
 2   City    5 non-null      object 
 3   Date    5 non-null      object 
dtypes: float64(1), object(3)
memory usage: 324.0+ bytes


In [8]:
df.describe()

Unnamed: 0,Age
count,5.0
mean,26.2
std,3.271085
min,22.0
25%,25.0
50%,25.0
75%,29.0
max,30.0


In [9]:
df.columns

Index(['Name', 'Age', 'City', 'Date'], dtype='object')

In [10]:
df.nunique()

Unnamed: 0,0
Name,4
Age,4
City,2
Date,4


## Missing Values

In [11]:
df.isnull().sum()

Unnamed: 0,0
Name,1
Age,1
City,1
Date,1


In [12]:
df.dropna() #removes rows with NaN

Unnamed: 0,Name,Age,City,Date
0,Alice,25.0,New-York,2023-01-01
1,Bob,30.0,Paris,2023-02-15
3,Alice,25.0,New-York,2023-01-01
4,Eve,22.0,Paris,2023-04-12


In [13]:
df.fillna(0) #Replaces NaN with 0

Unnamed: 0,Name,Age,City,Date
0,Alice,25.0,New-York,2023-01-01
1,Bob,30.0,Paris,2023-02-15
2,Charlie,0.0,0,2023-03-10
3,Alice,25.0,New-York,2023-01-01
4,Eve,22.0,Paris,2023-04-12
5,0,29.0,Paris,0


In [14]:
df['Age'].fillna(df['Age'].mean())

Unnamed: 0,Age
0,25.0
1,30.0
2,26.2
3,25.0
4,22.0
5,29.0


In [15]:
df['City'].fillna(df['City'].mode()[0])

Unnamed: 0,City
0,New-York
1,Paris
2,Paris
3,New-York
4,Paris
5,Paris


In [16]:
df.interpolate() # Fills numeric NaNs by linear interpolation


#Use fillna(mean/median/mode) for general datasets.
#Use interpolate() when data has a natural progression (time, numeric series).By default, it uses linear interpolation (straight line between points).
# Example of interpolate() : If you have temperature readings for 1 PM (25°C) and 3 PM (27°C), but missing 2 PM → interpolation will fill ~26°C.

  df.interpolate() # Fills numeric NaNs by linear interpolation


Unnamed: 0,Name,Age,City,Date
0,Alice,25.0,New-York,2023-01-01
1,Bob,30.0,Paris,2023-02-15
2,Charlie,27.5,,2023-03-10
3,Alice,25.0,New-York,2023-01-01
4,Eve,22.0,Paris,2023-04-12
5,,29.0,Paris,


## Dublicates

In [17]:
df.duplicated().sum()

np.int64(1)

In [18]:
df

Unnamed: 0,Name,Age,City,Date
0,Alice,25.0,New-York,2023-01-01
1,Bob,30.0,Paris,2023-02-15
2,Charlie,,,2023-03-10
3,Alice,25.0,New-York,2023-01-01
4,Eve,22.0,Paris,2023-04-12
5,,29.0,Paris,


In [19]:
df.drop_duplicates()

Unnamed: 0,Name,Age,City,Date
0,Alice,25.0,New-York,2023-01-01
1,Bob,30.0,Paris,2023-02-15
2,Charlie,,,2023-03-10
4,Eve,22.0,Paris,2023-04-12
5,,29.0,Paris,


## Data type and Conversion

In [20]:
df.dtypes

Unnamed: 0,0
Name,object
Age,float64
City,object
Date,object


In [21]:
pd.to_numeric(df['Age'],errors='coerce').astype('Int64')

Unnamed: 0,Age
0,25.0
1,30.0
2,
3,25.0
4,22.0
5,29.0


In [22]:
pd.to_datetime(df['Date'],errors='coerce')

Unnamed: 0,Date
0,2023-01-01
1,2023-02-15
2,2023-03-10
3,2023-01-01
4,2023-04-12
5,NaT


In [23]:
df['City'].astype('category')

#Categorical data type is used for repeated labels (like City names, Gender, Yes/No, etc.).
#Internally, Pandas stores categories as integer codes (0, 1, 2, …) instead of keeping the full string for each row.
#It also stores a mapping (e.g., 0 → Delhi, 1 → Mumbai, 2 → Pune).

Unnamed: 0,City
0,New-York
1,Paris
2,
3,New-York
4,Paris
5,Paris


## String Cleaning

In [24]:
df['Name'].str.strip() # Removes spaces from "Alice "

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,Alice
4,Eve
5,


In [25]:
df['Name'].str.lower()

Unnamed: 0,Name
0,alice
1,bob
2,charlie
3,alice
4,eve
5,


In [26]:
df['City'].str.replace('[^a-zA-Z0-9]','',regex=True) # Removes special characters (New-York → NewYork)

Unnamed: 0,City
0,NewYork
1,Paris
2,
3,NewYork
4,Paris
5,Paris


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     5 non-null      float64
 2   City    5 non-null      object 
 3   Date    5 non-null      object 
dtypes: float64(1), object(3)
memory usage: 324.0+ bytes


In [28]:
df.head()

Unnamed: 0,Name,Age,City,Date
0,Alice,25.0,New-York,2023-01-01
1,Bob,30.0,Paris,2023-02-15
2,Charlie,,,2023-03-10
3,Alice,25.0,New-York,2023-01-01
4,Eve,22.0,Paris,2023-04-12


In [29]:
df['City'] = df['City'].fillna('')  # replace None/NaN with empty string
df['City'] = df['City'].str.replace('[^a-zA-Z0-9]', ' ', regex=True)
df['City'] = df['City'].str.split()

In [30]:
print(df)

      Name   Age         City        Date
0   Alice   25.0  [New, York]  2023-01-01
1      Bob  30.0      [Paris]  2023-02-15
2  Charlie   NaN           []  2023-03-10
3   Alice   25.0  [New, York]  2023-01-01
4      Eve  22.0      [Paris]  2023-04-12
5     None  29.0      [Paris]        None


## Outliers & Numeric Cleaning

In [32]:
# detect outliers using quantiles
print(df['Age'].quantile([0.01,0.99])) #Calculates the 1st percentile and 99th percentile.

# 1% quantile (0.01) → The value below which 1% of the ages lie.
# 99% quantile (0.99) → The value below which 99% of the ages lie.
# Anything below the 1% or above the 99% can be considered an outlier.

0.01    22.12
0.99    29.96
Name: Age, dtype: float64


In [35]:
# trim top 5% extreme values
upper_limit =df['Age'].quantile(0.95)
df['Age_clipped'] =df['Age'].clip(lower=0,upper= upper_limit)
# Any value above upper_limit will be replaced with upper_limit. So it “trims” extreme values to reduce their effect.
print(f"\nAge column after clipping top 5% (upper limit={upper_limit}):")
print(df[['Age', 'Age_clipped']])


Age column after clipping top 5% (upper limit=29.8):
    Age  Age_clipped
0  25.0         25.0
1  30.0         29.8
2   NaN          NaN
3  25.0         25.0
4  22.0         22.0
5  29.0         29.0


In [37]:
filtered_df = df[df['Age']< upper_limit]
filtered_df

Unnamed: 0,Name,Age,City,Date,Age_clipped
0,Alice,25.0,"[New, York]",2023-01-01,25.0
3,Alice,25.0,"[New, York]",2023-01-01,25.0
4,Eve,22.0,[Paris],2023-04-12,22.0
5,,29.0,[Paris],,29.0


###  Other methods
1. IQR Method – Remove values outside Q1 - 1.5 * IQR to Q3 + 1.5 * IQR.
2. Z-score Method – Remove values where |Z| > 3 (far from mean).

## Column and index Managemnet

In [181]:
df.rename(columns={'Name':'FullName'})

Unnamed: 0,FullName,Age,City,Date
0,Alice,25.0,"[New, York]",2023-01-01
1,Bob,30.0,[Paris],2023-02-15
2,Charlie,,[],2023-03-10
3,Alice,25.0,"[New, York]",2023-01-01
4,Eve,22.0,[Paris],2023-04-12
5,,29.0,[Paris],


In [182]:
df.drop(['City'],axis=1)

Unnamed: 0,Name,Age,Date
0,Alice,25.0,2023-01-01
1,Bob,30.0,2023-02-15
2,Charlie,,2023-03-10
3,Alice,25.0,2023-01-01
4,Eve,22.0,2023-04-12
5,,29.0,


In [184]:
df = df.drop([0, 2])
df.reset_index(drop=True)

Unnamed: 0,Name,Age,City,Date
0,Bob,30.0,[Paris],2023-02-15
1,Alice,25.0,"[New, York]",2023-01-01
2,Eve,22.0,[Paris],2023-04-12
3,,29.0,[Paris],


In [186]:
df['AgePlus5']= df['Age'].fillna(0)+5
df.head()

Unnamed: 0,Name,Age,City,Date,AgePlus5
1,Bob,30.0,[Paris],2023-02-15,35.0
3,Alice,25.0,"[New, York]",2023-01-01,30.0
4,Eve,22.0,[Paris],2023-04-12,27.0
5,,29.0,[Paris],,34.0


## Final Checks

In [187]:
df.info() # verify structure

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 1 to 5
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      3 non-null      object 
 1   Age       4 non-null      float64
 2   City      4 non-null      object 
 3   Date      3 non-null      object 
 4   AgePlus5  4 non-null      float64
dtypes: float64(2), object(3)
memory usage: 364.0+ bytes


In [188]:
df.isnull().sum()

Unnamed: 0,0
Name,1
Age,0
City,0
Date,1
AgePlus5,0


In [194]:
df.describe(include='all') # Full overview of categorical + numeric columns

Unnamed: 0,Name,Age,City,Date,AgePlus5
count,3,4.0,4,3,4.0
unique,3,,2,3,
top,Bob,,[Paris],2023-02-15,
freq,1,,3,1,
mean,,26.5,,,31.5
std,,3.696846,,,3.696846
min,,22.0,,,27.0
25%,,24.25,,,29.25
50%,,27.0,,,32.0
75%,,29.25,,,34.25
