In [1]:
# !pip install pandas

Pandas Documentation: https://pandas.pydata.org/docs/

In [2]:
import pandas as pd
import numpy as np

Dataframe: Pandas Object to represent tabular(structured) data

# Create DataFrame

### From Dictionary

In [3]:
data = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Aisha'],
    'Age': [25, 30, 28, np.nan, 28],
    'Department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'],
    'Salary': [50000, 65000, 60000, 72000, 80000]
}
df_hr = pd.DataFrame(data)
rows, cols = df_hr.shape
print(f"Rows: {rows}, Columns: {cols}")
df_hr

Rows: 5, Columns: 5


Unnamed: 0,ID,Name,Age,Department,Salary
0,1,Alice,25.0,HR,50000
1,2,Bob,30.0,IT,65000
2,3,Charlie,28.0,Marketing,60000
3,4,Diana,,Finance,72000
4,5,Aisha,28.0,IT,80000


In [4]:
df_hr.columns # Shows all columns
# Accessing a Particular column:
# Use df_hr.Name or df_hr['Name']

# Accessing a group of selected columns:
# df_hr[['Name', 'Salary']]

Index(['ID', 'Name', 'Age', 'Department', 'Salary'], dtype='object')

In [5]:
# Important pandas datatypes:
type(df_hr['Name']), type(df_hr[['Name', 'Salary']])

(pandas.core.series.Series, pandas.core.frame.DataFrame)

In [6]:
df_hr.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
df_hr.set_index('Department', inplace=True)
df_hr

Unnamed: 0_level_0,ID,Name,Age,Salary
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HR,1,Alice,25.0,50000
IT,2,Bob,30.0,65000
Marketing,3,Charlie,28.0,60000
Finance,4,Diana,,72000
IT,5,Aisha,28.0,80000


In [8]:
df_hr.loc['IT']

Unnamed: 0_level_0,ID,Name,Age,Salary
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IT,2,Bob,30.0,65000
IT,5,Aisha,28.0,80000


In [9]:
df_hr.reset_index(inplace=True)

In [10]:
df_hr.set_index('ID', inplace=True)
df_hr

Unnamed: 0_level_0,Department,Name,Age,Salary
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,HR,Alice,25.0,50000
2,IT,Bob,30.0,65000
3,Marketing,Charlie,28.0,60000
4,Finance,Diana,,72000
5,IT,Aisha,28.0,80000


In [11]:
df_hr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 1 to 5
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Department  5 non-null      object 
 1   Name        5 non-null      object 
 2   Age         4 non-null      float64
 3   Salary      5 non-null      int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 200.0+ bytes


In [12]:
df_hr.isna().sum()

Department    0
Name          0
Age           1
Salary        0
dtype: int64

In [13]:
df_hr['Age'] = df_hr['Age'].fillna(np.floor(df_hr['Age'].mean())) # other ways to fill min, max, median etc
df_hr

Unnamed: 0_level_0,Department,Name,Age,Salary
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,HR,Alice,25.0,50000
2,IT,Bob,30.0,65000
3,Marketing,Charlie,28.0,60000
4,Finance,Diana,27.0,72000
5,IT,Aisha,28.0,80000


### From list of tuples

In [14]:
data = [
    (1, 'Alice', 25, 'HR', 50000),
    (2, 'Bob', 30, np.nan, 65000),
    (3, 'Aisha', 28, 'IT', 80000),
]
df = pd.DataFrame(data, columns=['ID', 'Name', 'Age', 'Department', 'Salary'])
rows, cols = df.shape
print(f"Rows: {rows}, Columns: {cols}")
df

Rows: 3, Columns: 5


Unnamed: 0,ID,Name,Age,Department,Salary
0,1,Alice,25,HR,50000
1,2,Bob,30,,65000
2,3,Aisha,28,IT,80000


In [15]:
### From list of Dictionaries
data = [
    {'ID': 1, 'Name': 'Alice', 'Age': 25, 'Department':'HR', 'Salary': 50000},
    {'ID': 2, 'Name': 'Bob', 'Age': 30, 'Department': np.nan, 'Salary': 65000},
    {'ID': 3, 'Name': 'Aisha', 'Age': 28, 'Department': 'IT', 'Salary': 80000},
]
df = pd.DataFrame(data)
rows, cols = df.shape
print(f"Rows: {rows}, Columns: {cols}")
df

Rows: 3, Columns: 5


Unnamed: 0,ID,Name,Age,Department,Salary
0,1,Alice,25,HR,50000
1,2,Bob,30,,65000
2,3,Aisha,28,IT,80000


### Reading from xlsx

In [16]:
# !pip install openpyxl

In [17]:
# Link to the data used for tutorial from Kaggle - https://www.kaggle.com/datasets/himelsarder/retail-product-dataset-with-missing-values/data
df = pd.read_excel("datasets/synthetic_dataset.xlsx", sheet_name="synthetic_dataset")
print(df.shape)
df.sample(2) # Random 2 rows

(4362, 5)


Unnamed: 0,Category,Price,Rating,Stock,Discount
508,,,1.802829,Out of Stock,19.0
3279,B,6369.0,1.632623,,45.0


In [18]:
df = pd.read_excel("datasets/synthetic_dataset_without_column_names.xlsx", header=None, names=['Category', 'Price', 'Rating', 'Stock', 'Discount'], na_values={'Discount':[-1], 'Price': [-1], 'Category': 'unknown'})

In [19]:
df.head(2)

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548,1.870322,,
1,,3045,4.757798,,38.0


In [20]:
# Alternatively use converters
def fix_unknown_or_negative(cell):
    if cell == 'unknown' or cell == -1:
        return np.nan
df_excel = pd.read_excel("datasets/synthetic_dataset_without_column_names.xlsx", header=None, names=['Category', 'Price', 'Rating', 'Stock', 'Discount'], converters={'Discount':fix_unknown_or_negative, 'Price': fix_unknown_or_negative, 'Category': fix_unknown_or_negative})
df_excel.head(2)

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,,1.870322,,
1,,,4.757798,,


### Reading from Parquet

In [21]:
# !pip install pyarrow

In [22]:
# !pip install fastparquet

In [23]:
df = pd.read_parquet("datasets/synthetic_dataset.parquet")
print(df.shape)
df.tail(2) # last 2 rows

(4362, 5)


Unnamed: 0,Category,Price,Rating,Stock,Discount
4360,D,2999.0,4.425995,,40.0
4361,,,2.184471,,0.0


### Reading from CSV

In [24]:
df = pd.read_csv("datasets/Synthetic_dataset_multiple_headers.csv", header=2, skiprows=0, nrows=2) # consider the top most row as header, we donot skip any rows
# if we have two header and we want to consider the 2nd row as column names set header=1 (consider 2nd row and not 0th and 1st row for column names)
# or skiprows=2 (skip top 2 rows, and start from the next one). We can skip any no. of rows
# read only 2 rows
df

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548,1.870322,,0
1,,3045,4.757798,,38


In [25]:
df = pd.read_csv("datasets/synthetic_dataset_without_column_names.csv", header=None, names=['Category', 'Price', 'Rating', 'Stock', 'Discount'], na_values={'Discount':[-1], 'Price': [-1]})
# if we our data has specific encoding ex: encoding='latin1', 'utf-8' add it
# if we have na_values in a particular column written differently like na_values={'Discount':[-1]} i.e. considering '-1' as NA value for Discount. 
# we cannot have Discount < 0
print(f"Shape: {df.shape}")
# Last 5 rows: df.tail()
# Random 5 rows: df.sample(5)
# Selected indexes: df[5:10]
# First 5 rows
df.head()

Shape: (7, 5)


Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548,1.870322,,
1,,3045,4.757798,,38.0
2,,4004,,In Stock,
3,,4808,1.492085,,33.0
4,,1817,,Out of Stock,23.0


### Understand the data

The dataset includes:

Category (Categorical): Product category (A, B, C, D)

Price (Numerical): Randomized product prices

Rating (Numerical): Ratings between 1 to 5

Stock (Categorical): Availability status (In Stock, Out of Stock)

Discount (Numerical): Discount percentage

In [29]:
df = pd.read_csv("datasets/synthetic_dataset.csv")
print(df.shape)
df.head()

(4362, 5)


Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548.0,1.870322,,0.0
1,,3045.0,4.757798,,38.0
2,,4004.0,,In Stock,0.0
3,,4808.0,1.492085,,33.0
4,,1817.0,,Out of Stock,23.0


# Data wrangling/ Data mungling -  Process of cleaning messy data

In [27]:
df.isna().sum()

Category    2748
Price        174
Rating      2050
Stock       1352
Discount     392
dtype: int64

##### Drop NA

In [37]:
print(f"Original: {df.shape}")
df_cleaned = df.dropna(how='all') # when all values in a row are missing
print(f"All empty row: {df_cleaned.shape}")
# or
df_cleaned = df.dropna(thresh=1) # Keep the row with atleast 1 non na value
print(f"Keep the row with atleast 1 non na value: {df_cleaned.shape}")

df_cleaned = df.dropna(thresh=2) # Keep the row with atleast 2 non na value
print(f"Keep the row with atleast 2 non na value: {df_cleaned.shape}")

df_cleaned = df.dropna(how='any') # Drop if any NaN (default)
print(f"Any empty row: {df_cleaned.shape}")
subset=['col1', 'col2']	
df_cleaned = df.dropna(subset=['Price', 'Discount']) # Only consider these columns for NaN checking
print(f"Price/Discount missing: {df_cleaned.shape}")

Original: (4362, 5)
All empty row: (4359, 5)
Keep the row with atleast 1 non na value: (4359, 5)
Keep the row with atleast 2 non na value: (4306, 5)
Any empty row: (540, 5)
Price/Discount missing: (3809, 5)


##### Fill NA

In [39]:
df = df.sort_values(by=["Category"])
df = df.reindex()
df

Unnamed: 0,Category,Price,Rating,Stock,Discount
2608,A,6649.0,,Out of Stock,9.0
2829,A,286.0,,In Stock,22.0
149,A,9825.0,,Out of Stock,22.0
2823,A,6129.0,3.733947,Out of Stock,7.0
2525,A,5528.0,,,35.0
...,...,...,...,...,...
4355,,,,In Stock,46.0
4356,,,,Out of Stock,21.0
4357,,4436.0,4.728335,,49.0
4359,,3283.0,,Out of Stock,9.0


In [None]:
print(f"Original: {df.shape}")

df_cleaned = df.dropna(how='all') # when all values in a row are missing
print(f"All empty row: {df_cleaned.shape}")
# or
df_cleaned = df.dropna(thresh=1) # Keep the row with atleast 1 non na value
print(f"Keep the row with atleast 1 non na value: {df_cleaned.shape}")

df_cleaned = df.dropna(thresh=2) # Keep the row with atleast 2 non na value
print(f"Keep the row with atleast 2 non na value: {df_cleaned.shape}")

df_cleaned = df.dropna(how='any') # Drop if any NaN (default)
print(f"Any empty row: {df_cleaned.shape}")
subset=['col1', 'col2']	
df_cleaned = df.dropna(subset=['Price', 'Discount']) # Only consider these columns for NaN checking
print(f"Price/Discount missing: {df_cleaned.shape}")

In [27]:
print(f"Maximum Price: {df['Price'].max()}") 
print(f"Average Rating: {df['Rating'].mean()}") 
print(f"Minimum Discount: {df['Discount'].min()}") 
print(f"Price deviation: {df['Price'].std()}")

Maximum Price: 9999.0
Average Rating: 3.0382925191086185
Minimum Discount: 0.0
Price deviation: 2839.984813424091


In [28]:
df.describe() # for mathematical columns

Unnamed: 0,Price,Rating,Discount
count,4188.0,2312.0,3970.0
mean,5016.97063,3.038293,24.516625
std,2839.984813,1.143074,14.347164
min,102.0,1.000366,0.0
25%,2628.25,2.06949,12.0
50%,4996.5,3.08206,25.0
75%,7418.0,4.00862,37.0
max,9999.0,4.997818,49.0


# Saving final frame

In [23]:
# To parquet
# df.to_parquet("datasets/final_synthetic_dataset.parquet")
# To excel
df.to_excel("datasets/final_synthetic_dataset.xlsx", sheet_name='retail_prices', index=False, startrow=3, startcol=5)
# Write different datasets to multiple sheets
with pd.ExcelWriter("datasets/final_synthetic_dataset.xlsx") as excel_writer:
    df_hr.to_excel(excel_writer, sheet_name="hr")
    df_excel.to_excel(excel_writer, sheet_name="retail_prices")
# To CSV
# df.to_csv("datasets/final_synthetic_dataset.csv", index=False, columns = ['Department', 'EmployeeNumber']) # saving selected columns only
# df.to_csv("datasets/final_ibm_hr_analytics.csv", index=False, header=False) # save without column names