In [0]:
# import pandas and numpy

import pandas as pd
import numpy as np

In [0]:
# Build a Series from a plain list; no index is given, so pandas assigns 0..n-1.
salaries = pd.Series(data=[50000, 80000, 100000, 20000, 700000])

In [0]:
# Render the salaries Series created in the previous cell.
display(salaries)

In [0]:
# Inspect the Python type of the object built with pd.Series.
type(salaries)

**_Interview Question_**

Q : Difference between List and Series?

Answer : List - A list is just a collection of values

Series - A series has index + values and supports vectorized operations, missing values, and metadata.


In [0]:
# Series with a custom string index: each employee name labels that employee's salary.
pd_employee = pd.Series({
    'anil': 50000,
    'steve': 90000,
    'stuti': 40000,
    'anurag': 70000,
    'virat': 30000,
})

In [0]:
# Render the name-indexed salary Series.
display(pd_employee)

In [0]:
# Label-based lookup: fetch the value stored under the index label 'anil'.
display(pd_employee['anil'])

**_Series with Mixed Types_**

Q : Can Series hold different types of values?

Answer : Yes. Unlike a NumPy array built with default settings (which coerces every element to one common type), a pandas Series keeps mixed values as they are, but its dtype falls back to the most generic type (e.g. object).

In [0]:
# Strings and an integer in one Series; the values are kept as Python objects.
Array_details = pd.Series(data=['Anurag', 50000, 'Data Engineer', 'Bengaluru'])

In [0]:
# Render the mixed-type Series; check the dtype reported in the output.
display(Array_details)

**_Apply Math on Series_**

In [0]:
# Base salaries.
Series_Salaries = pd.Series(data=[50000, 80000, 30000, 70000, 40000])
# Vectorised arithmetic: every element is multiplied by 1.20 (a 20% hike);
# the original Series is left unchanged.
updated_Series_Salaries = Series_Salaries.mul(1.20)

In [0]:
# Original salaries, before the hike.
display(Series_Salaries)

In [0]:
# Salaries after the 20% hike.
display(updated_Series_Salaries)

**_Handling Missing Values in Series_**

In [0]:
# A Series with one missing entry; None is stored as NaN, which makes the dtype float64.
series_data = pd.Series(data=[100, None, 300])

In [0]:
# Original data, including the missing entry.
display(series_data)

In [0]:
# Boolean mask: True at each position whose value is missing.
display(series_data.isnull())

In [0]:
# Replace the missing entry in the Series with 400.
# The result is reassigned instead of using inplace=True: inplace gives no
# performance benefit and hides the mutation from readers of the cell.

series_data = series_data.fillna(value=400)

In [0]:
# Show the Series again to confirm the missing entry was replaced.

display(series_data)

**_Follow-up Q: Why does pandas support None?_**

Answer : In real data, missing values are common. Pandas treats None and np.nan as null and provides built-in tools to handle them.

**_DataFrames in Pandas_**

A DataFrame is a 2D labeled table of rows and columns. Think of it as an in-memory Excel sheet or a SQL table.

In [0]:
# Column-oriented records: each key is a column name, each list holds that column's values.
dict_data = dict(
    Name=['Anurag', 'Stuti', 'Steve'],
    Salary=[50000, 60000, 70000],
    Designation=['Data Engineer', 'Data Scientist', 'Data Analyst'],
    Location=['India', 'USA', 'Canada'],
)

In [0]:
# Show the raw dictionary before it is converted to a DataFrame.
display(dict_data)

In [0]:
# Build a DataFrame from the dictionary: keys become column names,
# and each list supplies that column's values.

df_data = pd.DataFrame(data=dict_data)
display(df_data)

In [0]:
# Plain-text rendering of the same DataFrame, for comparison with display().
print(df_data)

**_Follow-up Q : Why is Dictionary to DataFrame conversion useful?_**

Answer : Many real-world APIs or config files return data as dictionaries; converting them to DataFrames helps in processing.  

**_Access Columns and Rows_**

In [0]:
# Attribute-style access to a single column; returns the 'Salary' column as a Series.
display(df_data.Salary)

In [0]:
# Passing a list of column names selects several columns and returns a DataFrame.
display(df_data[['Name', 'Salary']])

In [0]:
# Select rows by index label with .loc.
# Label slices include both endpoints, so 0:1 returns the rows labelled 0 and 1.
display(df_data.loc[0:1])

In [0]:
# Select rows by index label and a single column by name with .loc.
# Rows labelled 0 and 1 (both endpoints included) of the 'Designation' column.
display(df_data.loc[0:1, 'Designation'])

In [0]:
# Select a row by integer position with .iloc.
# Position 0 is the first row, whatever its index label is.
display(df_data.iloc[0])

**_Follow-up Q : Difference between .loc[] and .iloc[]?_**

Answer : .loc[] is label-based (row names/index)

.iloc[] is position-based (integer index)

In [0]:
# Salary after a 20% increase, computed as a new Series; df_data itself is not modified.

df_data_bonus = df_data['Salary'].mul(1.20)
display(df_data_bonus)

**_Cleaning Data: The Hidden Hero of Data Engineer_**

- 80% of Data Engineer work = Cleaning + Transforming Data
- Today: Nulls, Filtering, Merging and More.
- Get datasets ready for analysis

**_Handling Nulls [Missing Values]_**
- .isnull(), .notnull()  --> check for nulls
- .dropna()  --> remove nulls
- .fillna()  --> replace nulls

**_Example:_**
- df.isnull()
- df.notnull()
- df.dropna()
- df.fillna()

**_Handling Outliers_**
- Detect using .describe() (check min/max)
- Remove/Filter extreme values.

**_Example:_**

df = df[df['salary'] < 20000]

In [0]:
# Load the employee CSV from the workspace volume into a DataFrame and show it.

df_employee = pd.read_csv('/Volumes/workspace/default/dbfs/pandas_employees.csv')
display(df_employee)

In [0]:
# First 2 rows of the DataFrame.
display(df_employee.head(2))

In [0]:
# Last 2 rows of the DataFrame.
display(df_employee.tail(2))

In [0]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
display(df_employee.shape)

In [0]:
# Print the frame's structural summary: columns, non-null counts and dtypes.
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output; call it directly.
df_employee.info()

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) via describe().
display(df_employee.describe())

In [0]:
# Count the missing values in each column.
display(df_employee.isnull().sum())

**_Handling Nulls in Pandas DataFrame_**

In [0]:
# Drop every row that has a null in 'name' or 'email'.
# The result is a new DataFrame; df_employee itself is not changed.
df_employee_drop = df_employee.dropna(subset= ['name', 'email'])

In [0]:
# Rows that remain after dropping null names/emails.
display(df_employee_drop)

In [0]:
# Replace null departments with the placeholder 'unknown'.
# This overwrites the column on df_employee, so later cells see the filled values.
df_employee['department'] = df_employee['department'].fillna('unknown')
display(df_employee)

In [0]:
# Replace null salaries with the column median (median() skips nulls by default).
# This overwrites the column on df_employee.
df_employee['salary'] = df_employee['salary'].fillna(df_employee['salary'].median())
display(df_employee)

In [0]:
# Convert and clean the join_date column.
# errors='coerce' turns unparseable entries into NaT instead of raising.
df_employee['join_date'] = pd.to_datetime(df_employee['join_date'], errors='coerce')
# Forward-fill each NaT from the previous valid row.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated in pandas 2.1+.
# Note: a NaT in the very first row has no previous value and stays NaT.
df_employee['join_date'] = df_employee['join_date'].ffill()

**_Handling Outliers_**


In [0]:
# Employee data after the null handling and date clean-up above.
display(df_employee)

In [0]:
# Summary statistics for salary; compare min/max against the quartiles to spot outliers.
display(df_employee['salary'].describe())

In [0]:
# Median salary.
display(df_employee['salary'].median())

In [0]:
# Rows with salary below 200000.
# NOTE(review): the earlier comment said "20LPA", but 20 lakh would be 2000000 — confirm the intended threshold.
df_employee[df_employee['salary'] < 200000]

In [0]:
# Rows with salary greater than 50000.
df_employee[df_employee['salary'] > 50000]

In [0]:
# Rows with salary greater than 50000, sorted by salary in descending order.
df_employee[df_employee['salary'] > 50000].sort_values(by = 'salary', ascending = False)

**_Follow-up Q : Why use dropna() vs fillna()?_**

**_Answer :_**
- dropna() is used when missing data makes the row unusable.
- fillna() is used when reasonable default (mean, median) can be applied.


**_Uploading Sales Messy Data_**

In [0]:
# Load the messy sales CSV from the workspace volume and show it.
df_sales_messy = pd.read_csv('/Volumes/workspace/default/dbfs/pandas_sales_messy.csv')
display(df_sales_messy)

In [0]:
# First 5 rows (the head() default).
display(df_sales_messy.head())

In [0]:
# Last 5 rows (the tail() default).
display(df_sales_messy.tail())

In [0]:
# Count the missing values in each column.
display(df_sales_messy.isnull().sum())

In [0]:
# Data type of each column as read by read_csv.
display(df_sales_messy.dtypes)

In [0]:
# Inspect the raw product column before cleaning (look for stray spaces and casing).
display(df_sales_messy['product'])

In [0]:
# Remove leading and trailing whitespace from the product column.
# str.strip() leaves whitespace inside the value untouched.
df_sales_messy['product'] = df_sales_messy['product'].str.strip()

In [0]:
# Normalise casing: upper-case the first character of each product name.
# Note: str.capitalize() also lower-cases every remaining character.
df_sales_messy['product'] = df_sales_messy['product'].str.capitalize()

In [0]:
# First 5 rows after the product column has been cleaned.
display(df_sales_messy.head())

In [0]:
# Sorted list of the distinct product names.
# Missing products are NaN (a float); sorted() cannot order float against str
# and would raise TypeError, so nulls are dropped before sorting.
display(sorted(df_sales_messy['product'].dropna().unique()))

In [0]:
# Distinct product values in order of first appearance (unsorted), printed as an array.
print(df_sales_messy['product'].unique())

In [0]:
# Describe the amount column (summary statistics).
df_sales_messy['amount'].describe()

In [0]:
# First 10 amount values before the missing ones are filled.
display(df_sales_messy['amount'].head(10))

In [0]:
# Fill missing amounts with the column median (median() skips nulls by default).
# This overwrites the column on df_sales_messy.
df_sales_messy['amount'] = df_sales_messy['amount'].fillna(df_sales_messy['amount'].median())

In [0]:
# First 10 rows of the whole frame after the amount column has been filled.
display(df_sales_messy.head(10))

In [0]:
# First 10 values of the date column before parsing.
display(df_sales_messy['date'].head(10))

In [0]:
# Parse the date column to datetime; errors='coerce' turns unparseable entries into NaT.
df_sales_messy['date'] = pd.to_datetime(df_sales_messy['date'], errors= 'coerce')

In [0]:
# First 10 rows of the frame after the date column has been parsed.
display(df_sales_messy.head(10))

In [0]:
# Number of null (NaT) dates before filling.
display(df_sales_messy['date'].isnull().sum())

In [0]:
# Fill each null date with the previous valid date (forward fill).
# Series.ffill() replaces fillna(method='ffill'), which is deprecated in pandas 2.1+.
# Note: a NaT in the very first row has no previous value and stays NaT.
df_sales_messy['date'] = df_sales_messy['date'].ffill()

In [0]:
# Number of null (NaT) dates remaining after the forward fill.
display(df_sales_messy['date'].isnull().sum())

In [0]:
# First 10 rows of the frame after the date column has been filled.
display(df_sales_messy.head(10))

In [0]:
# Add useful columns for analytics, derived from the parsed date: year, month, day.
# The .dt accessor requires the date column to be datetime-typed (parsed in an earlier cell).
df_sales_messy['year'] = df_sales_messy['date'].dt.year
df_sales_messy['month'] = df_sales_messy['date'].dt.month
df_sales_messy['day'] = df_sales_messy['date'].dt.day

In [0]:
# First 10 rows, now including the new year/month/day columns.
display(df_sales_messy.head(10))

**_The Final Step Is Always a Sanity Check_**

In [0]:
# Sanity check: data types and non-null counts per column.
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" to the output.
df_sales_messy.info()
# Summary statistics of the cleaned frame.
print(df_sales_messy.describe())

In [0]:
# Write the cleaned frame to the workspace volume; index=False omits the row index column.
df_sales_messy.to_csv('/Volumes/workspace/default/dbfs/pandas_sales_messy_cleaned_final.csv', index= False)