# Data Science Process
1. Define the Problem
2. Data Collection
3. EDA (Exploratory Data Analysis)
4. Data Cleaning and Pre-Processing
5. Feature Engineering
6. Modelling
7. Evaluation
8. Deployment

# Exploratory Data Analysis 
1. Load and Understand the structure of the data
2. Clean the data
3. Get the statistical summaries
4. Data Visualization

<b>Pandas are used for EDA.</b>
<b>MatplotLib, Seaborn are used for Data Visualization.</b>

<b>Pandas</b>: Data analysis and manipulation tool.

In [7]:
import pandas as pd

In [8]:
# Pandas -> Series, DataFrames
# Series -> 1-Dimensional labelled array
# DataFrame -> 2-Dimension Labelled array (Like a table)

info = {
    "Name": ["Gojo", "Toji", "Adi"],
    "Marks": [95, 78, 99]
}

df = pd.DataFrame(info)
print(df)

# The Dictionary Keys are the Headers of the DataFrame.
# 0, 1 and 2 are the labels in the DataFrame.

   Name  Marks
0  Gojo     95
1  Toji     78
2   Adi     99


In [9]:
df

Unnamed: 0,Name,Marks
0,Gojo,95
1,Toji,78
2,Adi,99


In [10]:
# Series - 1D Labelled Array
s = pd.Series([1, 2, 3, 4, 5])
print(s)
print(type(s))

# Index
print(s[0])
print(s[2])

# Index and labels are not the same. We can also have custom labels.

0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>
1
3


In [11]:
# Adding a custom label
s = pd.Series([22, 23, 24, 25, 26], index = ["Adi", "Gojo", "Toji", "Nanami", "Sukuna"])
print(s)

# Accessing values using labels
print(s["Adi"])
print(s["Gojo"])

# Printing all the labels 
print(s.index)

Adi       22
Gojo      23
Toji      24
Nanami    25
Sukuna    26
dtype: int64
22
23
Index(['Adi', 'Gojo', 'Toji', 'Nanami', 'Sukuna'], dtype='object')


# Properties of Series:
- Homogenous
- Vectorized Operations
- Handle Missing values with NaN
- Mutable values, immutable size

In [15]:
s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([10, 20, 30, 40, 50])

print(s1 + s2)

s1[0] = 100
changed_s1 = s1.drop(0)

print(s1)
print(changed_s1)

0    11
1    22
2    33
3    44
4    55
dtype: int64
0    100
1      2
2      3
3      4
4      5
dtype: int64
1    2
2    3
3    4
4    5
dtype: int64


In [17]:
# DataFrame (2D Labelled Array)

info = {
    "Name": ["Adam", "Eve", "Bob"],
    "Age": [23, 24, 25],
    "GPA": [9.5, 8.6, 7.2]
}

df = pd.DataFrame(info)
print(df) 
print(df.index)

   Name  Age  GPA
0  Adam   23  9.5
1   Eve   24  8.6
2   Bob   25  7.2
RangeIndex(start=0, stop=3, step=1)


In [22]:
# Lists of Lists 
df = pd.DataFrame([["Adam", 23], ["Bob", 23], ["Eve", 23]], columns = ["Name", "Age"])
print(df)

   Name  Age
0  Adam   23
1   Bob   23
2   Eve   23


In [23]:
# Using NumPy Arrays to Create a Frame

import numpy as np
np_arr = np.array([[1, 2, 3], [4, 5, 6]])

df = pd.DataFrame(np_arr, columns = ["A", "B", "C"])
print(df)

   A  B  C
0  1  2  3
1  4  5  6


In [24]:
# Usage with CSV

df = pd.read_csv("employee_data.csv")
print(df, type(df))

   ID     Name  Age Department  Salary
0   1    Alice   25         HR   55000
1   2      Bob   32         IT   72000
2   3  Charlie   28    Finance   48000
3   4    David   45  Marketing   91000
4   5      Eva   38         IT   65000
5   6    Frank   29    Finance   50000
6   7    Grace   41         HR   82000
7   8   Hannah   26  Marketing   47000
8   9      Ian   35         IT   75000
9  10    Julia   30    Finance   60000 <class 'pandas.core.frame.DataFrame'>


In [26]:
# JSON Data

df = pd.read_json("employee_data.json")
print(df, type(df))

   ID     Name  Age Department  Salary
0   1    Alice   25         HR   55000
1   2      Bob   32         IT   72000
2   3  Charlie   28    Finance   48000
3   4    David   45  Marketing   91000
4   5      Eva   38         IT   65000
5   6    Frank   29    Finance   50000
6   7    Grace   41         HR   82000
7   8   Hannah   26  Marketing   47000
8   9      Ian   35         IT   75000
9  10    Julia   30    Finance   60000 <class 'pandas.core.frame.DataFrame'>


# Pandas (DataFrame Methods)
- df.head()
- df.tail()
- df.sample()
- df.info()
- df.shape
- df.describe()
- df.columns
- df.nunique()

In [27]:
df = pd.read_json("employee_data.json")
print(df, type(df))

   ID     Name  Age Department  Salary
0   1    Alice   25         HR   55000
1   2      Bob   32         IT   72000
2   3  Charlie   28    Finance   48000
3   4    David   45  Marketing   91000
4   5      Eva   38         IT   65000
5   6    Frank   29    Finance   50000
6   7    Grace   41         HR   82000
7   8   Hannah   26  Marketing   47000
8   9      Ian   35         IT   75000
9  10    Julia   30    Finance   60000 <class 'pandas.core.frame.DataFrame'>


In [43]:
# Methods

df.head() # First 5 Rows
df.tail() # Last 5 Rows

df.sample() # Just a random row from the table

df.info() # Summary of the data

df.shape # Returns a tuple of rows and 

df.describe() # Gives detailed statistical summaries for numerical value tables

df.columns # All the columns in the table

df.nunique() # Gives unique rows
'''
ID            10
Name          10
Age           10
Department     4
Salary        10
dtype: int64
'''

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          10 non-null     int64 
 1   Name        10 non-null     object
 2   Age         10 non-null     int64 
 3   Department  10 non-null     object
 4   Salary      10 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 532.0+ bytes


ID            10
Name          10
Age           10
Department     4
Salary        10
dtype: int64

In [32]:
df

Unnamed: 0,ID,Name,Age,Department,Salary
0,1,Alice,25,HR,55000
1,2,Bob,32,IT,72000
2,3,Charlie,28,Finance,48000
3,4,David,45,Marketing,91000
4,5,Eva,38,IT,65000
5,6,Frank,29,Finance,50000
6,7,Grace,41,HR,82000
7,8,Hannah,26,Marketing,47000
8,9,Ian,35,IT,75000
9,10,Julia,30,Finance,60000


In [44]:
# AQI Dataset 

df = pd.read_csv("globalAirQuality.csv")

df

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
0,2025-11-04 18:25:17.554219,US,New York,40.713,-74.006,50.295,108.938,27.998,6.539,52.568,1.096,108,18.504,70.168,3.725
1,2025-11-04 19:25:17.554219,US,New York,40.713,-74.006,32.083,63.043,36.120,4.021,43.536,1.075,90,5.838,80.088,8.969
2,2025-11-04 20:25:17.554219,US,New York,40.713,-74.006,42.250,82.553,26.935,9.538,23.320,0.977,84,31.833,62.783,9.650
3,2025-11-04 21:25:17.554219,US,New York,40.713,-74.006,30.403,79.951,63.536,7.609,31.369,0.230,158,23.140,89.153,8.956
4,2025-11-04 22:25:17.554219,US,New York,40.713,-74.006,21.083,66.423,38.997,6.919,45.615,1.085,97,13.632,76.499,4.017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,2025-11-19 13:25:17.554219,CH,Zurich,47.377,8.542,27.899,74.179,41.474,6.677,50.869,1.028,103,7.079,52.443,7.452
17996,2025-11-19 14:25:17.554219,CH,Zurich,47.377,8.542,2.950,47.988,42.235,2.821,35.551,0.644,105,28.734,85.678,4.496
17997,2025-11-19 15:25:17.554219,CH,Zurich,47.377,8.542,61.347,72.908,46.976,5.763,66.492,0.947,122,21.951,72.311,9.660
17998,2025-11-19 16:25:17.554219,CH,Zurich,47.377,8.542,40.722,95.152,32.957,5.524,53.193,0.868,95,24.042,31.880,2.642


In [45]:
df.head()

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
0,2025-11-04 18:25:17.554219,US,New York,40.713,-74.006,50.295,108.938,27.998,6.539,52.568,1.096,108,18.504,70.168,3.725
1,2025-11-04 19:25:17.554219,US,New York,40.713,-74.006,32.083,63.043,36.12,4.021,43.536,1.075,90,5.838,80.088,8.969
2,2025-11-04 20:25:17.554219,US,New York,40.713,-74.006,42.25,82.553,26.935,9.538,23.32,0.977,84,31.833,62.783,9.65
3,2025-11-04 21:25:17.554219,US,New York,40.713,-74.006,30.403,79.951,63.536,7.609,31.369,0.23,158,23.14,89.153,8.956
4,2025-11-04 22:25:17.554219,US,New York,40.713,-74.006,21.083,66.423,38.997,6.919,45.615,1.085,97,13.632,76.499,4.017


In [46]:
df.tail()

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
17995,2025-11-19 13:25:17.554219,CH,Zurich,47.377,8.542,27.899,74.179,41.474,6.677,50.869,1.028,103,7.079,52.443,7.452
17996,2025-11-19 14:25:17.554219,CH,Zurich,47.377,8.542,2.95,47.988,42.235,2.821,35.551,0.644,105,28.734,85.678,4.496
17997,2025-11-19 15:25:17.554219,CH,Zurich,47.377,8.542,61.347,72.908,46.976,5.763,66.492,0.947,122,21.951,72.311,9.66
17998,2025-11-19 16:25:17.554219,CH,Zurich,47.377,8.542,40.722,95.152,32.957,5.524,53.193,0.868,95,24.042,31.88,2.642
17999,2025-11-19 17:25:17.554219,CH,Zurich,47.377,8.542,25.83,30.411,35.317,4.336,66.246,0.848,88,8.529,59.104,4.403


In [47]:
df.describe()

Unnamed: 0,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
count,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0
mean,23.06598,37.65556,40.369131,70.152228,32.055176,6.035508,48.0651,0.800595,104.645556,21.510251,57.714351,5.28391
std,26.156536,78.600701,17.64745,24.99944,13.82068,2.45479,14.950849,0.250254,25.61607,9.509444,18.844908,2.741712
min,-37.814,-123.121,0.025,0.061,0.013,0.003,0.114,0.0,16.0,5.0,25.002,0.5
25%,12.972,2.352,27.9045,53.1255,22.3625,4.36075,38.0285,0.633,87.0,13.35775,41.32,2.937
50%,29.232,42.146,40.2865,69.961,32.0195,6.026,48.142,0.8005,103.0,21.4555,57.847,5.297
75%,41.008,103.82,52.43625,87.2565,41.36425,7.71525,58.2585,0.969,121.0,29.68825,74.23475,7.662
max,60.17,174.763,115.683,161.81,90.019,16.559,103.016,1.832,231.0,37.998,89.997,9.999
