## Pandas

In [2]:
import pandas as pd

## DataFrame
- dictionary of list

In [4]:
info = {
    "Name":["abjd", "sxawd", "wdsad"],
    "CGPA":[4.34,7.65,9.76]
}

df = pd.DataFrame(info)
df

Unnamed: 0,Name,CGPA
0,abjd,4.34
1,sxawd,7.65
2,wdsad,9.76


In [22]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [30]:
print(df.columns)
print(df.index)

Index(['Name', 'CGPA'], dtype='object')
RangeIndex(start=0, stop=3, step=1)


## list of list(DataFrame)

In [32]:
df1 = pd.DataFrame([["abc", 67], ["sam", 88],["jon", 65]], columns = ["Name", "age"])
df1

Unnamed: 0,Name,age
0,abc,67
1,sam,88
2,jon,65


## numpy array(DataFrame)

In [33]:
import numpy as np
arr = np.array([[1,2,3],[4,5,6]])
df3 = pd.DataFrame(arr, columns = ["a", "c", "c"])
df3

Unnamed: 0,a,c,c.1
0,1,2,3
1,4,5,6


## Series Note : Capital 's'

In [14]:
s = pd.Series([23,43,54,4,32])# index
print(s)

0    23
1    43
2    54
3     4
4    32
dtype: int64


In [15]:
s1 = pd.Series([23,43,54,4,32], index=["asam", "swsqs", "saxas", "ewed", "wewx"]) # here index become level
print(s1)


asam     23
swsqs    43
saxas    54
ewed      4
wewx     32
dtype: int64


In [13]:
print(s[0]) # accessing ele like array

23


## dtype: object if we do [23,43,54,4,32, "abc"] 
- A pandas Series is **homogeneous** by design, meaning it holds a single dtype.
- When mixed data types are provided, pandas upcasts all values to object, making it technically homogeneous in dtype.
- However, logically the data is heterogeneous, so it is not suitable for numerical computation or ML models.

In [16]:
s1 = pd.Series([23,43,54,4,32, "abc"], index=["asam", "swsqs", "saxas", "ewed", "wewx", 54]) # here index become level
print(s1)

asam      23
swsqs     43
saxas     54
ewed       4
wewx      32
54       abc
dtype: object


## Vectorized Operation in pandas
- Vectorization means performing operations on entire arrays/Series at once, without using explicit loops.

In [18]:
ser1 = pd.Series([1,2,3,4])
ser2 = pd.Series([10,20,30,40])
sum = ser1+ser2
sum

0    11
1    22
2    33
3    44
dtype: int64

## Values of series are Mutable

In [19]:
series1 = pd.Series([3,4,5,6])
series1[0] = 100
series1

0    100
1      4
2      5
3      6
dtype: int64

## Size of series are Immutable

In [20]:
series1.drop(0)

1    4
2    5
3    6
dtype: int64

In [21]:
series1 # see even drop size original remains same

0    100
1      4
2      5
3      6
dtype: int64

## Read Data from csv

In [38]:
df4 = pd.read_csv("data/employee_data.csv")
df4

Unnamed: 0,ID,Name,Age,Department,Salary
0,1,Alice,25,HR,55000
1,2,Bob,32,IT,72000
2,3,Charlie,28,Finance,48000
3,4,David,45,Marketing,91000
4,5,Eva,38,IT,65000
5,6,Frank,29,Finance,50000
6,7,Grace,41,HR,82000
7,8,Hannah,26,Marketing,47000
8,9,Ian,35,IT,75000
9,10,Julia,30,Finance,60000


# pandas DataFrame Quick Methods

- `df.head()`  
  Show first 5 rows (default) of the DataFrame

- `df.tail()`  
  Show last 5 rows (default) of the DataFrame

- `df.sample()`  
  Show random sample rows (default 1 row)

- `df.info()`  
  Summary: data types, non-null counts, memory usage

- `df.shape`  
  Tuple with (rows, columns)

- `df.describe()`  
  Statistical summary of numeric columns

- `df.columns`  
  List of column names

- `df.nunique()`  
  Number of unique values per column


In [44]:
df4.describe()

Unnamed: 0,ID,Age,Salary
count,10.0,10.0,10.0
mean,5.5,32.9,64500.0
std,3.02765,6.674162,15182.226451
min,1.0,25.0,47000.0
25%,3.25,28.25,51250.0
50%,5.5,31.0,62500.0
75%,7.75,37.25,74250.0
max,10.0,45.0,91000.0


## Read Data from JSON

In [48]:
df5 = pd.read_json("data/jsondata.json")
df5.head()

Unnamed: 0,employee
0,"{'id': 'E00001', 'name': 'Matthew Snyder', 'po..."
1,"{'id': 'E00002', 'name': 'David Carter', 'posi..."
2,"{'id': 'E00003', 'name': 'Carol Durham', 'posi..."
3,"{'id': 'E00004', 'name': 'Peter Johnson', 'pos..."
4,"{'id': 'E00005', 'name': 'Stacy Jacobson', 'po..."


In [3]:
df = pd.read_csv("globalAirQuality.csv")
df

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
0,2025-11-04 18:25:17.554219,US,New York,40.713,-74.006,50.295,108.938,27.998,6.539,52.568,1.096,108,18.504,70.168,3.725
1,2025-11-04 19:25:17.554219,US,New York,40.713,-74.006,32.083,63.043,36.120,4.021,43.536,1.075,90,5.838,80.088,8.969
2,2025-11-04 20:25:17.554219,US,New York,40.713,-74.006,42.250,82.553,26.935,9.538,23.320,0.977,84,31.833,62.783,9.650
3,2025-11-04 21:25:17.554219,US,New York,40.713,-74.006,30.403,79.951,63.536,7.609,31.369,0.230,158,23.140,89.153,8.956
4,2025-11-04 22:25:17.554219,US,New York,40.713,-74.006,21.083,66.423,38.997,6.919,45.615,1.085,97,13.632,76.499,4.017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,2025-11-19 13:25:17.554219,CH,Zurich,47.377,8.542,27.899,74.179,41.474,6.677,50.869,1.028,103,7.079,52.443,7.452
17996,2025-11-19 14:25:17.554219,CH,Zurich,47.377,8.542,2.950,47.988,42.235,2.821,35.551,0.644,105,28.734,85.678,4.496
17997,2025-11-19 15:25:17.554219,CH,Zurich,47.377,8.542,61.347,72.908,46.976,5.763,66.492,0.947,122,21.951,72.311,9.660
17998,2025-11-19 16:25:17.554219,CH,Zurich,47.377,8.542,40.722,95.152,32.957,5.524,53.193,0.868,95,24.042,31.880,2.642


In [5]:
df.describe()

Unnamed: 0,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
count,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0
mean,23.06598,37.65556,40.369131,70.152228,32.055176,6.035508,48.0651,0.800595,104.645556,21.510251,57.714351,5.28391
std,26.156536,78.600701,17.64745,24.99944,13.82068,2.45479,14.950849,0.250254,25.61607,9.509444,18.844908,2.741712
min,-37.814,-123.121,0.025,0.061,0.013,0.003,0.114,0.0,16.0,5.0,25.002,0.5
25%,12.972,2.352,27.9045,53.1255,22.3625,4.36075,38.0285,0.633,87.0,13.35775,41.32,2.937
50%,29.232,42.146,40.2865,69.961,32.0195,6.026,48.142,0.8005,103.0,21.4555,57.847,5.297
75%,41.008,103.82,52.43625,87.2565,41.36425,7.71525,58.2585,0.969,121.0,29.68825,74.23475,7.662
max,60.17,174.763,115.683,161.81,90.019,16.559,103.016,1.832,231.0,37.998,89.997,9.999


In [6]:
df.head()

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
0,2025-11-04 18:25:17.554219,US,New York,40.713,-74.006,50.295,108.938,27.998,6.539,52.568,1.096,108,18.504,70.168,3.725
1,2025-11-04 19:25:17.554219,US,New York,40.713,-74.006,32.083,63.043,36.12,4.021,43.536,1.075,90,5.838,80.088,8.969
2,2025-11-04 20:25:17.554219,US,New York,40.713,-74.006,42.25,82.553,26.935,9.538,23.32,0.977,84,31.833,62.783,9.65
3,2025-11-04 21:25:17.554219,US,New York,40.713,-74.006,30.403,79.951,63.536,7.609,31.369,0.23,158,23.14,89.153,8.956
4,2025-11-04 22:25:17.554219,US,New York,40.713,-74.006,21.083,66.423,38.997,6.919,45.615,1.085,97,13.632,76.499,4.017


In [7]:
df.tail()

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
17995,2025-11-19 13:25:17.554219,CH,Zurich,47.377,8.542,27.899,74.179,41.474,6.677,50.869,1.028,103,7.079,52.443,7.452
17996,2025-11-19 14:25:17.554219,CH,Zurich,47.377,8.542,2.95,47.988,42.235,2.821,35.551,0.644,105,28.734,85.678,4.496
17997,2025-11-19 15:25:17.554219,CH,Zurich,47.377,8.542,61.347,72.908,46.976,5.763,66.492,0.947,122,21.951,72.311,9.66
17998,2025-11-19 16:25:17.554219,CH,Zurich,47.377,8.542,40.722,95.152,32.957,5.524,53.193,0.868,95,24.042,31.88,2.642
17999,2025-11-19 17:25:17.554219,CH,Zurich,47.377,8.542,25.83,30.411,35.317,4.336,66.246,0.848,88,8.529,59.104,4.403


##  Selcting column

In [9]:
df["country"] # we can print single column 

0        US
1        US
2        US
3        US
4        US
         ..
17995    CH
17996    CH
17997    CH
17998    CH
17999    CH
Name: country, Length: 18000, dtype: object

## Selcting **multiple** column

In [11]:
df[["country", "city"]]


Unnamed: 0,country,city
0,US,New York
1,US,New York
2,US,New York
3,US,New York
4,US,New York
...,...,...
17995,CH,Zurich
17996,CH,Zurich
17997,CH,Zurich
17998,CH,Zurich


## Select Rows (Label & Index based) - loc & iloc


In [17]:
# data  for first raw we want then i pass the index of 1st raw
df.loc[0] ## this is the data of first raw

timestamp      2025-11-04 18:25:17.554219
country                                US
city                             New York
latitude                           40.713
longitude                         -74.006
pm25                               50.295
pm10                              108.938
no2                                27.998
so2                                 6.539
o3                                 52.568
co                                  1.096
aqi                                   108
temperature                        18.504
humidity                           70.168
wind_speed                          3.725
Name: 0, dtype: object

## Multiple raw

In [19]:
df.loc[0:3] # all are included

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
0,2025-11-04 18:25:17.554219,US,New York,40.713,-74.006,50.295,108.938,27.998,6.539,52.568,1.096,108,18.504,70.168,3.725
1,2025-11-04 19:25:17.554219,US,New York,40.713,-74.006,32.083,63.043,36.12,4.021,43.536,1.075,90,5.838,80.088,8.969
2,2025-11-04 20:25:17.554219,US,New York,40.713,-74.006,42.25,82.553,26.935,9.538,23.32,0.977,84,31.833,62.783,9.65
3,2025-11-04 21:25:17.554219,US,New York,40.713,-74.006,30.403,79.951,63.536,7.609,31.369,0.23,158,23.14,89.153,8.956


In [20]:
df.iloc[0:3] # ending index is excluded

Unnamed: 0,timestamp,country,city,latitude,longitude,pm25,pm10,no2,so2,o3,co,aqi,temperature,humidity,wind_speed
0,2025-11-04 18:25:17.554219,US,New York,40.713,-74.006,50.295,108.938,27.998,6.539,52.568,1.096,108,18.504,70.168,3.725
1,2025-11-04 19:25:17.554219,US,New York,40.713,-74.006,32.083,63.043,36.12,4.021,43.536,1.075,90,5.838,80.088,8.969
2,2025-11-04 20:25:17.554219,US,New York,40.713,-74.006,42.25,82.553,26.935,9.538,23.32,0.977,84,31.833,62.783,9.65
