In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Reading the CSV

In [9]:
df_raw = pd.read_csv(r'./delhi_pm25_features.csv')

In [13]:
df_raw.shape

(80709, 21)

In [12]:
df_raw.head()

Unnamed: 0,pm25_hourly,n_obs_hour,latitude,longitude,_raw_item,temperature,relative_humidity,wind_speed,hour,sin_hour,...,weekday,pm25_lag_1h,pm25_lag_3h,pm25_lag_24h,pm25_roll_mean_24h,location_id,lat,lon,location_id_code,is_holiday
0,297.0,,28.63576,77.22445,"{'coordinates': None, 'coverage': {'datetimeFr...",,,,19,-0.965926,...,2,,,,297.0,8118,28.63576,77.22445,0,False
1,307.0,,28.63576,77.22445,"{'coordinates': None, 'coverage': {'datetimeFr...",,,,20,-0.866025,...,2,297.0,,,302.0,8118,28.63576,77.22445,0,False
2,309.0,,28.63576,77.22445,"{'coordinates': None, 'coverage': {'datetimeFr...",,,,21,-0.707107,...,2,307.0,,,304.333333,8118,28.63576,77.22445,0,False
3,315.0,,28.63576,77.22445,"{'coordinates': None, 'coverage': {'datetimeFr...",,,,22,-0.5,...,2,309.0,297.0,,307.0,8118,28.63576,77.22445,0,False
4,330.0,,28.63576,77.22445,"{'coordinates': None, 'coverage': {'datetimeFr...",,,,23,-0.258819,...,2,315.0,307.0,,311.6,8118,28.63576,77.22445,0,False


In [16]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80709 entries, 0 to 80708
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   pm25_hourly         75899 non-null  float64
 1   n_obs_hour          0 non-null      float64
 2   latitude            80709 non-null  float64
 3   longitude           80709 non-null  float64
 4   _raw_item           80709 non-null  object 
 5   temperature         0 non-null      float64
 6   relative_humidity   0 non-null      float64
 7   wind_speed          0 non-null      float64
 8   hour                80709 non-null  int64  
 9   sin_hour            80709 non-null  float64
 10  cos_hour            80709 non-null  float64
 11  weekday             80709 non-null  int64  
 12  pm25_lag_1h         75898 non-null  float64
 13  pm25_lag_3h         75896 non-null  float64
 14  pm25_lag_24h        75875 non-null  float64
 15  pm25_roll_mean_24h  78852 non-null  float64
 16  loca

In [None]:
#Number of unique values in dataframe
df_raw.nunique()

pm25_hourly            1313
n_obs_hour                0
latitude                  1
longitude                 1
_raw_item             80709
temperature               0
relative_humidity         0
wind_speed                0
hour                     24
sin_hour                 21
cos_hour                 22
weekday                   7
pm25_lag_1h            1313
pm25_lag_3h            1313
pm25_lag_24h           1313
pm25_roll_mean_24h    20855
location_id               1
lat                       1
lon                       1
location_id_code          1
is_holiday                2
dtype: int64

In [None]:
#Null values in dataset
df_raw.isnull().sum() 

pm25_hourly            4810
n_obs_hour            80709
latitude                  0
longitude                 0
_raw_item                 0
temperature           80709
relative_humidity     80709
wind_speed            80709
hour                      0
sin_hour                  0
cos_hour                  0
weekday                   0
pm25_lag_1h            4811
pm25_lag_3h            4813
pm25_lag_24h           4834
pm25_roll_mean_24h     1857
location_id               0
lat                       0
lon                       0
location_id_code          0
is_holiday                0
dtype: int64

In [19]:
# Percentage of null values
(df_raw.isnull().sum()/(len(df_raw)))*100

pm25_hourly             5.959682
n_obs_hour            100.000000
latitude                0.000000
longitude               0.000000
_raw_item               0.000000
temperature           100.000000
relative_humidity     100.000000
wind_speed            100.000000
hour                    0.000000
sin_hour                0.000000
cos_hour                0.000000
weekday                 0.000000
pm25_lag_1h             5.960921
pm25_lag_3h             5.963399
pm25_lag_24h            5.989419
pm25_roll_mean_24h      2.300859
location_id             0.000000
lat                     0.000000
lon                     0.000000
location_id_code        0.000000
is_holiday              0.000000
dtype: float64

In [None]:
#Statistics
df_raw.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pm25_hourly,75899.0,73.427509,236.9581,-999.0,33.0,64.0,138.0,1990.0
n_obs_hour,0.0,,,,,,,
latitude,80709.0,28.63576,3.552736e-15,28.63576,28.63576,28.63576,28.63576,28.63576
longitude,80709.0,77.22445,0.0,77.22445,77.22445,77.22445,77.22445,77.22445
temperature,0.0,,,,,,,
relative_humidity,0.0,,,,,,,
wind_speed,0.0,,,,,,,
hour,80709.0,11.512149,6.924664,0.0,6.0,12.0,18.0,23.0
sin_hour,80709.0,-0.002836,0.705775,-1.0,-0.707107,0.0,0.707107,1.0
cos_hour,80709.0,-0.000555,0.7084389,-1.0,-0.707107,-1.83697e-16,0.707107,1.0


In [22]:
# Seperating Categorical and numerical columns
cat_cols = df_raw.select_dtypes(include = ['object']).columns
num_cols = df_raw.select_dtypes(include = np.number).columns.tolist()
print("Categorical Variables:")
print(cat_cols)
print("Numerical Variables:")
print(num_cols)

Categorical Variables:
Index(['_raw_item'], dtype='object')
Numerical Variables:
['pm25_hourly', 'n_obs_hour', 'latitude', 'longitude', 'temperature', 'relative_humidity', 'wind_speed', 'hour', 'sin_hour', 'cos_hour', 'weekday', 'pm25_lag_1h', 'pm25_lag_3h', 'pm25_lag_24h', 'pm25_roll_mean_24h', 'location_id', 'lat', 'lon', 'location_id_code']
