In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw dataset from 'data.csv'; the path is relative, so the file is
# resolved against the notebook's current working directory.
df = pd.read_csv('data.csv')

In [3]:
# Preview the first five rows to sanity-check that the file parsed as expected.
df.head(5)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1548892800,2019-01-31 00:00:00 +0000 UTC,-18000,Toronto,43.653226,-79.383184,-17.57,1207.0,-21.6,-24.57,...,20.06,,,,,40,600,Snow,light snow,13n
1,1548896400,2019-01-31 01:00:00 +0000 UTC,-18000,Toronto,43.653226,-79.383184,-18.42,402.0,-22.58,-25.42,...,19.0,,,,,40,600,Snow,light snow,13n
2,1548900000,2019-01-31 02:00:00 +0000 UTC,-18000,Toronto,43.653226,-79.383184,-19.24,3218.0,-23.07,-26.24,...,20.6,,,,,100,600,Snow,light snow,13n
3,1548903600,2019-01-31 03:00:00 +0000 UTC,-18000,Toronto,43.653226,-79.383184,-19.4,10000.0,-23.83,-26.4,...,18.0,,,,,20,801,Clouds,few clouds,02n
4,1548907200,2019-01-31 04:00:00 +0000 UTC,-18000,Toronto,43.653226,-79.383184,-19.71,10000.0,-24.44,-26.71,...,17.0,,,,,0,800,Clear,sky is clear,01n


In [4]:
# (row count, column count) of the loaded frame.
df.shape

(44256, 28)

In [5]:
# Per-column dtype and non-null counts. In the recorded output, sea_level and
# grnd_level contain no non-null values at all and visibility is only partly
# populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44256 entries, 0 to 44255
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   dt                   44256 non-null  int64  
 1   dt_iso               44256 non-null  object 
 2   timezone             44256 non-null  int64  
 3   city_name            44256 non-null  object 
 4   lat                  44256 non-null  float64
 5   lon                  44256 non-null  float64
 6   temp                 44256 non-null  float64
 7   visibility           35094 non-null  float64
 8   dew_point            44256 non-null  float64
 9   feels_like           44256 non-null  float64
 10  temp_min             44256 non-null  float64
 11  temp_max             44256 non-null  float64
 12  pressure             44256 non-null  int64  
 13  sea_level            0 non-null      float64
 14  grnd_level           0 non-null      float64
 15  humidity             44256 non-null 

In [6]:
# Categorical features are the string-valued (object dtype) columns -- numeric
# int/float columns are NOT selected here. 'dt_iso' is also stored as object,
# but it holds a timestamp string rather than a category, so it is excluded.
categorical_columns = [
    name
    for name in df.select_dtypes(include="object").columns
    if name != 'dt_iso'
]
print(categorical_columns)

['city_name', 'weather_main', 'weather_description', 'weather_icon']


In [7]:
# Distinct values of city_name; the recorded output shows a single city, so
# this column is constant across all rows.
df['city_name'].unique()

array(['Toronto'], dtype=object)

In [8]:
# Distinct values of weather_main (8 values in the recorded output).
df['weather_main'].unique()

array(['Snow', 'Clouds', 'Clear', 'Mist', 'Fog', 'Rain', 'Thunderstorm',
       'Haze'], dtype=object)

In [9]:
# Distinct values of weather_description (19 values in the recorded output).
df['weather_description'].unique()

array(['light snow', 'few clouds', 'sky is clear', 'scattered clouds',
       'broken clouds', 'overcast clouds', 'mist', 'fog', 'light rain',
       'moderate rain', 'snow', 'heavy snow', 'heavy intensity rain',
       'thunderstorm with light rain', 'thunderstorm with rain',
       'thunderstorm with heavy rain', 'thunderstorm', 'haze',
       'very heavy rain'], dtype=object)

In [10]:
# Distinct values of weather_icon (16 values in the recorded output).
df['weather_icon'].unique()

array(['13n', '02n', '01n', '01d', '03d', '04d', '04n', '50n', '50d',
       '10d', '10n', '03n', '13d', '02d', '11n', '11d'], dtype=object)

In [13]:
# One-hot encode the categorical columns with scikit-learn's OneHotEncoder.
# drop='first' removes the first level of every feature so the resulting
# indicator columns are not perfectly collinear.

# Encode only the columns that are still present in `df`. After the first run
# the originals have been replaced by their indicator columns, so re-running
# this cell becomes a no-op instead of raising a KeyError.
columns_to_encode = [col for col in categorical_columns if col in df.columns]

if columns_to_encode:
    # The dense-output keyword was renamed from `sparse` to `sparse_output` in
    # scikit-learn 1.2 and the old name was removed in 1.4. Keeping the default
    # sparse output and densifying with .toarray() works on both sides of that
    # rename.
    onehot_encoder = OneHotEncoder(drop='first')
    encoded_data = onehot_encoder.fit_transform(df[columns_to_encode]).toarray()

    # Reuse df's index so the column-wise concat aligns row-for-row even if the
    # frame no longer carries a default RangeIndex.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=onehot_encoder.get_feature_names_out(columns_to_encode),
        index=df.index,
    )

    # Replace the original categorical columns with their encoded counterparts
    # in a single concat (all features are fitted at once, not one per loop).
    df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1)

df.head()





Unnamed: 0,dt,dt_iso,timezone,lat,lon,temp,visibility,dew_point,feels_like,temp_min,...,weather_icon_04d,weather_icon_04n,weather_icon_10d,weather_icon_10n,weather_icon_11d,weather_icon_11n,weather_icon_13d,weather_icon_13n,weather_icon_50d,weather_icon_50n
0,1548892800,2019-01-31 00:00:00 +0000 UTC,-18000,43.653226,-79.383184,-17.57,1207.0,-21.6,-24.57,-18.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1548896400,2019-01-31 01:00:00 +0000 UTC,-18000,43.653226,-79.383184,-18.42,402.0,-22.58,-25.42,-19.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1548900000,2019-01-31 02:00:00 +0000 UTC,-18000,43.653226,-79.383184,-19.24,3218.0,-23.07,-26.24,-20.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1548903600,2019-01-31 03:00:00 +0000 UTC,-18000,43.653226,-79.383184,-19.4,10000.0,-23.83,-26.4,-20.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1548907200,2019-01-31 04:00:00 +0000 UTC,-18000,43.653226,-79.383184,-19.71,10000.0,-24.44,-26.71,-21.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
