In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Load the data

In [111]:
# Read the weather observations CSV into the working DataFrame.
# NOTE(review): the file name is spelled "wather" — confirm it matches the file on disk.
df = pd.read_csv(filepath_or_buffer='data/wather_data.csv')

In [99]:
# (rows, columns) of the freshly loaded frame.
df.shape

(100, 6)

***Check basic information***

* Get an overview of columns, data types, and non-null counts.

In [100]:
# Column names, dtypes and non-null counts in one report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 80 non-null     object 
 1   city                 100 non-null    object 
 2   temperature_celsius  42 non-null     float64
 3   humidity_percent     47 non-null     float64
 4   wind_speed_kph       55 non-null     float64
 5   weather_condition    84 non-null     object 
dtypes: float64(3), object(3)
memory usage: 4.8+ KB


* Summary statistics for numerical columns.

In [101]:
# Count, mean, spread and quartiles for the numeric columns.
df.describe()

Unnamed: 0,temperature_celsius,humidity_percent,wind_speed_kph
count,42.0,47.0,55.0
mean,8.4,58.978723,14.352727
std,6.713147,17.352471,8.906674
min,-4.1,30.0,0.8
25%,3.75,44.5,7.2
50%,8.4,60.0,14.2
75%,13.65,74.5,21.1
max,19.3,89.0,29.9


- Preview the first four rows.

In [102]:
# Show the first four rows.
df.head(n=4)

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.0,10.0,Sunny
1,01/02/2023,New York,,65.0,12.0,Cloudy
2,03-01-2023,New York,7.0,,8.0,Rainy
3,,London,8.0,70.0,15.0,Unknown


- Total missing values per column

In [103]:

# Number of missing values in each column.
df.isna().sum()

date                   20
city                    0
temperature_celsius    58
humidity_percent       53
wind_speed_kph         45
weather_condition      16
dtype: int64

- Total missing values across all columns

In [104]:
# Count every missing cell in the frame with a single reduction over the
# boolean mask; int() turns the NumPy integer into a plain Python int.
int(df.isna().to_numpy().sum())

192

In [115]:
# List the distinct raw date strings to see which layouts occur in the data.
date_column = df['date']
unique_dates = date_column.unique()
print(unique_dates)


['2023-01-01' '01/02/2023' '03-01-2023' nan '2023-01-02' '01/03/2023'
 '01-25-2023' '2023-01-12' '14-01-2023' '01-07-2023' '06/01/2023'
 '01/15/2023' '14.01.2023' '25/01/2023' '01/19/2023' '17/01/2023'
 '2023-01-05' '07/01/2023' '2023-01-26' '01/16/2023' '01/26/2023'
 '18-01-2023' '25.01.2023' '07.01.2023' '01/28/2023' '2023-01-17'
 '27.01.2023' '2023-01-29' '01-21-2023' '26.01.2023' '21.01.2023'
 '01/10/2023' '17.01.2023' '01/08/2023' '12.01.2023' '21/01/2023'
 '22/01/2023' '01-01-2023' '18.01.2023' '02-01-2023' '30.01.2023'
 '01-27-2023' '2023-01-08' '29.01.2023' '04-01-2023' '01/23/2023'
 '01-15-2023' '01-24-2023' '01-05-2023' '01-16-2023' '20-01-2023'
 '20.01.2023' '24.01.2023' '16/01/2023' '24/01/2023' '27-01-2023'
 '06-01-2023' '20/01/2023' '01-06-2023' '09-01-2023' '2023-01-11'
 '15/01/2023' '01/12/2023']


- Fill missing temperatures with city-wise average

In [105]:
def _fill_with_group_mean(temps):
    """Replace missing values in one group's temperatures with that group's mean."""
    return temps.fillna(temps.mean())


# Impute each missing temperature with the mean of the readings from the same city.
temps_by_city = df.groupby('city')['temperature_celsius']
df["temperature_celsius"] = temps_by_city.transform(_fill_with_group_mean)

In [106]:
# (rows, columns) after the temperature fill.
df.shape

(100, 6)

- Drop rows with missing dates

In [107]:
# Keep only rows that have a date. Rebinding `df` (rather than mutating it
# with inplace=True) matches the other filtering step in this notebook and
# keeps the operation chainable.
df = df.dropna(subset=['date'])

In [108]:
# Preview the first rows after dropping rows without a date.
df.head()

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.0,10.0,Sunny
1,01/02/2023,New York,8.923529,65.0,12.0,Cloudy
2,03-01-2023,New York,7.0,,8.0,Rainy
4,2023-01-02,London,6.0,75.0,20.0,Snowy
5,01/03/2023,London,9.9125,80.0,18.0,Cloudy


- Convert dates to a standardized format (`YYYY-MM-DD`)

In [None]:
import pandas as pd
import numpy as np

# Keep working on the `df` prepared by the earlier cells (city-wise temperature
# fill, rows without a date dropped). Re-reading a placeholder CSV here would
# discard that cleaning and fail when the placeholder file does not exist.

# Standardize Date Format using Multiple Formats
def standardize_date(date_str):
    """Convert a date written in one of several known layouts to ``YYYY-MM-DD``.

    The layouts are tried in a fixed order and the first one that parses wins:
    ``YYYY-MM-DD``, ``DD-MM-YYYY``, ``DD.MM.YYYY``, ``MM/DD/YYYY``,
    ``MM-DD-YYYY``, ``DD/MM/YYYY``. For ambiguous strings this means
    dash-separated values are read day-first (``03-01-2023`` -> 3 January)
    and slash-separated values are read month-first (``01/02/2023`` ->
    2 January); the other reading is only used when the first is impossible.

    Parameters
    ----------
    date_str : str or missing value
        The raw date. Leading/trailing whitespace in strings is ignored.

    Returns
    -------
    str or float
        The date as a ``YYYY-MM-DD`` string, or ``np.nan`` when the input is
        missing, blank, or matches none of the layouts.
    """
    if pd.isna(date_str):  # Missing input stays missing (np.nan, not NaT)
        return np.nan

    if isinstance(date_str, str):
        # Padding would make every exact-format match fail.
        date_str = date_str.strip()
        if not date_str:
            return np.nan

    for fmt in ("%Y-%m-%d", "%d-%m-%Y", "%d.%m.%Y", "%m/%d/%Y", "%m-%d-%Y", "%d/%m/%Y"):
        try:
            return pd.to_datetime(date_str, format=fmt).strftime('%Y-%m-%d')
        except ValueError:
            # Layout did not match (or gave an impossible date); try the next one.
            continue
    return np.nan  # If no format matches

# Normalise every entry of the date column, then list the distinct results so
# that values no format could parse (NaN) are easy to spot.
standardized_dates = df['date'].apply(standardize_date)
df['date'] = standardized_dates

print(standardized_dates.unique())


In [110]:
# Preview the first rows after the date conversion.
df.head()

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.0,10.0,Sunny
1,2023-02-01,New York,8.923529,65.0,12.0,Cloudy
2,,New York,7.0,,8.0,Rainy
4,2023-01-02,London,6.0,75.0,20.0,Snowy
5,2023-03-01,London,9.9125,80.0,18.0,Cloudy


- Remove rows with a missing or "Unknown" weather condition

In [79]:
# Keep a row only when its weather condition is present and is not the
# literal placeholder "Unknown".
condition = df['weather_condition']
has_known_condition = condition.notna() & condition.ne("Unknown")
df = df.loc[has_known_condition]

In [80]:
# (rows, columns) left after the weather-condition filter.
df.shape

(53, 6)

In [81]:
# Preview the first rows that survived the weather-condition filter.
df.head()

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.0,10.0,Sunny
1,,New York,8.923529,65.0,12.0,Cloudy
2,,New York,7.0,,8.0,Rainy
4,2023-01-02,London,6.0,75.0,20.0,Snowy
5,,London,9.9125,80.0,18.0,Cloudy


In [43]:
# (rows, columns) of the current frame.
# NOTE(review): repeats the shape check a few cells up, and its saved output
# looks like it comes from an earlier run — re-run or remove.
df.shape

(80, 6)

In [82]:
# Missing values still present in each column after cleaning.
df.isna().sum()

date                   43
city                    0
temperature_celsius     0
humidity_percent       26
wind_speed_kph         21
weather_condition       0
dtype: int64