In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### Loading the dataset

In [3]:
weatherData = pd.read_csv("../data/GlobalWeatherRepository.csv")
df = weatherData.copy()

In [4]:
df.head(10)

Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination
0,Afghanistan,Kabul,34.52,69.18,Asia/Kabul,1715849100,2024-05-16 13:15,26.6,79.8,Partly Cloudy,...,8.4,26.6,1,1,04:50 AM,06:50 PM,12:12 PM,01:11 AM,Waxing Gibbous,55
1,Albania,Tirana,41.33,19.82,Europe/Tirane,1715849100,2024-05-16 10:45,19.0,66.2,Partly cloudy,...,1.1,2.0,1,1,05:21 AM,07:54 PM,12:58 PM,02:14 AM,Waxing Gibbous,55
2,Algeria,Algiers,36.76,3.05,Africa/Algiers,1715849100,2024-05-16 09:45,23.0,73.4,Sunny,...,10.4,18.4,1,1,05:40 AM,07:50 PM,01:15 PM,02:14 AM,Waxing Gibbous,55
3,Andorra,Andorra La Vella,42.5,1.52,Europe/Andorra,1715849100,2024-05-16 10:45,6.3,43.3,Light drizzle,...,0.7,0.9,1,1,06:31 AM,09:11 PM,02:12 PM,03:31 AM,Waxing Gibbous,55
4,Angola,Luanda,-8.84,13.23,Africa/Luanda,1715849100,2024-05-16 09:45,26.0,78.8,Partly cloudy,...,183.4,262.3,5,10,06:12 AM,05:55 PM,01:17 PM,12:38 AM,Waxing Gibbous,55
5,Antigua and Barbuda,Saint John's,17.12,-61.85,America/Antigua,1715849100,2024-05-16 04:45,26.0,78.8,Partly cloudy,...,1.2,4.5,1,1,05:36 AM,06:32 PM,01:05 PM,01:14 AM,Waxing Gibbous,55
6,Argentina,Buenos Aires,-34.59,-58.67,America/Argentina/Buenos_Aires,1715849100,2024-05-16 05:45,8.0,46.4,Clear,...,4.0,5.3,1,1,07:43 AM,05:59 PM,02:36 PM,01:04 AM,Waxing Gibbous,55
7,Armenia,Yerevan,40.18,44.51,Asia/Yerevan,1715849100,2024-05-16 12:45,19.0,66.2,Partly cloudy,...,0.8,0.9,1,1,05:45 AM,08:12 PM,01:17 PM,02:31 AM,Waxing Gibbous,55
8,Australia,Canberra,-35.28,149.22,Australia/Sydney,1715849100,2024-05-16 18:45,9.0,48.2,Clear,...,3.7,5.4,1,1,06:52 AM,05:07 PM,01:31 PM,No moonset,Waxing Gibbous,55
9,Austria,Vienna,48.2,16.37,Europe/Vienna,1715849100,2024-05-16 10:45,16.0,60.8,Partly cloudy,...,3.7,4.4,1,1,05:14 AM,08:29 PM,01:00 PM,02:42 AM,Waxing Gibbous,55


### Number Of Columns

In [4]:
print(f"Columns: {df.columns}")
print("==================="*5)
print(f"Number of columns:{len(df.columns)}")

Columns: Index(['country', 'location_name', 'latitude', 'longitude', 'timezone',
       'last_updated_epoch', 'last_updated', 'temperature_celsius',
       'temperature_fahrenheit', 'condition_text', 'wind_mph', 'wind_kph',
       'wind_degree', 'wind_direction', 'pressure_mb', 'pressure_in',
       'precip_mm', 'precip_in', 'humidity', 'cloud', 'feels_like_celsius',
       'feels_like_fahrenheit', 'visibility_km', 'visibility_miles',
       'uv_index', 'gust_mph', 'gust_kph', 'air_quality_Carbon_Monoxide',
       'air_quality_Ozone', 'air_quality_Nitrogen_dioxide',
       'air_quality_Sulphur_dioxide', 'air_quality_PM2.5', 'air_quality_PM10',
       'air_quality_us-epa-index', 'air_quality_gb-defra-index', 'sunrise',
       'sunset', 'moonrise', 'moonset', 'moon_phase', 'moon_illumination'],
      dtype='object')
Number of columns:41


In [5]:
df.describe

<bound method NDFrame.describe of            country     location_name  latitude  longitude         timezone  \
0      Afghanistan             Kabul   34.5200    69.1800       Asia/Kabul   
1          Albania            Tirana   41.3300    19.8200    Europe/Tirane   
2          Algeria           Algiers   36.7600     3.0500   Africa/Algiers   
3          Andorra  Andorra La Vella   42.5000     1.5200   Europe/Andorra   
4           Angola            Luanda   -8.8400    13.2300    Africa/Luanda   
...            ...               ...       ...        ...              ...   
56901    Venezuela           Caracas   10.5000   -66.9167  America/Caracas   
56902      Vietnam             Hanoi   21.0333   105.8500     Asia/Bangkok   
56903        Yemen             Sanaa   15.3547    44.2067        Asia/Aden   
56904       Zambia            Lusaka  -15.4167    28.2833    Africa/Lusaka   
56905     Zimbabwe            Harare  -17.8178    31.0447    Africa/Harare   

       last_updated_epoch    

### Checking the datatype of columns

In [6]:
number_cols = df.select_dtypes(include='number')
categorical_cols = df.select_dtypes(include='object')

print(f"Number of numeric columns: {number_cols.shape[1]}")
print(f"Number of categorical columns: {categorical_cols.shape[1]}")

Number of numeric columns: 30
Number of categorical columns: 11


### Checking for null values

In [7]:
missing_values = df.isna().sum().sum()
print(f"Number of missing values: {missing_values}")
if missing_values > 0:
    print("There are missing values in the dataset")
else:
    print("DataSet contains no missing values")

Number of missing values: 0
DataSet contains no missing values


### Checking for duplicates

In [8]:
duplicates = df.duplicated().sum()

if duplicates > 0:
    print(f"Duplicates present in the Dataset \nTotal Number of Duplicated rows are {duplicates}")
else:
    print("No duplicates are present in the Dataset")

No duplicates are present in the Dataset


### Finding the country names which are not in English

In [9]:
df[~df['country'].str.match(r"^[A-Za-z\s'-]+$")]

Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination
11474,Malásia,Ivory Ivory Ban,4.63,118.37,Asia/Kuching,1720875600,2024-07-13 21:00,24.3,75.7,Light rain shower,...,1.8,2.3,1,1,06:01 AM,06:23 PM,11:28 AM,11:43 PM,First Quarter,41
11484,كولومبيا,Costa Rica,6.43,-70.92,America/Bogota,1720875600,2024-07-13 08:00,23.1,73.5,Mist,...,0.5,0.5,1,1,05:23 AM,06:02 PM,11:25 AM,11:30 PM,First Quarter,41
11504,Гватемала,New Guatemala,14.62,-90.53,America/Guatemala,1720875600,2024-07-13 07:00,15.9,60.6,Mist,...,40.8,68.8,3,4,05:41 AM,06:35 PM,11:54 AM,11:52 PM,First Quarter,41
11526,Польша,Beirut,51.12,17.53,Europe/Warsaw,1720875600,2024-07-13 15:00,24.2,75.5,Cloudy,...,2.5,2.6,1,1,04:50 AM,09:00 PM,12:56 PM,11:51 PM,First Quarter,41
11540,Polônia,Moldova,53.33,21.62,Europe/Warsaw,1720875600,2024-07-13 15:00,27.7,81.9,Partly Cloudy,...,2.5,2.6,1,1,04:22 AM,08:56 PM,12:42 PM,11:31 PM,First Quarter,41
11547,Турция,Yaren,39.55,27.62,Europe/Istanbul,1720875600,2024-07-13 16:00,34.0,93.2,Partly cloudy,...,11.3,14.2,1,1,05:54 AM,08:37 PM,01:02 PM,12:06 AM,First Quarter,41
11577,Südkorea,Seoul,37.57,127.0,Asia/Seoul,1720875600,2024-07-13 22:00,26.1,79.0,Clear,...,70.2,74.9,4,9,05:22 AM,07:54 PM,12:07 PM,11:45 PM,First Quarter,41
11581,Bélgica,Bern,51.1,4.75,Europe/Brussels,1720875600,2024-07-13 15:00,17.4,63.3,Partly cloudy,...,1.8,3.4,1,1,05:49 AM,09:23 PM,01:34 PM,12:24 AM,First Quarter,41
11588,Turkménistan,Krasnyy Turkmenistan,37.7,65.37,Asia/Ashgabat,1720875600,2024-07-13 18:00,37.8,100.1,Sunny,...,6.7,32.6,1,1,05:28 AM,08:01 PM,12:23 PM,11:55 PM,First Quarter,41
11592,火鸡,-Kingdom,38.85,34.65,Europe/Istanbul,1720875600,2024-07-13 16:00,29.7,85.4,Sunny,...,3.0,3.4,1,1,05:28 AM,08:07 PM,12:32 PM,11:59 PM,First Quarter,41


In [10]:
df['country'] = df['country'].apply(lambda x: 'Malaysia' if x == 'Malásia' else x)
df['country'] = df['country'].apply(lambda x: 'Colombia' if x == 'كولومبيا' else x)
df['country'] = df['country'].apply(lambda x: 'Guatemala' if x == 'Гватемала' else x)
df['country'] = df['country'].apply(lambda x: 'Poland' if x == 'Польша' else x)
df['country'] = df['country'].apply(lambda x: 'Poland' if x == 'Polônia' else x)
df['country'] = df['country'].apply(lambda x: 'Turkey' if x == 'Турция' else x)
df['country'] = df['country'].apply(lambda x: 'South Korea' if x == 'Südkorea' else x)
df['country'] = df['country'].apply(lambda x: 'Belgium' if x == 'Bélgica' else x)
df['country'] = df['country'].apply(lambda x: 'Turkmenistan' if x == 'Turkménistan' else x)
df['country'] = df['country'].apply(lambda x: 'Turkey' if x == '火鸡' else x)

In [11]:
df[~df['country'].str.match(r"^[A-Za-z\s'-]+$")]

Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination


In [12]:
print("Number of Unique Countries:",df['country'].nunique())
print("Most Occuring Country in this dataset: ",max(df['country']))
print("Least Country in this dataset: ",min(df['country']))


Number of Unique Countries: 200
Most Occuring Country in this dataset:  Zimbabwe
Least Country in this dataset:  Afghanistan


### Drop unnecessary columns

In [13]:

columns_to_keep = [
    'country', 'location_name', 'latitude', 'longitude', 'timezone',
    'last_updated', 'temperature_celsius', 'condition_text', 'wind_kph',
    'wind_degree', 'wind_direction', 'pressure_mb', 'precip_mm', 'humidity',
    'cloud', 'feels_like_celsius', 'visibility_km', 'uv_index', 'gust_kph',
    'air_quality_Carbon_Monoxide', 'air_quality_Ozone',
    'air_quality_Nitrogen_dioxide', 'air_quality_Sulphur_dioxide',
    'air_quality_PM2.5', 'air_quality_PM10'
]

df = df[columns_to_keep]
os.makedirs("../data/", exist_ok=True)
df.to_csv("../data/cleaned_weather.csv", index=False)