In [None]:
# Google Colab helper module that provides access to Google Drive.
from google.colab import drive
# Mount Drive at /content/drive; the CSV path used later in this notebook
# lives under this mount point.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Location of the weather CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/weather.csv'

# Load the full CSV into a DataFrame, then print the first five rows as a
# quick sanity check that the file parsed as expected.
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))

       country     location_name  latitude  longitude        timezone  \
0  Afghanistan             Kabul     34.52      69.18      Asia/Kabul   
1      Albania            Tirana     41.33      19.82   Europe/Tirane   
2      Algeria           Algiers     36.76       3.05  Africa/Algiers   
3      Andorra  Andorra La Vella     42.50       1.52  Europe/Andorra   
4       Angola            Luanda     -8.84      13.23   Africa/Luanda   

   last_updated_epoch      last_updated  temperature_celsius  \
0          1715849100  2024-05-16 13:15                 26.6   
1          1715849100  2024-05-16 10:45                 19.0   
2          1715849100  2024-05-16 09:45                 23.0   
3          1715849100  2024-05-16 10:45                  6.3   
4          1715849100  2024-05-16 09:45                 26.0   

   temperature_fahrenheit condition_text  ...  air_quality_PM2.5  \
0                    79.8  Partly Cloudy  ...                8.4   
1                    66.2  Partly cloudy

In [None]:

# Step 3: Structural overview of the frame -- dimensions, per-column info,
# summary statistics for the numeric columns, and the dtype of every column.
frame_dimensions = df.shape
print(" Dataset Shape:", frame_dimensions)

print("\n Data Info:")
df.info()  # writes its report straight to stdout

print("\n Summary Statistics (numeric columns):")
numeric_summary = df.describe()
display(numeric_summary)

print("\n Data Types:")
column_dtypes = df.dtypes
print(column_dtypes)

 Dataset Shape: (88273, 41)

 Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88273 entries, 0 to 88272
Data columns (total 41 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       88273 non-null  object 
 1   location_name                 88273 non-null  object 
 2   latitude                      88273 non-null  float64
 3   longitude                     88273 non-null  float64
 4   timezone                      88273 non-null  object 
 5   last_updated_epoch            88273 non-null  int64  
 6   last_updated                  88273 non-null  object 
 7   temperature_celsius           88273 non-null  float64
 8   temperature_fahrenheit        88273 non-null  float64
 9   condition_text                88273 non-null  object 
 10  wind_mph                      88273 non-null  float64
 11  wind_kph                      88273 non-null  float64
 12  wind_degree        

Unnamed: 0,latitude,longitude,last_updated_epoch,temperature_celsius,temperature_fahrenheit,wind_mph,wind_kph,wind_degree,pressure_mb,pressure_in,...,gust_kph,air_quality_Carbon_Monoxide,air_quality_Ozone,air_quality_Nitrogen_dioxide,air_quality_Sulphur_dioxide,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,moon_illumination
count,88273.0,88273.0,88273.0,88273.0,88273.0,88273.0,88273.0,88273.0,88273.0,88273.0,...,88273.0,88273.0,88273.0,88273.0,88273.0,88273.0,88273.0,88273.0,88273.0,88273.0
mean,19.125848,22.172647,1735507000.0,22.681788,72.828923,8.292613,13.348916,170.584743,1013.966536,29.941799,...,18.752776,517.197758,63.69156,15.485259,11.320585,26.212169,54.27585,1.760504,2.77002,50.746616
std,24.470798,65.836668,11353890.0,9.091355,16.364215,8.137603,13.09395,103.107058,11.786522,0.348003,...,15.089616,860.151958,33.050899,25.964224,42.271465,41.510377,163.172827,0.98809,2.590155,35.125443
min,-41.3,-175.2,1715849000.0,-24.9,-12.8,2.2,3.6,1.0,947.0,27.96,...,3.6,-9999.0,0.0,0.0,-9999.0,0.168,-1848.15,1.0,1.0,0.0
25%,3.75,-6.8361,1725710000.0,18.0,64.4,4.0,6.5,82.0,1010.0,29.83,...,10.7,234.95,43.0,1.295,0.8,7.03,10.49,1.0,1.0,16.0
50%,17.25,23.3167,1735555000.0,25.0,77.0,6.9,11.2,164.0,1013.0,29.92,...,16.2,327.1,61.0,4.44,2.405,14.985,22.015,1.0,2.0,52.0
75%,40.4,50.58,1745313000.0,28.4,83.1,11.4,18.4,257.0,1017.0,30.05,...,24.8,505.05,81.0,17.205,9.065,30.34,46.99,2.0,3.0,86.0
max,64.15,179.22,1755074000.0,49.2,120.6,1841.2,2963.2,360.0,3006.0,88.77,...,2970.4,38879.398,480.7,427.7,521.33,1614.1,6037.29,6.0,10.0,100.0



 Data Types:
country                          object
location_name                    object
latitude                        float64
longitude                       float64
timezone                         object
last_updated_epoch                int64
last_updated                     object
temperature_celsius             float64
temperature_fahrenheit          float64
condition_text                   object
wind_mph                        float64
wind_kph                        float64
wind_degree                       int64
wind_direction                   object
pressure_mb                     float64
pressure_in                     float64
precip_mm                       float64
precip_in                       float64
humidity                          int64
cloud                             int64
feels_like_celsius              float64
feels_like_fahrenheit           float64
visibility_km                   float64
visibility_miles                float64
uv_index                  

In [None]:
# Step 4: Missing values, out-of-range readings, and per-column coverage.

# Count nulls per column once and reuse the result for both the "any missing?"
# test and the per-column report.
null_counts = df.isnull().sum()
if null_counts.sum() == 0:
    missing_report = "None "
else:
    missing_report = null_counts[null_counts > 0]
print(" Missing Values:", missing_report)

# Rows whose humidity falls outside 0..100 or whose temperature falls outside
# -90..60 degrees Celsius are counted as anomalies.
humidity_outliers = df.query("humidity<0 or humidity>100")
temperature_outliers = df.query("temperature_celsius<-90 or temperature_celsius>60")
print("Humidity anomalies:", len(humidity_outliers),
      "| Temperature anomalies:", len(temperature_outliers))

# Non-null count for every column.
print("\nData coverage:\n", df.count())


 Missing Values: None 
Humidity anomalies: 0 | Temperature anomalies: 0

Data coverage:
 country                         88273
location_name                   88273
latitude                        88273
longitude                       88273
timezone                        88273
last_updated_epoch              88273
last_updated                    88273
temperature_celsius             88273
temperature_fahrenheit          88273
condition_text                  88273
wind_mph                        88273
wind_kph                        88273
wind_degree                     88273
wind_direction                  88273
pressure_mb                     88273
pressure_in                     88273
precip_mm                       88273
precip_in                       88273
humidity                        88273
cloud                           88273
feels_like_celsius              88273
feels_like_fahrenheit           88273
visibility_km                   88273
visibility_miles                88273

In [None]:

# Step 5: Clean missing / inconsistent data.
# Invalid readings are first converted to NaN, then every NaN is imputed:
# numeric columns with their column mean, text columns with "Unknown".
print(" Missing values before cleaning:")
missing_before = df.isna().sum()
print(missing_before[missing_before > 0])

# Handling missing/inconsistent data
# Use the same plausibility window as the Step 4 anomaly check (-90 .. 60 C).
# The previous floor of -50 disagreed with that check and would have discarded
# readings that Step 4 considers valid.
temperature_ok = df["temperature_celsius"].between(-90, 60)
df["temperature_celsius"] = df["temperature_celsius"].where(temperature_ok)

# The summary statistics report negative minima (e.g. -9999) in air-quality
# measurement columns. Treat negative values there as missing so they do not
# bias the means used for imputation or any later aggregate. Only float
# columns are touched, which leaves integer-typed columns unchanged.
pollutant_cols = [
    col for col in df.columns
    if str(col).startswith("air_quality_") and pd.api.types.is_float_dtype(df[col])
]
if pollutant_cols:
    df[pollutant_cols] = df[pollutant_cols].mask(df[pollutant_cols] < 0)
# NOTE(review): wind and pressure maxima in the summary also look implausible;
# no bounds are applied here because the acceptable range is not established
# anywhere in this notebook -- confirm and extend if needed.

# Impute numeric gaps with the column mean, computed after the masking above.
df = df.fillna(df.mean(numeric_only=True))

# Only non-numeric, non-datetime columns get the "Unknown" placeholder.
# Writing a string into a datetime column (possible when this cell is re-run
# after last_updated has been parsed) would turn it into object dtype and
# break later .dt access.
text_cols = df.select_dtypes(exclude=["number", "datetime", "datetimetz"]).columns
if len(text_cols) > 0:
    df[text_cols] = df[text_cols].fillna("Unknown")

print("\n Missing values after cleaning:")
missing_after = df.isna().sum()
print(missing_after[missing_after > 0])
print("\nCleaned Data (first 5 rows):")
print(df.head())


 Missing values before cleaning:
Series([], dtype: int64)

 Missing values after cleaning:
Series([], dtype: int64)

Cleaned Data (first 5 rows):
       country     location_name  latitude  longitude        timezone  \
0  Afghanistan             Kabul     34.52      69.18      Asia/Kabul   
1      Albania            Tirana     41.33      19.82   Europe/Tirane   
2      Algeria           Algiers     36.76       3.05  Africa/Algiers   
3      Andorra  Andorra La Vella     42.50       1.52  Europe/Andorra   
4       Angola            Luanda     -8.84      13.23   Africa/Luanda   

   last_updated_epoch        last_updated  temperature_celsius  \
0          1715849100 2024-05-16 13:15:00                 26.6   
1          1715849100 2024-05-16 10:45:00                 19.0   
2          1715849100 2024-05-16 09:45:00                 23.0   
3          1715849100 2024-05-16 10:45:00                  6.3   
4          1715849100 2024-05-16 09:45:00                 26.0   

   temperature_fah

In [None]:

# Parse the timestamp column; with errors="coerce", values that cannot be
# parsed become NaT instead of raising.
parsed_last_updated = pd.to_datetime(df["last_updated"], errors="coerce")
df["last_updated"] = parsed_last_updated

# Daily data
# An independent copy of the frame at its original row granularity (no
# aggregation is applied here).
daily_data = df.copy()
daily_preview = daily_data.head()
print(daily_preview)

       country     location_name  latitude  longitude        timezone  \
0  Afghanistan             Kabul     34.52      69.18      Asia/Kabul   
1      Albania            Tirana     41.33      19.82   Europe/Tirane   
2      Algeria           Algiers     36.76       3.05  Africa/Algiers   
3      Andorra  Andorra La Vella     42.50       1.52  Europe/Andorra   
4       Angola            Luanda     -8.84      13.23   Africa/Luanda   

   last_updated_epoch        last_updated  temperature_celsius  \
0          1715849100 2024-05-16 13:15:00                 26.6   
1          1715849100 2024-05-16 10:45:00                 19.0   
2          1715849100 2024-05-16 09:45:00                 23.0   
3          1715849100 2024-05-16 10:45:00                  6.3   
4          1715849100 2024-05-16 09:45:00                 26.0   

   temperature_fahrenheit condition_text  ...  air_quality_PM2.5  \
0                    79.8  Partly Cloudy  ...                8.4   
1                    66.2  P

In [None]:
# Add year-month column
# strftime keeps unparseable timestamps (NaT) as missing values, whereas
# to_period("M").astype(str) turns them into the literal label "NaT", which
# would then show up below as a bogus month group.
df["year_month"] = df["last_updated"].dt.strftime("%Y-%m")

# Monthly averages per country
# groupby drops rows whose key is missing by default, so rows without a valid
# timestamp are excluded from the averages instead of forming their own group.
# NOTE(review): every numeric column is averaged arithmetically, including
# wind_degree, latitude/longitude and last_updated_epoch; a plain mean of
# compass degrees is not a circular mean -- confirm before using it downstream.
monthly_data = (
    df.groupby(["country", "year_month"])
      .mean(numeric_only=True)
      .reset_index()
)

print(monthly_data.head())


       country year_month  latitude  longitude  last_updated_epoch  \
0  Afghanistan    2024-05     34.52      69.18        1.716478e+09   
1  Afghanistan    2024-06     34.52      69.18        1.718503e+09   
2  Afghanistan    2024-07     34.52      69.18        1.721101e+09   
3  Afghanistan    2024-08     34.52      69.18        1.723811e+09   
4  Afghanistan    2024-09     34.52      69.18        1.726442e+09   

   temperature_celsius  temperature_fahrenheit   wind_mph   wind_kph  \
0            20.305882               68.523529   7.164706  11.517647   
1            25.590000               78.043333  10.636667  17.110000   
2            31.303448               88.362069  12.886207  20.731034   
3            30.848387               87.538710  12.551613  20.196774   
4            28.216667               82.783333   7.583333  12.223333   

   wind_degree  ...   gust_kph  air_quality_Carbon_Monoxide  \
0   127.941176  ...  16.141176                  1424.094118   
1   126.833333  ... 