In [25]:
import numpy as np
import pandas as pd

In [30]:
import warnings 

warnings.filterwarnings("ignore")

In [31]:
data = pd.read_csv(r"D:\Data Science\datasets\New folder\GlobalWeatherRepository.csv")

In [32]:
data

Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination
0,Afghanistan,Kabul,34.5200,69.1800,Asia/Kabul,1715849100,2024-05-16 13:15,26.6,79.8,Partly Cloudy,...,8.400,26.600,1,1,04:50 AM,06:50 PM,12:12 PM,01:11 AM,Waxing Gibbous,55
1,Albania,Tirana,41.3300,19.8200,Europe/Tirane,1715849100,2024-05-16 10:45,19.0,66.2,Partly cloudy,...,1.100,2.000,1,1,05:21 AM,07:54 PM,12:58 PM,02:14 AM,Waxing Gibbous,55
2,Algeria,Algiers,36.7600,3.0500,Africa/Algiers,1715849100,2024-05-16 09:45,23.0,73.4,Sunny,...,10.400,18.400,1,1,05:40 AM,07:50 PM,01:15 PM,02:14 AM,Waxing Gibbous,55
3,Andorra,Andorra La Vella,42.5000,1.5200,Europe/Andorra,1715849100,2024-05-16 10:45,6.3,43.3,Light drizzle,...,0.700,0.900,1,1,06:31 AM,09:11 PM,02:12 PM,03:31 AM,Waxing Gibbous,55
4,Angola,Luanda,-8.8400,13.2300,Africa/Luanda,1715849100,2024-05-16 09:45,26.0,78.8,Partly cloudy,...,183.400,262.300,5,10,06:12 AM,05:55 PM,01:17 PM,12:38 AM,Waxing Gibbous,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45049,Venezuela,Caracas,10.5000,-66.9167,America/Caracas,1735898400,2025-01-03 06:00,17.7,63.9,Mist,...,3.700,4.810,1,1,06:46 AM,06:18 PM,09:54 AM,09:58 PM,Waxing Crescent,12
45050,Vietnam,Hanoi,21.0333,105.8500,Asia/Bangkok,1735901100,2025-01-03 17:45,23.0,73.4,Clear,...,73.815,74.370,4,10,06:34 AM,05:28 PM,09:16 AM,08:49 PM,Waxing Crescent,9
45051,Yemen,Sanaa,15.3547,44.2067,Asia/Aden,1735901100,2025-01-03 13:45,20.0,68.1,Sunny,...,32.005,59.940,2,3,06:30 AM,05:45 PM,09:22 AM,09:12 PM,Waxing Crescent,10
45052,Zambia,Lusaka,-15.4167,28.2833,Africa/Lusaka,1735901100,2025-01-03 12:45,19.2,66.6,Light rain shower,...,20.165,20.535,2,2,05:41 AM,06:42 PM,08:50 AM,09:48 PM,Waxing Crescent,10


In [33]:
# Convert columns to datetime
date_columns = ['last_updated', 'sunrise', 'sunset', 'moonrise', 'moonset']
for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce')


In [34]:
print(data[date_columns].isna().sum())

for col in date_columns:
    data[col].fillna(data[col].mode()[0], inplace=True)


last_updated       0
sunrise            0
sunset             0
moonrise        1548
moonset         1357
dtype: int64


In [35]:
data['last_updated_hour'] = data['last_updated'].dt.hour
data['last_updated_day'] = data['last_updated'].dt.day
data['last_updated_month'] = data['last_updated'].dt.month
data['last_updated_weekday'] = data['last_updated'].dt.weekday


In [36]:
data['daylight_duration'] = (data['sunset'] - data['sunrise']).dt.total_seconds() / 3600  # in hours
data['moonlight_duration'] = (data['moonset'] - data['moonrise']).dt.total_seconds() / 3600


In [37]:
import numpy as np

# Encode hour as cyclical feature
data['hour_sin'] = np.sin(2 * np.pi * data['last_updated_hour'] / 24)
data['hour_cos'] = np.cos(2 * np.pi * data['last_updated_hour'] / 24)


In [38]:
data.drop(['country', 'location_name', 'timezone',
         'wind_direction', 'moon_phase'], axis=1, inplace=True)


In [39]:
# feature interactions
data['wind_power'] = data['wind_mph'] * data['wind_degree']
data['temperature_feels_diff'] = data['feels_like_celsius'] - data['temperature_celsius']


In [40]:
data['last_updated'] = pd.to_datetime(data['last_updated'])


In [41]:
# Extractimg features from 'last_updated'
data['year'] = data['last_updated'].dt.year
data['month'] = data['last_updated'].dt.month
data['day'] = data['last_updated'].dt.day
data['hour'] = data['last_updated'].dt.hour
data['minute'] = data['last_updated'].dt.minute
data['day_of_week'] = data['last_updated'].dt.dayofweek  # Monday=0, Sunday=6
data['week_of_year'] = data['last_updated'].dt.isocalendar().week


In [42]:
data.drop(columns=['last_updated','sunrise','sunset','moonrise','moonset'], axis=1, inplace=True)


In [43]:
df = data[['temperature_fahrenheit', 'feels_like_celsius', 'temperature_feels_diff',
                     'feels_like_fahrenheit', 'last_updated_day', 'moonlight_duration',
                     'day', 'moon_illumination', 'wind_power', 'daylight_duration','temperature_celsius','condition_text']]


In [44]:
df.head()

Unnamed: 0,temperature_fahrenheit,feels_like_celsius,temperature_feels_diff,feels_like_fahrenheit,last_updated_day,moonlight_duration,day,moon_illumination,wind_power,daylight_duration,temperature_celsius,condition_text
0,79.8,25.3,-1.3,77.5,16,-11.016667,16,55,2805.4,14.0,26.6,Partly Cloudy
1,66.2,19.0,0.0,66.2,16,-10.733333,16,55,2208.0,14.55,19.0,Partly cloudy
2,73.4,24.6,1.6,76.4,16,-11.016667,16,55,2632.0,14.166667,23.0,Sunny
3,43.3,3.8,-2.5,38.9,16,-10.683333,16,55,1591.0,14.666667,6.3,Light drizzle
4,78.8,28.7,2.7,83.6,16,-12.65,16,55,1215.0,11.716667,26.0,Partly cloudy


In [45]:
df.columns

Index(['temperature_fahrenheit', 'feels_like_celsius',
       'temperature_feels_diff', 'feels_like_fahrenheit', 'last_updated_day',
       'moonlight_duration', 'day', 'moon_illumination', 'wind_power',
       'daylight_duration', 'temperature_celsius', 'condition_text'],
      dtype='object')

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45054 entries, 0 to 45053
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   temperature_fahrenheit  45054 non-null  float64
 1   feels_like_celsius      45054 non-null  float64
 2   temperature_feels_diff  45054 non-null  float64
 3   feels_like_fahrenheit   45054 non-null  float64
 4   last_updated_day        45054 non-null  int32  
 5   moonlight_duration      45054 non-null  float64
 6   day                     45054 non-null  int32  
 7   moon_illumination       45054 non-null  int64  
 8   wind_power              45054 non-null  float64
 9   daylight_duration       45054 non-null  float64
 10  temperature_celsius     45054 non-null  float64
 11  condition_text          45054 non-null  object 
dtypes: float64(8), int32(2), int64(1), object(1)
memory usage: 3.8+ MB


In [47]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import joblib

In [48]:
X = df.drop(['temperature_celsius', 'condition_text'], axis=1)
y_reg = df['temperature_celsius']
y_clf = df['condition_text']

In [49]:
# Encode the categorical target
label_encoder = LabelEncoder()
y_clf_encoded = label_encoder.fit_transform(y_clf)

In [50]:
X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
    X, y_reg, y_clf_encoded, test_size=0.2, random_state=42)


In [51]:
from sklearn.ensemble import RandomForestRegressor

In [52]:
# Train the regression model
regressor = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor())
    ])
regressor.fit(X_train, y_reg_train)


In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [56]:
y_reg_pred = regressor.predict(X_test)

# Mean Absolute Error
mae = mean_absolute_error(y_reg_test, y_reg_pred)
print(f"Mean Absolute Error: {mae}")

# Mean Squared Error
mse = mean_squared_error(y_reg_test, y_reg_pred)
print(f"Mean Squared Error: {mse}")

# R2 Score
r2 = r2_score(y_reg_test, y_reg_pred)
print(f"R2 Score: {r2}")


Mean Absolute Error: 0.0073055154811016215
Mean Squared Error: 0.0006346323382532554
R2 Score: 0.9999918113136208


In [55]:
# Train the classification model
classifier = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])
classifier.fit(X_train, y_clf_train)

In [66]:
joblib.dump(regressor, 'regressor_model.pkl')
joblib.dump(classifier, 'classifier_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']