In [None]:
import pandas as pd

In [None]:
# Load the GlobalLandTemperaturesByCity CSV into a DataFrame. The path is
# relative, so it resolves against the notebook's working directory.
csv_path = '../trainingData/GlobalLandTemperaturesByCity.csv'
df = pd.read_csv(csv_path)

In [None]:
# Quick sanity check of the loaded frame: its (rows, columns) shape followed
# by the number of missing values found in each column.
frame_shape = df.shape
missing_per_column = df.isna().sum()
print(frame_shape)
print(missing_per_column)

In [None]:
# Discard every row that has at least one missing value (in place), then
# parse the 'dt' column into pandas datetimes.
df.dropna(inplace=True)
df['dt'] = pd.to_datetime(df['dt'])

# Derive calendar features from the parsed timestamps via the .dt accessor.
calendar = df['dt'].dt
df['day_of_year'] = calendar.dayofyear
df['month'] = calendar.month
df['year'] = calendar.year
df['day_of_week'] = calendar.dayofweek  # 0 = Monday ... 6 = Sunday

# Preview the first rows, including the new feature columns.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame containing the dataset

# Thin the data: keep only every 4th row by position (0, 4, 8, ...).
reduced_df = df.iloc[::4]

# The target is the temperature column; the features are all remaining
# columns except the raw 'dt' timestamp.
y = reduced_df['AverageTemperature']
X = reduced_df.drop(columns=['dt', 'AverageTemperature'])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each piece to verify the split.
for label, part in (
    ("Training set - Features shape:", X_train),
    ("Training set - Target shape:", y_train),
    ("Testing set - Features shape:", X_test),
    ("Testing set - Target shape:", y_test),
):
    print(label, part.shape)

In [None]:
# Remove the 'City' and 'Country' columns from both feature sets; the same
# column list is used for train and test so the two stay aligned.
dropped_columns = ['City', 'Country']
X_train = X_train.drop(columns=dropped_columns)
X_test = X_test.drop(columns=dropped_columns)

# Report the shapes again so the reduced feature count is visible.
for label, part in (
    ("Training set - Features shape:", X_train),
    ("Training set - Target shape:", y_train),
    ("Testing set - Features shape:", X_test),
    ("Testing set - Target shape:", y_test),
):
    print(label, part.shape)

In [None]:
# Define a function to convert latitude and longitude values to numeric format
def convert_to_numeric(value):
    """Convert a coordinate such as ``'57.05N'`` or ``'10.33W'`` to a signed number.

    Parameters
    ----------
    value : str, int or float
        Either an already-numeric value, or a string holding a magnitude
        optionally followed by one hemisphere letter (N, S, E or W, in any
        case). Surrounding whitespace in strings is ignored.

    Returns
    -------
    int or float
        Numeric input is returned unchanged. For strings, the magnitude as a
        float, negated when the hemisphere letter is S or W.

    Raises
    ------
    ValueError
        If the magnitude part of a string cannot be parsed as a float.
    """
    # Already numeric: return as-is. This also keeps the conversion safe to
    # apply a second time to a column that has already been converted.
    if isinstance(value, (int, float)):
        return value

    text = value.strip()
    hemisphere = text[-1:].upper()

    # Only strip the final character when it really is a hemisphere letter;
    # otherwise a plain numeric string would silently lose its last digit.
    if hemisphere in ('N', 'S', 'E', 'W'):
        magnitude = float(text[:-1])
        return -magnitude if hemisphere in ('S', 'W') else magnitude

    # No hemisphere suffix: interpret the whole string as the number.
    return float(text)


# Replace the 'Latitude' and 'Longitude' columns of both feature sets with
# the numeric values produced by convert_to_numeric (train first, then test).
for features in (X_train, X_test):
    for coordinate in ('Latitude', 'Longitude'):
        features[coordinate] = features[coordinate].apply(convert_to_numeric)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build a 50-tree random forest with a fixed seed and fit it on the training
# split; fit() returns the estimator itself, so the call can be chained.
rf_model = RandomForestRegressor(n_estimators=50, random_state=42).fit(X_train, y_train)

# Show the fitted estimator's repr as a quick confirmation of its settings.
print("Trained Random Forest Model:", rf_model)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on the held-out test features.
y_pred = rf_model.predict(X_test)

# Compute the error metrics between the true and predicted targets.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# Derive RMSE from the MSE computed above. The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so taking the square root directly works on every version and avoids
# computing the MSE a second time.
rmse = mse ** 0.5

# Print evaluation metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
