# Data Preprocessing for Global Temperatures Dataset

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the global temperature records.
# NOTE(review): absolute, user-specific path; consider a configurable data
# directory so the notebook runs on other machines.
df_global = pd.read_csv('/home/leslie-jordan/Downloads/GlobalTemperatures.csv')

# Drop columns with a substantial amount of missing data
df_global_cleaned = df_global.drop(columns=[
    'LandMaxTemperature',
    'LandMaxTemperatureUncertainty',
    'LandMinTemperature',
    'LandMinTemperatureUncertainty',
    'LandAndOceanAverageTemperature',
    'LandAndOceanAverageTemperatureUncertainty'
])

# Fill missing values in the remaining numeric columns with each column's mean.
# Assign the result back instead of calling `df[col].fillna(..., inplace=True)`:
# that chained inplace form operates on a temporary Series and does not update
# the frame when pandas Copy-on-Write is active (FutureWarning in pandas 2.x).
global_numeric_cols = ['LandAverageTemperature', 'LandAverageTemperatureUncertainty']
df_global_cleaned[global_numeric_cols] = df_global_cleaned[global_numeric_cols].fillna(
    df_global_cleaned[global_numeric_cols].mean()
)

# Convert date column to datetime
df_global_cleaned['dt'] = pd.to_datetime(df_global_cleaned['dt'])

# Create year and month features
df_global_cleaned['Year'] = df_global_cleaned['dt'].dt.year
df_global_cleaned['Month'] = df_global_cleaned['dt'].dt.month

# Drop the original date column now that Year/Month carry the information
df_global_cleaned = df_global_cleaned.drop(columns=['dt'])

# Standardize the numerical features.
# NOTE(review): the scaler (and the mean imputation above) is fit on the full
# frame before the split, so test-set statistics leak into the training data,
# and the target column is standardized too — confirm this is intended.
scaler = StandardScaler()
df_global_cleaned[global_numeric_cols] = scaler.fit_transform(
    df_global_cleaned[global_numeric_cols]
)

# Split the data into training and testing sets (target: LandAverageTemperature)
X = df_global_cleaned.drop(columns=['LandAverageTemperature'])
y = df_global_cleaned['LandAverageTemperature']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


((2553, 3), (639, 3), (2553,), (639,))

In [7]:
# Now, let's do the same for countries.
# Load the per-country land temperature dataset.
# NOTE(review): absolute, user-specific path; consider a configurable data
# directory so the notebook runs on other machines.
df_country = pd.read_csv('/home/leslie-jordan/Downloads/GlobalLandTemperaturesByCountry.csv')

# Fill missing values in the numerical columns with each column's mean.
# Assign the result back instead of calling `df[col].fillna(..., inplace=True)`:
# that chained inplace form operates on a temporary Series and does not update
# the frame when pandas Copy-on-Write is active (FutureWarning in pandas 2.x).
# NOTE(review): the mean is taken over all countries together, not per country.
country_numeric_cols = ['AverageTemperature', 'AverageTemperatureUncertainty']
df_country[country_numeric_cols] = df_country[country_numeric_cols].fillna(
    df_country[country_numeric_cols].mean()
)

# Convert date column to datetime
df_country['dt'] = pd.to_datetime(df_country['dt'])

# Create year and month features
df_country['Year'] = df_country['dt'].dt.year
df_country['Month'] = df_country['dt'].dt.month

# Drop the original date column now that Year/Month carry the information
df_country = df_country.drop(columns=['dt'])

# One-hot encode the categorical feature; drop_first omits one country's
# indicator column so the dummies are not perfectly collinear.
df_country = pd.get_dummies(df_country, columns=['Country'], drop_first=True)

# Standardize the numerical features.
# NOTE(review): the scaler is fit on the full frame before the split, so
# test-set statistics leak into the training data, and the target column is
# standardized too — confirm this is intended.
scaler = StandardScaler()
df_country[country_numeric_cols] = scaler.fit_transform(
    df_country[country_numeric_cols]
)

# Split the data into training and testing sets (target: AverageTemperature).
# NOTE(review): X, y and the train/test names are reused from the previous
# cell, so the global-dataset split is overwritten here.
X = df_country.drop(columns=['AverageTemperature'])
y = df_country['AverageTemperature']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


((461969, 245), (115493, 245), (461969,), (115493,))

In [8]:
# Final preprocessing stage: the per-city temperature records.

# Read the city-level CSV into a DataFrame and preview its first five rows.
df_city = pd.read_csv(
    filepath_or_buffer='/home/leslie-jordan/Downloads/GlobalLandTemperaturesByCity.csv'
)
df_city.head(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [9]:
# Show each column's dtype in the city frame before preprocessing it.
df_city.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                              object
Country                           object
Latitude                          object
Longitude                         object
dtype: object

In [10]:
# NOTE(review): this cell rewrites `df_city` in place (it drops `dt`), so it
# cannot be re-run without first re-running the cell that loads the CSV.

# Fill missing values in the numerical columns with each column's mean.
# Assign the result back instead of calling `df[col].fillna(..., inplace=True)`:
# that chained inplace form operates on a temporary Series and does not update
# the frame when pandas Copy-on-Write is active (FutureWarning in pandas 2.x).
# NOTE(review): the mean is taken over all cities together, not per city.
city_numeric_cols = ['AverageTemperature', 'AverageTemperatureUncertainty']
df_city[city_numeric_cols] = df_city[city_numeric_cols].fillna(
    df_city[city_numeric_cols].mean()
)

# Convert date column to datetime; unparseable values become NaT
df_city['dt'] = pd.to_datetime(df_city['dt'], errors='coerce')

# Report any rows whose date could not be parsed
non_datetime_values = df_city[df_city['dt'].isnull()]
print("Non-datetime values in dt column:")
print(non_datetime_values)

# Drop rows with non-datetime values in dt column
df_city = df_city.dropna(subset=['dt'])

# Create year and month features
df_city['Year'] = df_city['dt'].dt.year
df_city['Month'] = df_city['dt'].dt.month

# Drop the original date column now that Year/Month carry the information
df_city = df_city.drop(columns=['dt'])

# Encode the categorical features using Label Encoding for City and Country.
# NOTE(review): the integer codes impose an arbitrary ordering on cities and
# countries when used as model features — confirm this suits the model.
label_encoder_city = LabelEncoder()
df_city['City'] = label_encoder_city.fit_transform(df_city['City'])

label_encoder_country = LabelEncoder()
df_city['Country'] = label_encoder_country.fit_transform(df_city['Country'])

# Convert Latitude and Longitude to string.
# NOTE(review): these columns stay textual (e.g. '57.05N', '10.33E' in the
# preview above), so X still contains non-numeric features after this cell;
# parse them to signed floats before fitting a numeric model.
df_city['Latitude'] = df_city['Latitude'].astype(str)
df_city['Longitude'] = df_city['Longitude'].astype(str)

# Standardize the numerical features.
# NOTE(review): the scaler is fit on the full frame before the split, so
# test-set statistics leak into the training data, and the target column is
# standardized too — confirm this is intended.
scaler = StandardScaler()
df_city[city_numeric_cols] = scaler.fit_transform(
    df_city[city_numeric_cols]
)

# Split the data into training and testing sets (target: AverageTemperature)
X = df_city.drop(columns=['AverageTemperature'])
y = df_city['AverageTemperature']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Non-datetime values in dt column:
Empty DataFrame
Columns: [dt, AverageTemperature, AverageTemperatureUncertainty, City, Country, Latitude, Longitude]
Index: []


((6879369, 7), (1719843, 7), (6879369,), (1719843,))