In [1]:
import logging
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import opendatasets as od
import pandas as pd
import seaborn as sns
%matplotlib inline

# pandas display: never truncate columns, show up to 150 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)

# Apply the seaborn style first so the rcParams below are not overridden by it.
sns.set_style('darkgrid')

# Global matplotlib defaults; '#00000000' is a fully transparent RGBA colour.
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})

In [2]:
# Fetch the Kaggle dataset into the working directory (skipped when already present).
dataset_url = 'https://www.kaggle.com/datasets/berkeleyearth/climate-change-earth-surface-temperature-data'
od.download(dataset_url)

Skipping, found downloaded files in ".\climate-change-earth-surface-temperature-data" (use force=True to force download)


In [3]:
# os.listdir returns entries in arbitrary order; sort them so the listing
# is identical across platforms and re-runs.
sorted(os.listdir('climate-change-earth-surface-temperature-data'))

['GlobalLandTemperaturesByCity.csv',
 'GlobalLandTemperaturesByCountry.csv',
 'GlobalLandTemperaturesByMajorCity.csv',
 'GlobalLandTemperaturesByState.csv',
 'GlobalTemperatures.csv']

### City Temperatures

In [4]:
# Load the per-city temperature table, parsing the "dt" column as datetimes.
temp_by_city = pd.read_csv(
    './climate-change-earth-surface-temperature-data/GlobalLandTemperaturesByCity.csv',
    parse_dates=["dt"],
)

In [5]:
# Summarise the city table: column names, dtypes and memory usage.
temp_by_city.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   dt                             datetime64[ns]
 1   AverageTemperature             float64       
 2   AverageTemperatureUncertainty  float64       
 3   City                           object        
 4   Country                        object        
 5   Latitude                       object        
 6   Longitude                      object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 459.2+ MB


In [6]:
# Preview the first five rows of the city table.
temp_by_city.head(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [7]:
# (number of rows, number of columns) of the city table.
temp_by_city.shape

(8599212, 7)

In [8]:
# Count missing values in each column of the city table.
temp_by_city.isna().sum(axis=0)

dt                                    0
AverageTemperature               364130
AverageTemperatureUncertainty    364130
City                                  0
Country                               0
Latitude                              0
Longitude                             0
dtype: int64

### Country Temperatures

In [9]:
# Load the per-country temperature table, parsing the "dt" column as datetimes.
temp_by_country = pd.read_csv(
    './climate-change-earth-surface-temperature-data/GlobalLandTemperaturesByCountry.csv',
    parse_dates=["dt"],
)

In [10]:
# Preview the first five rows of the country table.
temp_by_country.head(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland


In [11]:
# (number of rows, number of columns) of the country table.
temp_by_country.shape

(577462, 4)

In [12]:
# Count missing values in each column of the country table.
temp_by_country.isna().sum(axis=0)

dt                                   0
AverageTemperature               32651
AverageTemperatureUncertainty    31912
Country                              0
dtype: int64

### Major City Temperatures

In [13]:
# Load the major-city temperature table, parsing the "dt" column as datetimes.
temp_by_major_city = pd.read_csv(
    './climate-change-earth-surface-temperature-data/GlobalLandTemperaturesByMajorCity.csv',
    parse_dates=["dt"],
)

In [14]:
# Preview the first five rows of the major-city table.
temp_by_major_city.head(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,26.14,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,25.427,1.2,Abidjan,Côte D'Ivoire,5.63N,3.23W


In [15]:
# (number of rows, number of columns) of the major-city table.
temp_by_major_city.shape

(239177, 7)

In [16]:
# Summarise the major-city table: dtypes, non-null counts and memory usage.
temp_by_major_city.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239177 entries, 0 to 239176
Data columns (total 7 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   dt                             239177 non-null  datetime64[ns]
 1   AverageTemperature             228175 non-null  float64       
 2   AverageTemperatureUncertainty  228175 non-null  float64       
 3   City                           239177 non-null  object        
 4   Country                        239177 non-null  object        
 5   Latitude                       239177 non-null  object        
 6   Longitude                      239177 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 12.8+ MB


In [17]:
# Count missing values in each column of the major-city table.
temp_by_major_city.isna().sum(axis=0)

dt                                   0
AverageTemperature               11002
AverageTemperatureUncertainty    11002
City                                 0
Country                              0
Latitude                             0
Longitude                            0
dtype: int64

### State Temperatures

In [18]:
# Load the per-state temperature table, parsing the "dt" column as datetimes.
temp_by_state = pd.read_csv(
    './climate-change-earth-surface-temperature-data/GlobalLandTemperaturesByState.csv',
    parse_dates=["dt"],
)

In [19]:
# Summarise the state table: dtypes, non-null counts and memory usage.
temp_by_state.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 645675 entries, 0 to 645674
Data columns (total 5 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   dt                             645675 non-null  datetime64[ns]
 1   AverageTemperature             620027 non-null  float64       
 2   AverageTemperatureUncertainty  620027 non-null  float64       
 3   State                          645675 non-null  object        
 4   Country                        645675 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 24.6+ MB


In [20]:
# Preview the first five rows of the state table.
temp_by_state.head(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
0,1855-05-01,25.544,1.171,Acre,Brazil
1,1855-06-01,24.228,1.103,Acre,Brazil
2,1855-07-01,24.371,1.044,Acre,Brazil
3,1855-08-01,25.427,1.073,Acre,Brazil
4,1855-09-01,25.675,1.014,Acre,Brazil


In [21]:
# (number of rows, number of columns) of the state table.
temp_by_state.shape

(645675, 5)

In [22]:
# Count missing values in each column of the state table.
temp_by_state.isna().sum(axis=0)

dt                                   0
AverageTemperature               25648
AverageTemperatureUncertainty    25648
State                                0
Country                              0
dtype: int64

### Global Temperatures

In [23]:
# Load the global temperature table, parsing the "dt" column as datetimes.
temp_by_global = pd.read_csv(
    './climate-change-earth-surface-temperature-data/GlobalTemperatures.csv',
    parse_dates=["dt"],
)

In [24]:
# Preview the first five rows of the global table.
temp_by_global.head(n=5)

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.49,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,


In [4]:
# Smoke test: emit a greeting so the cell produces visible output.
print("Hello, Airflow!")

# Add two fixed integers and report the total.
a, b = 10, 20
sum_result = sum((a, b))
print(f"The sum of {a} and {b} is {sum_result}")


Hello, Airflow!
The sum of 10 and 20 is 30


In [5]:
# Progress markers around a processing step that has not been written yet.
# flush=True forces each message out of the stdout buffer immediately.
print("Starting data processing...", flush=True)
# Placeholder: the data processing code would go here.
print("Data processing complete.", flush=True)


Starting data processing...
Data processing complete.


In [6]:
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)  # Set the desired logging level

# Example of logging an informational message
logging.info("This message will appear in Airflow's logs if the logging level is INFO or lower.")


INFO:root:This message will appear in Airflow's logs if the logging level is INFO or lower.
