In [3]:
#Import libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ipywidgets as widgets
import seaborn as sns


# machine learning sklearn
from sklearn import linear_model 
from sklearn import metrics
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA, ARMAResults
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 

In [2]:
pip install statsmodels

Note: you may need to restart the kernel to use updated packages.


In [19]:
global_temps = pd.read_csv("GlobalLandTemperaturesByState.csv", sep=',')

# convert first column to DateTime format|
global_temps['dt'] = pd.to_datetime(global_temps['dt'])

global_temps.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
0,1855-05-01,25.544,1.171,Acre,Brazil
1,1855-06-01,24.228,1.103,Acre,Brazil
2,1855-07-01,24.371,1.044,Acre,Brazil
3,1855-08-01,25.427,1.073,Acre,Brazil
4,1855-09-01,25.675,1.014,Acre,Brazil


In [20]:
#Transform date into year

global_temps_column = global_temps['dt'].dt.year
global_temps_column 

0         1855
1         1855
2         1855
3         1855
4         1855
          ... 
645670    2013
645671    2013
645672    2013
645673    2013
645674    2013
Name: dt, Length: 645675, dtype: int64

In [21]:
#rename country column to country name

global_temp_new = global_temps.rename(columns = {"Country": "Country Name"})
global_temp_new.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country Name
0,1855-05-01,25.544,1.171,Acre,Brazil
1,1855-06-01,24.228,1.103,Acre,Brazil
2,1855-07-01,24.371,1.044,Acre,Brazil
3,1855-08-01,25.427,1.073,Acre,Brazil
4,1855-09-01,25.675,1.014,Acre,Brazil


In [72]:
#Adding the date into the Dataframe

global_temp_new['Year'] = global_temps_column
global_temp_new.head(1260)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country Name,Year
0,1855-05-01,25.544,1.171,Acre,Brazil,1855
1,1855-06-01,24.228,1.103,Acre,Brazil,1855
2,1855-07-01,24.371,1.044,Acre,Brazil,1855
3,1855-08-01,25.427,1.073,Acre,Brazil,1855
4,1855-09-01,25.675,1.014,Acre,Brazil,1855
...,...,...,...,...,...,...
1255,1959-12-01,26.187,0.215,Acre,Brazil,1959
1256,1960-01-01,26.454,0.337,Acre,Brazil,1960
1257,1960-02-01,26.157,0.445,Acre,Brazil,1960
1258,1960-03-01,26.418,0.316,Acre,Brazil,1960


In [45]:
global_temp_new['Year'] = global_temp_new['Year'].apply(str)

In [104]:
#rename country column to country name

# global_temp_new = global_temps.rename({"Country": "Country Name"})
# global_temp_new.head()

In [46]:
global_temp_new["Country Name"].value_counts()

Russia           254972
United States    149745
India             86664
China             68506
Canada            35358
Brazil            34328
Australia         16102
Name: Country Name, dtype: int64

In [47]:
#Read the Co2 file

Co2_emmission = pd.read_csv("API_EN.ATM.CO2E.PC_DS2_en_csv_v2_3638608.csv", skiprows=4)
                        
Co2_emmission.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Aruba,ABW,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.90606,0.922474,0.930816,0.94057,0.996033,1.04728,...,1.021646,1.031833,1.041145,0.987393,0.971016,0.959978,0.933541,,,
2,Afghanistan,AFG,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.046057,0.053589,0.073721,0.074161,0.086174,0.101285,...,0.335351,0.263716,0.234037,0.232176,0.208857,0.203328,0.200151,,,
3,Africa Western and Central,AFW,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.09088,0.095283,0.096612,0.112376,0.133258,0.184803,...,0.490867,0.504655,0.507671,0.480743,0.472959,0.476438,0.515544,,,
4,Angola,AGO,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.100835,0.082204,0.210533,0.202739,0.213562,0.205891,...,1.204799,1.261542,1.285365,1.260921,1.227703,1.034317,0.88738,,,


In [48]:
#Drop unecessary columns

Co2_emmission.drop(columns=["Indicator Name","Indicator Code", "Country Code","Unnamed: 65"], inplace=True)

In [49]:
# for c in Co2_emmission["Country Name"].unique():
#     if c in global_temps['Country'].unique():
#         print(c)

In [50]:
#Check what countries are in both files

Countries = [ c for c in Co2_emmission["Country Name"].unique() if c in global_temps['Country'].unique()]

In [11]:
Countries

['Australia', 'Brazil', 'Canada', 'China', 'India', 'United States']

In [51]:
new_rows_for_co2_dataframe = []

for col in Co2_emmission.columns[1:]:
    for i, row in Co2_emmission.iterrows():
        temp = pd.DataFrame({
            "Country": [row["Country Name"]],
            "Year": [col],
            "CO2": [row[col]]
        })
        new_rows_for_co2_dataframe.append(temp)

co2_df = pd.concat(new_rows_for_co2_dataframe)

In [53]:
co2_df

Unnamed: 0,Country,Year,CO2
0,Aruba,1960,
0,Africa Eastern and Southern,1960,0.906060
0,Afghanistan,1960,0.046057
0,Africa Western and Central,1960,0.090880
0,Angola,1960,0.100835
...,...,...,...
0,Kosovo,2020,
0,"Yemen, Rep.",2020,
0,South Africa,2020,
0,Zambia,2020,


In [74]:
# clean_co2_df = co2_df.loc[co2_df["Country"].isin(["Switzerland","United Kingdom"])]

In [75]:
# clean_co2_df 

Unnamed: 0,Country,Year,CO2
0,Switzerland,1960,3.664366
0,United Kingdom,1960,11.150759
0,Switzerland,1961,3.751825
0,United Kingdom,1961,11.154139
0,Switzerland,1962,4.333572
...,...,...,...
0,United Kingdom,2018,5.398708
0,Switzerland,2019,
0,United Kingdom,2019,
0,Switzerland,2020,


In [54]:
co2_df['Year'] = co2_df['Year'].apply(str)

In [83]:
# clean_global_temp_df = global_temp_new.loc[global_temp_new["Country Name"].isin(["Switzerland"])]

In [84]:
# clean_global_temp_df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country Name,Latitude,Longitude,Year
723662,1743-11-01,3.544,1.764,Basel,Switzerland,47.42N,8.29E,1743
723663,1743-12-01,,,Basel,Switzerland,47.42N,8.29E,1743
723664,1744-01-01,,,Basel,Switzerland,47.42N,8.29E,1744
723665,1744-02-01,,,Basel,Switzerland,47.42N,8.29E,1744
723666,1744-03-01,,,Basel,Switzerland,47.42N,8.29E,1744
...,...,...,...,...,...,...,...,...
8592839,2013-05-01,9.400,0.396,Zurich,Switzerland,47.42N,8.29E,2013
8592840,2013-06-01,14.444,0.388,Zurich,Switzerland,47.42N,8.29E,2013
8592841,2013-07-01,18.658,0.231,Zurich,Switzerland,47.42N,8.29E,2013
8592842,2013-08-01,16.657,0.490,Zurich,Switzerland,47.42N,8.29E,2013


In [55]:
Climate_data_merged = pd.merge(global_temp_new, co2_df , how="inner", left_on=["Country Name","Year"],right_on=["Country","Year"])

In [56]:
Climate_data_merged

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country Name,Year,Country,CO2
0,1960-01-01,26.454,0.337,Acre,Brazil,1960,Brazil,0.649886
1,1960-02-01,26.157,0.445,Acre,Brazil,1960,Brazil,0.649886
2,1960-03-01,26.418,0.316,Acre,Brazil,1960,Brazil,0.649886
3,1960-04-01,25.393,0.298,Acre,Brazil,1960,Brazil,0.649886
4,1960-05-01,24.852,0.278,Acre,Brazil,1960,Brazil,0.649886
...,...,...,...,...,...,...,...,...
98035,2013-05-01,19.915,0.317,Western Australia,Australia,2013,Australia,16.398646
98036,2013-06-01,16.176,0.300,Western Australia,Australia,2013,Australia,16.398646
98037,2013-07-01,15.745,0.197,Western Australia,Australia,2013,Australia,16.398646
98038,2013-08-01,18.337,0.414,Western Australia,Australia,2013,Australia,16.398646


In [74]:
Climate_data_merged.to_csv('Climate_change_analysis.csv')