In [2]:
username = "alextran21211"
password = "database12345"
host = "cluster0.zsych.mongodb.net"
database = "group_5_project"
collection = "co2_emission"


In [3]:
from pymongo import MongoClient # import mongo client to connect
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, mean, stddev, min, max, desc
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
import urllib.parse
from sklearn.preprocessing import MinMaxScaler

In [4]:
username = username
password = urllib.parse.quote(password)
host = host
url = "mongodb+srv://{}:{}@{}/?retryWrites=true&w=majority".format(username,
password, host)
# connect to the database
client = MongoClient(url)

In [5]:
db = client['group_5_project']
collection = db["co2_emission"]

documents = collection.find()

data = []

for doc in documents:
    for country, country_data in doc.items():
        if country == "_id":
            continue  
        iso_code = country_data.get('iso_code')
        for entry in country_data.get('data', []):
            year = entry.get('year')
            population = entry.get('population')
            co2 = entry.get('cumulative_luc_co2')
           
            data.append({
                'Country': country,
                'ISO_Code': iso_code,
                'Year': year,
                'Population': population,
                'CO2': co2
            })


df = pd.DataFrame(data)


In [6]:
print(df.head())

       Country ISO_Code  Year  Population        CO2
0  Afghanistan      AFG  1850   3752993.0   2.979601
1  Afghanistan      AFG  1851   3767956.0   5.981443
2  Afghanistan      AFG  1852   3783940.0   9.002998
3  Afghanistan      AFG  1853   3800954.0  12.041333
4  Afghanistan      AFG  1854   3818038.0  15.094068


In [7]:
print(df.info())  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47415 entries, 0 to 47414
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Country     47415 non-null  object 
 1   ISO_Code    39548 non-null  object 
 2   Year        47415 non-null  int64  
 3   Population  39414 non-null  float64
 4   CO2         37022 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.8+ MB
None


In [8]:
print(df.isnull().sum()) 

Country           0
ISO_Code       7867
Year              0
Population     8001
CO2           10393
dtype: int64


In [9]:
df['Population'] = df['Population'].fillna(df['Population'].mean()) 
df['CO2'] = df['CO2'].fillna(df['CO2'].mean())  
df['ISO_Code'] = df['ISO_Code'].dropna

In [10]:
df['CO2_per_capita'] = df['CO2'] / df['Population']

In [11]:
df.drop_duplicates(inplace=True)

In [12]:
print(df.isnull().sum()) 

Country           0
ISO_Code          0
Year              0
Population        0
CO2               0
CO2_per_capita    0
dtype: int64


In [13]:
df = df.drop(columns=['ISO_Code'])
print(df.head())

       Country  Year  Population        CO2  CO2_per_capita
0  Afghanistan  1850   3752993.0   2.979601    7.939267e-07
1  Afghanistan  1851   3767956.0   5.981443    1.587450e-06
2  Afghanistan  1852   3783940.0   9.002998    2.379266e-06
3  Afghanistan  1853   3800954.0  12.041333    3.167977e-06
4  Afghanistan  1854   3818038.0  15.094068    3.953357e-06


In [14]:
scaler = MinMaxScaler()
df[['Population', 'CO2', 'CO2_per_capita']] = scaler.fit_transform(df[['Population', 'CO2', 'CO2_per_capita']])

In [15]:
print(df.describe())        # Summary statistics
print(df.head())

               Year    Population           CO2  CO2_per_capita
count  47415.000000  47415.000000  47415.000000    47415.000000
mean    1926.781609      0.007643      0.017994        0.000938
std       59.561600      0.037602      0.048560        0.012920
min     1750.000000      0.000000      0.000000        0.000000
25%     1883.000000      0.000073      0.005530        0.000006
50%     1930.000000      0.000521      0.006442        0.000009
75%     1976.000000      0.007104      0.017994        0.000012
max     2022.000000      1.000000      1.000000        1.000000
       Country  Year  Population       CO2  CO2_per_capita
0  Afghanistan  1850    0.000471  0.005472        0.000006
1  Afghanistan  1851    0.000472  0.005476        0.000006
2  Afghanistan  1852    0.000474  0.005480        0.000006
3  Afghanistan  1853    0.000477  0.005483        0.000006
4  Afghanistan  1854    0.000479  0.005487        0.000006


In [16]:
df.to_csv("co2_emission_preprocessed.csv", index=False)

In [17]:
df.head()

Unnamed: 0,Country,Year,Population,CO2,CO2_per_capita
0,Afghanistan,1850,0.000471,0.005472,6e-06
1,Afghanistan,1851,0.000472,0.005476,6e-06
2,Afghanistan,1852,0.000474,0.00548,6e-06
3,Afghanistan,1853,0.000477,0.005483,6e-06
4,Afghanistan,1854,0.000479,0.005487,6e-06
