In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from gustavo_functions import *

%load_ext autoreload
%autoreload 2

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [11]:
# Load the datasets (make sure to update the file paths if necessary)
global_df = pd.read_csv("data-project3/GlobalLandTemperaturesByCity.csv")
pollution_df = pd.read_csv("data-project3/pollution_us_2000_2016.csv")

In [13]:

### 1 Cleaning the GlobalLandTemperaturesByCity dataset
# Filter only data for the United States
global_df = global_df[global_df["Country"] == "United States"].copy()

# Convert the date column to datetime format
global_df["dt"] = pd.to_datetime(global_df["dt"])

# Extract the year from the date column
global_df["Year"] = global_df["dt"].dt.year

# Remove NaN values in the temperature column
global_df.dropna(subset=["AverageTemperature"], inplace=True)

# Group by city and year, calculating the annual average temperature
global_clean = global_df.groupby(["City", "Year"])['AverageTemperature'].mean().reset_index()

# Extract unique latitude values per city
latitude_df = global_df[["City", "Latitude","Longitude"]].drop_duplicates()

### 2 Cleaning the Greenhouse Gas dataset
# Filter only "United States of America"
#green_df = green_df[green_df["country_or_area"] == "United States of America"].copy()

### 3 Cleaning the Pollution dataset
# Convert the date column to datetime format
pollution_df["Date Local"] = pd.to_datetime(pollution_df["Date Local"])

# Extract the year from the date column
pollution_df["Year"] = pollution_df["Date Local"].dt.year

# Select key columns
pollution_clean = pollution_df[["City", "Year", "NO2 Mean", "SO2 Mean", "CO Mean"]]

# Average pollution values by city and year
pollution_clean = pollution_clean.groupby(["City", "Year"]).mean().reset_index()

###  Merging the datasets
# Merge temperature and pollution data
final_df = pd.merge(global_clean, pollution_clean, on=["City", "Year"], how="inner")


# Drop the duplicate year column
#final_df.drop(columns=["year"], inplace=True)

# Merge latitude data
final_df = pd.merge(final_df, latitude_df, on="City", how="left")

# Convert latitude values to numeric format
final_df["Latitude"] = final_df["Latitude"].str.replace("N", "").str.replace("S", "-").astype(float)

# Convert longitude to numeric format
final_df["Longitude"] = final_df["Longitude"].apply(lambda x: float(x[:-1]) * (-1 if x[-1] == 'W' else 1))

###
final_df = final_df.rename(columns={'value': 'CO2-natural-pross'})

# Save the cleaned and merged dataset in CSV format
#final_df.to_csv("cleaned_temperature_pollution_data.csv", index=False)

In [18]:
df=final_df.copy()

In [19]:
# Compute mutual information between Temperature and Pollution variables
X = df[["NO2 Mean", "SO2 Mean", "CO Mean"]]
y = df["AverageTemperature"]

my_scores = mutual_info_regression(X, y)
my_df = pd.DataFrame(my_scores, index=X.columns, columns=["Mutual Information"])
print(my_df)

          Mutual Information
NO2 Mean            0.215682
SO2 Mean            0.246242
CO Mean             0.201956
