## Importing data

In [1]:
from os import path
import urllib
import sys

def check_or_download_file(fullpath):
    """Download ``fullpath`` into the current directory unless it is already there.

    The local file name is the last path component of ``fullpath``; it is
    printed before anything else happens. A file of that name that already
    exists in the current working directory is neither re-downloaded nor
    overwritten.

    Parameters
    ----------
    fullpath : str
        URL of the remote file.

    Returns
    -------
    None
    """
    # A bare `import urllib` does not load the `urllib.request` submodule on
    # Python 3, so `urllib.request.urlretrieve` only worked when some other
    # package had imported it first. Import the function explicitly, falling
    # back to its Python 2 location.
    try:
        from urllib.request import urlretrieve
    except ImportError:  # Python 2
        from urllib import urlretrieve

    filename = path.basename(fullpath)
    print(filename)
    if not path.isfile(filename):
        urlretrieve(fullpath, filename)
            
def unzip_file(zipfilename):
    """Extract every member of the archive ``zipfilename`` into the current directory.

    Parameters
    ----------
    zipfilename : str
        Path to an existing zip archive.

    Returns
    -------
    None
    """
    # Imported locally so the helper does not rely on a later cell having
    # imported `zipfile` before the first call.
    import zipfile

    # The context manager closes the archive's file handle even when extraction
    # fails; the handle is called `archive` so the builtin `zip` is not shadowed.
    with zipfile.ZipFile(zipfilename) as archive:
        archive.extractall()

In [2]:
import pandas as pd
import zipfile
    
# Remote folder and base name of the household power consumption archive.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00235/'
filename = 'household_power_consumption'
zipfilename = '{}.zip'.format(filename)

# Fetch the archive if it is not already present, then extract it into the
# current working directory.
check_or_download_file(''.join([url, zipfilename]))
unzip_file(zipfilename)

# Read the first 200000 rows of the extracted text file, keeping only the date,
# time and active-power columns; '?' and empty fields are parsed as missing.
na_values = ['?', '']
fields = ['Date', 'Time', 'Global_active_power']
df = pd.read_csv(
    '{}.txt'.format(filename),
    sep=';',
    nrows=200000,
    na_values=na_values,
    usecols=fields,
)

household_power_consumption.zip


## 1) Détecter et dénombrer le nombre de lignes ayant des valeurs manquantes.

In [3]:
# Rows whose power reading is missing; shown as the cell's output.
df.loc[df['Global_active_power'].isnull()]

Unnamed: 0,Date,Time,Global_active_power
6839,21/12/2006,11:23:00,
6840,21/12/2006,11:24:00,
19724,30/12/2006,10:08:00,
19725,30/12/2006,10:09:00,
41832,14/1/2007,18:36:00,
61909,28/1/2007,17:13:00,
98254,22/2/2007,22:58:00,
98255,22/2/2007,22:59:00,
142588,25/3/2007,17:52:00,
190497,28/4/2007,00:21:00,


## 2) Supprimer toutes les lignes avec des valeurs manquantes.

In [4]:
# Remove, in place, every row that holds at least one missing value, then
# display the cleaned frame.
df.dropna(axis=0, how='any', inplace=True)
df

Unnamed: 0,Date,Time,Global_active_power
0,16/12/2006,17:24:00,4.216
1,16/12/2006,17:25:00,5.360
2,16/12/2006,17:26:00,5.374
3,16/12/2006,17:27:00,5.388
4,16/12/2006,17:28:00,3.666
5,16/12/2006,17:29:00,3.520
6,16/12/2006,17:30:00,3.702
7,16/12/2006,17:31:00,3.700
8,16/12/2006,17:32:00,3.668
9,16/12/2006,17:33:00,3.662


## 3) Créer une Time Series

In [5]:
# Join the textual 'Date' and 'Time' columns, parse them as day/month/year
# timestamps and use the result as the frame's index (in place).
timestamps = pd.to_datetime(
    df['Date'] + ' ' + df['Time'],
    format='%d/%m/%Y %H:%M:%S',
)
df.set_index(timestamps, inplace=True)
df

Unnamed: 0,Date,Time,Global_active_power
2006-12-16 17:24:00,16/12/2006,17:24:00,4.216
2006-12-16 17:25:00,16/12/2006,17:25:00,5.360
2006-12-16 17:26:00,16/12/2006,17:26:00,5.374
2006-12-16 17:27:00,16/12/2006,17:27:00,5.388
2006-12-16 17:28:00,16/12/2006,17:28:00,3.666
2006-12-16 17:29:00,16/12/2006,17:29:00,3.520
2006-12-16 17:30:00,16/12/2006,17:30:00,3.702
2006-12-16 17:31:00,16/12/2006,17:31:00,3.700
2006-12-16 17:32:00,16/12/2006,17:32:00,3.668
2006-12-16 17:33:00,16/12/2006,17:33:00,3.662


## 4) Afficher le graphique des moyennes journalières entre le 1er janvier et le 30 avril 2007.

In [None]:
# Keep only the rows between 1 January and 30 April 2007 (both bounds inclusive
# for label-based slicing on the datetime index).
df_range = df.loc['2007-01-01':'2007-04-30']

# Average the power readings per calendar day. Only the numeric column is
# selected: the frame still carries the text columns 'Date' and 'Time', and
# recent pandas versions raise a TypeError when asked to average them instead
# of silently dropping them.
means = df_range[['Global_active_power']].groupby(df_range.index.date).mean()

# The group keys are plain `datetime.date` objects; convert them back to a
# datetime index so the series plots on a proper time axis.
means.index = pd.to_datetime(means.index)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme before any figure is created.
sns.set()

fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title('Power consumption between January and April 2007')
ax.set_xlabel('Time')
ax.set_ylabel('kW')
ax.plot(means)

# Legend in the upper-left corner (loc=2), drawn on a white box with a black border.
legend = ax.legend(['Power consumption (kW)'], frameon=True, loc=2)
legend_frame = legend.get_frame()
legend_frame.set_facecolor('white')
legend_frame.set_edgecolor('black')

On constate deux fortes chutes de la consommation électrique fin février et début avril. On peut émettre l'hypothèse que 
la température a fortement augmenté durant ces deux périodes.

## 5) Chargement des données supplémentaires concernant la température

In [None]:
# Remote folder hosting the course datasets and the name of the temperature file.
tp_url = 'http://josephsalmon.eu/enseignement/TELECOM/MDI720/datasets/'
temperature_filename = 'TG_STAID011249.txt'

# Download the temperature file unless a local copy already exists.
check_or_download_file(''.join([tp_url, temperature_filename]))

In [None]:
def date_converter(date_string):
    """Parse a ``YYYYMMDD`` string into a pandas timestamp."""
    return pd.to_datetime(date_string, format='%Y%m%d')


# NOTE(review): na_values2 is defined here but not passed to read_csv below.
na_values2 = ['?', '', 'NaN', 'Nan', 'nan']

# Skip the first 20 lines of the file, keep only the DATE and TG columns, and
# treat -9999 as a missing measurement.
temperature_df = pd.read_csv(
    temperature_filename,
    sep=',',
    na_values=['-9999'],
    skiprows=20,
    skipinitialspace=True,
    usecols=['DATE', 'TG'],
    dtype={'TG': float},
    converters={'DATE': date_converter},
)
temperature_df.dropna(inplace=True)

# Presumably TG is stored in tenths of a degree (it is plotted in °C later) — TODO confirm.
temperature_df['TG'] = temperature_df['TG'].div(10)
temperature_df.set_index('DATE', inplace=True)
temperature_df


## 6) Créer une Time Series pandas des températures (1er janvier - 30 avril 2007)

In [None]:
# Temperatures restricted to the same period as the daily power means.
filtered_temperatures = temperature_df.loc['2007-01-01':'2007-04-30']

fig, ax1 = plt.subplots(figsize=(16, 6))

# Left axis: temperatures, drawn in blue.
[temperature_plot] = ax1.plot(filtered_temperatures, color='b')
ax1.set(xlabel='Time', ylabel='°C')
ax1.tick_params('y', colors='b')

# Right axis sharing the same x axis: daily mean power, drawn in green.
ax2 = ax1.twinx()
[power_plot] = ax2.plot(means, color='g')
ax2.set_ylabel('kW', color='g')
ax2.tick_params('y', colors='g')

# One legend for both curves, in the upper-left corner, white box with a black border.
legend = plt.legend(
    [temperature_plot, power_plot],
    ['Temperatures (°C)', 'Power consumption (kW)'],
    frameon=True,
    loc=2,
)
legend_frame = legend.get_frame()
legend_frame.set_facecolor('white')
legend_frame.set_edgecolor('black')

plt.show()

## 7) Proposer une visualisation de la pollution moyenne par année (pour l'ozone).

In [None]:
airparif_zipfile = 'airparifabae1bd78def4fe8a409ab8c95fc4608.zip'

# Download the Airparif archive if it is not already present, then extract it.
check_or_download_file(''.join([tp_url, airparif_zipfile]))
unzip_file(airparif_zipfile)

# NOTE(review): read_csv is given the .zip path itself, not an extracted file;
# this relies on pandas inferring zip compression from the file extension —
# confirm this is intended.
airdf = pd.read_csv(
    airparif_zipfile,
    sep=';',
    na_values=['n/d'],
    skiprows=[1],  # skip the file's second line (index 1); line 0 stays as the header
    usecols=['date', 'heure', 'NO2', 'O3'],
    dtype={'date': str, 'heure': int},
)
airdf.dropna(inplace=True)

# Rewrite hour number h as the text '<h-1>:59:59' so that date and hour can be
# parsed together into one timestamp per row.
airdf['heure'] = (airdf['heure'] - 1).astype(str) + ':59:59'
airdf_index = pd.to_datetime(
    airdf['date'] + ' ' + airdf['heure'],
    format='%d/%m/%Y %H:%M:%S',
)
airdf.set_index(airdf_index, inplace=True)

# The raw date/hour columns are now redundant with the index.
airdf.drop(['date', 'heure'], axis=1, inplace=True)

air_by_year = airdf.groupby(airdf.index.year)

### Let's focus on the means per year

In [None]:
# Mean of each pollutant per year; the group keys (years) form the index.
air_means = air_by_year.mean()
years = air_means.index.values
o3_means = air_means['O3'].values

fig, ax = plt.subplots(figsize=(16, 10))

# Red markers with a blue outline, one per year.
scatter = ax.scatter(years, o3_means, color='r')
scatter.set_edgecolor('blue')
ax.set_title('Average pollution in Paris from 2008 to 2016')
ax.set_ylabel('mg/m3')

# Legend in the upper-left corner (loc=2), white box with a black border.
legend = ax.legend(['o3 average pollution'], frameon=True, loc=2)
legend_frame = legend.get_frame()
legend_frame.set_facecolor('white')
legend_frame.set_edgecolor('black')

plt.show()

## 8) Proposer une visualisation de la pollution la plus critique par année pour NO2 et O3.

In [None]:
# Highest value of each pollutant within every year.
air_max = air_by_year.max()
o3_maxes = air_max['O3'].values
no2_maxes = air_max['NO2'].values

fig, ax = plt.subplots(figsize=(16, 10))

# `years` is the array of group keys computed in the yearly-means cell.
o3_scatter = ax.scatter(years, o3_maxes, color='r')
no2_scatter = ax.scatter(years, no2_maxes, color='g')
for yearly_scatter in (o3_scatter, no2_scatter):
    yearly_scatter.set_edgecolor('blue')
ax.set_title('Maximal pollution in Paris from 2008 to 2016')
ax.set_ylabel('mg/m3')

# Both legend entries share the same suffix.
legend_part = ' yearly maximal pollution (mg/m3)'
legend = ax.legend(
    [no2_scatter, o3_scatter],
    ['no2' + legend_part, 'o3' + legend_part],
    frameon=True,
)
legend_frame = legend.get_frame()
legend_frame.set_facecolor('white')
legend_frame.set_edgecolor('black')

plt.show()

### Now let's combine the indicators

In [None]:
# Collect one array of measurements per year for each pollutant. Iterating over
# the groupby yields the groups in sorted year order, so the labels gathered in
# the same loop always line up with the violins, even if the index is unsorted
# or a year is missing from the data.
year_keys = []
o3_year_buckets = []
no2_year_buckets = []
for year, year_group in air_by_year:
    year_keys.append(year)
    o3_year_buckets.append(year_group['O3'].values)
    no2_year_buckets.append(year_group['NO2'].values)

first_year = year_keys[0]
last_year = year_keys[-1]
# violinplot draws the i-th dataset at x position i + 1 by default.
years_ticks = list(range(1, len(year_keys) + 1))
xticks_labels = [str(year) for year in year_keys]

fig, axes = plt.subplots(ncols=2, figsize=(16, 10))

# Left panel: NO2. The title is set on the same axes that holds the NO2 violins
# (the two titles used to be swapped between the panels).
no2_parts = axes[0].violinplot(no2_year_buckets, showmeans=True, showmedians=False)
axes[0].set_title('NO2')
for pc in no2_parts['bodies']:
    pc.set_facecolor('green')
    pc.set_edgecolor('black')

no2_legend = axes[0].legend(['no2 pollution distribution'], frameon=True)
no2_legend_frame = no2_legend.get_frame()
no2_legend_frame.set_color('white')
no2_legend_frame.set_edgecolor('black')

# Right panel: O3.
o3_parts = axes[1].violinplot(o3_year_buckets, showmeans=True, showmedians=False)
axes[1].set_title('O3')
for pc in o3_parts['bodies']:
    pc.set_facecolor('red')
    pc.set_edgecolor('black')

o3_legend = axes[1].legend(['o3 pollution distribution'], frameon=True)
o3_legend_frame = o3_legend.get_frame()
o3_legend_frame.set_color('white')
o3_legend_frame.set_edgecolor('black')

# Same year ticks and unit label on both panels.
for i in range(2):
    axes[i].set_xticks(years_ticks)
    axes[i].set_xticklabels(xticks_labels)
    axes[i].set_ylabel('mg/m3')

fig.suptitle('Pollution distribution in Paris from 2008 to 2016')
plt.show()

## 9) Donner une représentation par mois de la pollution. Quel est le mois le plus pollué ?

In [None]:
import calendar

# Sum of every measurement per calendar month (1-12), all years combined.
air_by_month = airdf.groupby(airdf.index.month).sum()
fig, ax = plt.subplots(figsize=(16, 10))

# With align='edge', a positive width draws the bar to the right of the month
# tick and a negative width to its left, so the two pollutants sit side by side.
o3_bar_container = ax.bar(air_by_month.index, air_by_month['O3'], color='r', width=0.4, align='edge')
no2_bar_container = ax.bar(air_by_month.index, air_by_month['NO2'], color='g', width=-0.4, align='edge')
for container in (o3_bar_container, no2_bar_container):
    for bar in container.get_children():
        bar.set_edgecolor('black')

ax.set_title('Total pollution by month in Paris from 2008 to 2016')

# Label the twelve ticks with the month names, tilted for readability.
months_index = range(1, 13)
months_labels = [calendar.month_name[m_index] for m_index in months_index]
plt.xticks(months_index, months_labels, rotation=45)
ax.set_ylabel('mg/m3')

legend = plt.legend(
    [no2_bar_container, o3_bar_container],
    ['no2 pollution (mg/m3)', 'o3 pollution (mg/m3)'],
    frameon=True,
)
legend_frame = legend.get_frame()
legend_frame.set_facecolor('white')
legend_frame.set_edgecolor('black')

plt.show()

## Quel est le mois le plus pollué (pour chacun des polluants)?

In [None]:
def print_most_polluted_month(pollutant):
    """Print the name of the month with the largest summed value of ``pollutant``.

    Reads the notebook-level ``air_by_month`` frame (indexed by month number)
    and the ``months_labels`` list of month names.

    Parameters
    ----------
    pollutant : str
        Column of ``air_by_month`` to inspect, e.g. ``'O3'`` or ``'NO2'``.
    """
    peak_month_number = air_by_month[pollutant].idxmax()
    # Month numbers start at 1 whereas months_labels is a zero-based list.
    most_polluted_month = months_labels[peak_month_number - 1]
    print('The most polluted month in terms of {} is the month of {}'.format(pollutant, most_polluted_month))

for pollutant_name in ('O3', 'NO2'):
    print_most_polluted_month(pollutant_name)