# Introduction to Python for Machine Learning

#### The data provided in this Github link is the fuel quality data from the Federal Energy Regulatory Commission which is provided by the United States Energy Information Administration. The data consists of the following columns:

- 'Record_id' : record id

- 'Utility_id_ferc1': Utility id assigned by the FERC

- 'Report_year': year of report

- 'Plant_name_ferc1': the name of the plant

- 'Fuel_type_code_pudl': the type of fuel

- 'Fuel_unit': the unit of fuel

- 'Fuel_qty_burned': the quantity of fuel burned

- 'Fuel_mmbtu_per_unit': the measure of energy per unit

- 'fuel_cost_per_unit_burned': the fuel cost per unit burned

- 'Fuel_cost_per_unit_delivered': the cost of fuel delivered per unit

- 'fuel_cost_per_mmbtu': the cost of fuel per mmbtu

In [2]:
# Import Necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/WalePhenomenon/climate_change/master/fuel_ferc1.csv')
# View the first few observations
data.head()

Unnamed: 0,record_id,utility_id_ferc1,report_year,plant_name_ferc1,fuel_type_code_pudl,fuel_unit,fuel_qty_burned,fuel_mmbtu_per_unit,fuel_cost_per_unit_burned,fuel_cost_per_unit_delivered,fuel_cost_per_mmbtu
0,f1_fuel_1994_12_1_0_7,1,1994,rockport,coal,ton,5377489.0,16.59,18.59,18.53,1.121
1,f1_fuel_1994_12_1_0_10,1,1994,rockport total plant,coal,ton,10486945.0,16.592,18.58,18.53,1.12
2,f1_fuel_1994_12_2_0_1,2,1994,gorgas,coal,ton,2978683.0,24.13,39.72,38.12,1.65
3,f1_fuel_1994_12_2_0_7,2,1994,barry,coal,ton,3739484.0,23.95,47.21,45.99,1.97
4,f1_fuel_1994_12_2_0_10,2,1994,chickasaw,gas,mcf,40533.0,1.0,2.77,2.77,2.57


In [7]:
data.report_year.dtype


dtype('int64')

In [8]:
data.groupby('report_year')['fuel_cost_per_unit_delivered'].mean().idxmax()

1997

In [9]:
data.groupby('fuel_type_code_pudl')['fuel_cost_per_unit_burned'].mean().idxmin()

'gas'

In [10]:
data['fuel_mmbtu_per_unit'].describe()

count    29523.000000
mean         8.492111
std         10.600220
min          0.000001
25%          1.024000
50%          5.762694
75%         17.006000
max        341.260000
Name: fuel_mmbtu_per_unit, dtype: float64

In [11]:
data['fuel_qty_burned'].skew()

15.851495469109503

In [12]:
data['fuel_qty_burned'].kurt()

651.3694501337732

In [13]:
data.isnull().sum()

record_id                         0
utility_id_ferc1                  0
report_year                       0
plant_name_ferc1                  0
fuel_type_code_pudl               0
fuel_unit                       180
fuel_qty_burned                   0
fuel_mmbtu_per_unit               0
fuel_cost_per_unit_burned         0
fuel_cost_per_unit_delivered      0
fuel_cost_per_mmbtu               0
dtype: int64

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29523 entries, 0 to 29522
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   record_id                     29523 non-null  object 
 1   utility_id_ferc1              29523 non-null  int64  
 2   report_year                   29523 non-null  int64  
 3   plant_name_ferc1              29523 non-null  object 
 4   fuel_type_code_pudl           29523 non-null  object 
 5   fuel_unit                     29343 non-null  object 
 6   fuel_qty_burned               29523 non-null  float64
 7   fuel_mmbtu_per_unit           29523 non-null  float64
 8   fuel_cost_per_unit_burned     29523 non-null  float64
 9   fuel_cost_per_unit_delivered  29523 non-null  float64
 10  fuel_cost_per_mmbtu           29523 non-null  float64
dtypes: float64(5), int64(2), object(4)
memory usage: 2.5+ MB


In [15]:
data['fuel_unit'].isnull().sum()

180

In [16]:
180/29522

0.006097147889709369

In [17]:
data['fuel_unit'].value_counts()

mcf       11354
ton        8958
bbl        7998
gramsU      464
mmbtu       180
kgU         110
mwhth       100
mwdth        95
gal          84
Name: fuel_unit, dtype: int64

In [18]:
data.corr()['fuel_cost_per_unit_burned'].sort_values()

utility_id_ferc1               -0.037863
fuel_qty_burned                -0.018535
fuel_mmbtu_per_unit            -0.010034
fuel_cost_per_mmbtu            -0.000437
fuel_cost_per_unit_delivered    0.011007
report_year                     0.013599
fuel_cost_per_unit_burned       1.000000
Name: fuel_cost_per_unit_burned, dtype: float64

In [19]:
a = data[(data['fuel_type_code_pudl'] == 'coal') & (data.report_year == 1994)]['fuel_cost_per_unit_burned'].sum()
print(a)

b = data[(data['fuel_type_code_pudl'] == 'coal') & (data.report_year == 1998)]['fuel_cost_per_unit_burned'].sum()
print(b)

(a - b) / a

14984.572000000002
11902.597


0.20567654518260528