In [1]:
import pandas as pd
import altair as alt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Select Altair's "default" renderer for displaying charts in this session.
alt.renderers.enable("default")

RendererRegistry.enable('default')

# Import and clean data

## Import Data

In [2]:
# Read the raw production table; the relative path is resolved against the
# notebook's working directory.
source_csv = "U.S._crude_oil_production.csv"
crude_oil_data = pd.read_csv(source_csv)

## Cleaning Data

In [3]:
# Discard the Arizona and Virginia columns; every other column is kept.
crude_oil_data = crude_oil_data.drop(columns=['Arizona', 'Virginia'])

In [4]:
# Short, identifier-friendly names for the three long source headers.
# The "U.S. Crude Oil " key deliberately keeps its trailing space.
column_renames = {
    "U.S. Crude Oil ": "US_total",
    "Federal Offshore Gulf of Mexico Crude Oil": "Gulf_of_Mexico",
    "Federal Offshore Pacific Crude Oil": "Pacific_oil",
}

# Parse the original "Month" column into a datetime "Date", remove the original
# column, then append integer "Year" and "Month" parts derived from "Date".
crude_oil_data = (
    crude_oil_data
    .rename(columns=column_renames)
    .assign(Date=lambda frame: pd.to_datetime(frame["Month"]))
    .drop(columns="Month")
    .assign(
        Year=lambda frame: frame["Date"].dt.year,
        Month=lambda frame: frame["Date"].dt.month,
    )
)

# Show two randomly chosen rows as a spot check.
crude_oil_data.sample(2)

Unnamed: 0,US_total,Alabama,Alaska,Arkansas,California,Colorado,Gulf_of_Mexico,Pacific_oil,Florida,Idaho,...,Pennsylvania,South Dakota,Wyoming,West Virginia,Utah,Texas,Tennessee,Date,Year,Month
7,5138,21,679,16,579,85,1281,67,2,0,...,8,5,144,3,65,1133,1,2009-01-01,2009,1
115,9995,16,508,14,463,436,1631,14,5,0,...,17,4,222,28,100,3894,1,2018-01-01,2018,1


In [5]:
# Reshape wide -> long: the columns listed below stay as identifiers, and every
# remaining column becomes one (Area, Production) row per original record.
id_columns = ["US_total", "Date", "Year", "Month"]
crude_oil_data = pd.melt(
    crude_oil_data,
    id_vars=id_columns,
    var_name="Area",
    value_name="Production",
)

In [6]:
# Put the long-format columns into a fixed, readable order.
columns_titles = ["Area", "Production", "US_total", "Year", "Month", "Date"]

# reset_index returns a new frame rather than modifying the existing one, so
# its result has to be assigned for the reset to apply to the stored data.
crude_oil_data = (
    crude_oil_data
    .reindex(columns=columns_titles)
    .reset_index(drop=True)
)

# Display the reshaped frame as the cell output.
crude_oil_data

Unnamed: 0,Area,Production,US_total,Year,Month,Date
0,Alabama,21,5138,2008,6,2008-06-01
1,Alabama,21,5177,2008,7,2008-07-01
2,Alabama,21,5003,2008,8,2008-08-01
3,Alabama,21,3974,2008,9,2008-09-01
4,Alabama,21,4738,2008,10,2008-10-01
...,...,...,...,...,...,...
3867,Tennessee,1,10248,2018,2,2018-02-01
3868,Tennessee,1,10461,2018,3,2018-03-01
3869,Tennessee,1,10475,2018,4,2018-04-01
3870,Tennessee,1,10443,2018,5,2018-05-01


In [7]:
# Yearly totals per area: sum the monthly figures within each (Year, Area) pair.
# numeric_only=True keeps the datetime "Date" column out of the aggregation;
# summing a datetime column raises TypeError on pandas >= 2.0.
# The summed "Month" column is carried along but has no meaning of its own.
new = (
    crude_oil_data
    .groupby(["Year", "Area"])
    .sum(numeric_only=True)
    .reset_index()
)

# Each area's share of the yearly US total, in percent, rounded to 2 decimals.
new["Prod_ratio"] = (new["Production"] / new["US_total"] * 100).round(2)

## Final Datasets

In [8]:
# Areas kept for the visualisations below.
top10_areas = [
    "Texas", "Gulf_of_Mexico", "North Dakota", "Alaska", "California",
    "Oklahoma", "New Mexico", "Colorado", "Wyoming", "Louisiana",
]

# Monthly records restricted to the selected areas.
is_top10_monthly = crude_oil_data['Area'].isin(top10_areas)
top10_data = crude_oil_data[is_top10_monthly]

# Yearly totals and percentage shares restricted to the same areas.
is_top10_yearly = new['Area'].isin(top10_areas)
total_production_ratio = new[is_top10_yearly]

# Interactive visualisations

In [9]:
# Fixed colour per area; insertion order defines the scale's domain order.
area_colours = {
    "Texas": '#b80058',
    "Gulf_of_Mexico": '#ebac23',
    "North Dakota": '#008cf9',
    "Alaska": '#006e00',
    "California": '#00bbad',
    "Oklahoma": '#d163e6',
    "New Mexico": '#b24502',
    "Colorado": '#ff9287',
    "Wyoming": '#5954d6',
    "Louisiana": '#00c6f8',
}
scale = alt.Scale(
    domain=list(area_colours.keys()),
    range=list(area_colours.values()),
)

# Colour encoding on the Area field that applies the explicit scale above.
color = alt.Color('Area:O', scale=scale)

In [10]:
# Interval selection projected onto the x channel only.
brush = alt.selection_interval(encodings=['x'])

# Multi-selection projected onto the colour channel.
# NOTE(review): toggle is passed as the string "true", not a Python bool; this
# looks like a deliberate Vega expression so plain clicks toggle - confirm.
click = alt.selection_multi(encodings=['color'], toggle="true")

# Single selection on the Area field, updated on mouseover with nearest=True.
highlight = alt.selection_single(
    on='mouseover',
    fields=['Area'],
    nearest=True,
)

## US Top 10 Yearly Crude Oil Producing Areas 

In [11]:
# Monthly production panel: one line per area over time, with a tooltip that
# shows the Production value of the hovered point.

# Shared encodings and size for the two layers defined below.
top10 = alt.Chart(data=top10_data).encode(
    x=alt.X('Date:T', title="Year"),
    y=alt.Y('sum(Production):Q', title="Production (BPD)"),
    color="Area:O",
    tooltip=["Production"],
).properties(width=750, height=400)

# Fully transparent circles that carry the brush and hover selections;
# only rows passing the click selection are kept.
circle = (
    top10.mark_circle()
    .encode(opacity=alt.value(0))
    .add_selection(brush, highlight)
    .transform_filter(click)
)

# Line layer: size 3 where the highlight selection matches, size 1 otherwise;
# filtered by the same click selection as the circles.
line = (
    top10.mark_line()
    .encode(size=alt.condition(~highlight, alt.value(1), alt.value(3)))
    .transform_filter(click)
)

In [12]:
# Bar chart of summed production per area, sorted in descending order of the
# x value. Rows are filtered by the brush selection, so the bars follow the
# brushed interval; bars outside the click selection are drawn in light gray.
bars = (
    alt.Chart(data=top10_data)
    .mark_bar()
    .encode(
        x=alt.X('sum(Production):Q', axis=alt.Axis(title="Cumulative Production (BPD)")),
        y=alt.Y('Area:N', sort='-x'),
        tooltip=["sum(Production)"],
        color=alt.condition(click, color, alt.value('lightgray')),
    )
    .transform_filter(brush)
    .properties(width=750)
    .add_selection(click, highlight)
)

In [13]:
# Black rule at the mean Production of the rows that pass both the brush and
# the click selection filters.
mean_line = (
    alt.Chart(data=top10_data)
    .mark_rule(color="black")
    .encode(
        y="mean(Production):Q",
        size=alt.value(3),
        tooltip=["mean(Production)"],
    )
    .transform_filter(brush)
    .transform_filter(click)
)

In [14]:
# Yearly share panel: each area's percentage of total US production per year.

# Shared encodings for the point and line layers defined below.
percentage = alt.Chart(data=total_production_ratio).encode(
    x='Year:N',
    y=alt.Y("Prod_ratio:Q", axis=alt.Axis(title="Production Contribution to US Total %")),
    color='Area:N',
)

# Opaque points that carry the brush and hover selections; points outside the
# brush are drawn in light gray, and rows are filtered by the click selection.
points = (
    percentage.mark_circle()
    .encode(
        opacity=alt.value(1),
        tooltip=[alt.Tooltip("Prod_ratio", title='Production percentage')],
        color=alt.condition(brush, color, alt.value('lightgray')),
    )
    .add_selection(brush, highlight)
    .properties(width=750)
    .transform_filter(click)
)

# Line layer: size 3 where the highlight selection matches, size 1 otherwise;
# filtered by the click selection.
line2 = (
    percentage.mark_line()
    .encode(size=alt.condition(~highlight, alt.value(1), alt.value(3)))
    .transform_filter(click)
)

In [15]:
# Layer the per-panel marks, then stack the three panels vertically under one
# centred title. The final expression is the cell's displayed output.
production_panel = circle + line + mean_line
share_panel = points + line2

alt.vconcat(
    production_panel,
    bars,
    share_panel,
    title="US Top 10 Crude Oil Producing Areas",
).configure_title(anchor="middle")