# Oil Extraction Production Forecasting
<br/>
<img src="https://www.nsenergybusiness.com/wp-content/uploads/sites/4/2022/07/refinery-ga56d4972f_640.jpg" />

In [0]:
# Three-part location of the source table; the load cell joins these as
# "<catalog>.<db>.<src_table>".
catalog, db, src_table = "workshop", "default", "ademianczuk_oil_yield"

In [0]:
# Load the Delta table into a PySpark DataFrame.
# The fully qualified name is assembled from the three config values
# (catalog, schema/database, table), separated by dots.
qualified_table_name = ".".join([catalog, db, src_table])
df = spark.table(qualified_table_name)

In [0]:
# Print the column names and types, then preview the first 5 rows
# without truncating long cell values.
df.printSchema()
df.show(n=5, truncate=False)

In [0]:
# Summary statistics for the DataFrame, rendered with the notebook's display helper.
summary_stats = df.describe()
display(summary_stats)

In [0]:
# Count missing values per column. If any columns have missing values, we need to
# decide whether to fill, drop, or interpolate them. Sometimes empty or missing
# values may be valuable though.

# Spark's `sum` is imported under an alias: importing it as plain `sum` would
# shadow Python's built-in `sum` for every cell that runs after this one.
from pyspark.sql.functions import col, sum as spark_sum

# For each column, flag NULLs as 1/0 and add the flags up, giving a single row
# with one NULL count per column.
# NOTE: isNull() only matches NULL; NaN values in floating-point columns are
# not counted here.
null_counts = df.select(
    [spark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
)
null_counts.show()

In [0]:
# Let's look for some seasonality based on the time-series plot.
import matplotlib.pyplot as plt
import pandas as pd

# Average the yield per date in Spark so only one row per date is collected
# to the driver. Spark names the aggregated column "avg(yield_bbl)".
df_pd = (
    df.select("date", "yield_bbl")
    .groupby("date")
    .avg("yield_bbl")
    .toPandas()
)

# Convert to real datetimes (as the temperature/precipitation cell does) so
# matplotlib draws a proper time axis instead of one categorical tick per row,
# then sort chronologically so the line is drawn in time order regardless of
# how the column is stored in the table.
df_pd["date"] = pd.to_datetime(df_pd["date"])
df_pd = df_pd.sort_values("date").reset_index(drop=True)

# Plot the time series on an explicit figure/axes pair.
fig_trend, ax_trend = plt.subplots(figsize=(12, 5))
ax_trend.plot(df_pd["date"], df_pd["avg(yield_bbl)"], marker="o", linestyle="-")
ax_trend.set_xlabel("Date")
ax_trend.set_ylabel("Average Yield (BBL)")
ax_trend.set_title("Oil Yield Trend Over Time")
ax_trend.tick_params(axis="x", labelrotation=45)
plt.show()

In [0]:
import matplotlib.pyplot as plt
import pandas as pd

# Collect only the columns needed for this chart into pandas.
df_pd = df.select("date", "temperature", "precipitation").toPandas()

# resample() needs a datetime index, so make sure "date" is a real datetime.
df_pd["date"] = pd.to_datetime(df_pd["date"])

# Weekly averages: one point/bar per week keeps the chart readable.
df_resampled = df_pd.set_index("date").resample("W").mean().reset_index()

# Create figure and primary axes
fig, ax1 = plt.subplots(figsize=(12, 5))

# Temperature as a line on the primary (left) y-axis
ax1.plot(
    df_resampled["date"],
    df_resampled["temperature"],
    color="red",
    marker="o",
    linestyle="-",
    label="Temperature (°C)",
)
ax1.set_xlabel("Date")
ax1.set_ylabel("Temperature (°C)", color="red")
ax1.tick_params(axis="y", labelcolor="red")

# Precipitation as bars on a secondary (right) y-axis sharing the same x-axis.
# On a date axis the bar width is measured in days; the default of 0.8 would
# make weekly bars hairline thin, so use most of the 7-day bin instead.
ax2 = ax1.twinx()
ax2.bar(
    df_resampled["date"],
    df_resampled["precipitation"],
    width=5,
    color="blue",
    alpha=0.5,
    label="Precipitation (mm)",
)
ax2.set_ylabel("Precipitation (mm)", color="blue")
ax2.tick_params(axis="y", labelcolor="blue")

# The labels above are only shown if a legend is drawn. Merge the entries of
# both axes into one legend, placed on the top-most axes so the bars cannot
# cover it.
line_handles, line_labels = ax1.get_legend_handles_labels()
bar_handles, bar_labels = ax2.get_legend_handles_labels()
ax2.legend(line_handles + bar_handles, line_labels + bar_labels, loc="upper left")

# Title and layout
ax1.set_title("Temperature and Precipitation Over Time (Weekly Avg)")
fig.tight_layout()
plt.show()

In [0]:
# Check for abnormally high or low values in oil yield (barrels produced),
# temperature, well pressure and oil price.
# Only these columns are pulled into pandas for the visualization.
outlier_columns = ["yield_bbl", "temperature", "well_pressure", "oil_price"]
df_outliers = df.select(outlier_columns).toPandas()

# One box plot per feature on a 2x2 grid; axes are not shared, so each panel
# gets its own scale.
df_outliers.plot.box(
    subplots=True,
    layout=(2, 2),
    figsize=(10, 8),
    sharex=False,
    sharey=False,
)
plt.suptitle("Box Plot of Key Features")
plt.show()

In [0]:
# Let's look for some field correlation
import seaborn as sns

# Features to correlate; collected from Spark into a pandas DataFrame.
corr_columns = [
    "yield_bbl",
    "temperature",
    "precipitation",
    "humidity",
    "wind_speed",
    "well_pressure",
    "sand_quality",
    "drilling_efficiency",
    "oil_price",
]
df_corr = df.select(corr_columns).toPandas()

# Pairwise correlation matrix (pandas' default method, Pearson).
corr_matrix = df_corr.corr()

# Annotated heatmap: each cell shows the coefficient rounded to 2 decimals.
fig_corr, ax_corr = plt.subplots(figsize=(10, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    ax=ax_corr,
)
ax_corr.set_title("Feature Correlation Heatmap")
plt.show()

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Bring the columns into pandas for plotting.
df_pd = df.select("yield_bbl", "temperature", "precipitation").toPandas()
yield_values = df_pd["yield_bbl"]

# Kernel density estimate (smoothed distribution curve) of oil yield,
# drawn on its own figure.
fig_kde, ax_kde = plt.subplots(figsize=(10, 6))
sns.kdeplot(yield_values, label="Yield (BBL)", color="red", linewidth=2, ax=ax_kde)

# Axis labels, title, legend and grid
ax_kde.set_xlabel("Value")
ax_kde.set_ylabel("Density")
ax_kde.set_title("Bell Curve of Yield")
ax_kde.legend()
ax_kde.grid(True)

# Render the figure
plt.show()

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Bring the columns into pandas for plotting.
df_pd = df.select("yield_bbl", "temperature", "precipitation").toPandas()
temperature_values = df_pd["temperature"]

# Kernel density estimate (smoothed distribution curve) of temperature,
# drawn on its own figure.
fig_kde, ax_kde = plt.subplots(figsize=(10, 6))
sns.kdeplot(temperature_values, label="Temperature", color="green", linewidth=2, ax=ax_kde)

# Axis labels, title, legend and grid
ax_kde.set_xlabel("Value")
ax_kde.set_ylabel("Density")
ax_kde.set_title("Bell Curve of temperature")
ax_kde.legend()
ax_kde.grid(True)

# Render the figure
plt.show()

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Bring the columns into pandas for plotting.
df_pd = df.select("yield_bbl", "temperature", "precipitation").toPandas()
precipitation_values = df_pd["precipitation"]

# Kernel density estimate (smoothed distribution curve) of precipitation,
# drawn on its own figure.
fig_kde, ax_kde = plt.subplots(figsize=(10, 6))
sns.kdeplot(precipitation_values, label="Precipitation (mm)", color="blue", linewidth=2, ax=ax_kde)

# Axis labels, title, legend and grid
ax_kde.set_xlabel("Value")
ax_kde.set_ylabel("Density")
ax_kde.set_title("Bell Curve of precipitation")
ax_kde.legend()
ax_kde.grid(True)

# Render the figure
plt.show()