In [11]:
import pandas as pd
import numpy as np

data = {
    "Energy Source": ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"],
    "Energy Consumption (Mwh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan]
}
energy_df = pd.DataFrame(data)

print("Original Energy Data with Missing Values:")
print(energy_df)

Original Energy Data with Missing Values:
  Energy Source  Energy Consumption (Mwh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                       NaN             400.0
2    Hydropower                    2900.0               NaN
3    Geothermal                       NaN             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0               NaN


In [12]:

cleaned_df = energy_df.dropna()

print("\nData After Removing Rows with Missing Values:")
print(cleaned_df)



Data After Removing Rows with Missing Values:
  Energy Source  Energy Consumption (Mwh)  Cost (Million $)
0         Solar                    1200.0             200.0
4       Biomass                    2500.0             250.0


In [13]:

forward_filled_df = energy_df.fillna(method="ffill")

print("\nData After Forward Filling:")
print(forward_filled_df)



Data After Forward Filling:
  Energy Source  Energy Consumption (Mwh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                    1200.0             400.0
2    Hydropower                    2900.0             400.0
3    Geothermal                    2900.0             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0             250.0


  forward_filled_df = energy_df.fillna(method="ffill")


In [14]:
# Impute missing values in 'Energy Consumption (MWh)' with the mean
energy_df["Energy Consumption (Mwh)"].fillna(energy_df["Energy Consumption (Mwh)"].mean(), inplace=True)

# Impute missing values in 'Cost (Million $)' with the mean
energy_df["Cost (Million $)"].fillna(energy_df["Cost (Million $)"].mean(), inplace=True)

print("\nData After Imputing Missing Values with Mean:")
print(energy_df)



Data After Imputing Missing Values with Mean:
  Energy Source  Energy Consumption (Mwh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                    2450.0             400.0
2    Hydropower                    2900.0             250.0
3    Geothermal                    2450.0             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0             250.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Energy Consumption (Mwh)"].fillna(energy_df["Energy Consumption (Mwh)"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Cost (Million $)"].fillna(energy_df["Cost (Million $)"].mean(), inplace=True)


In [15]:
# Create a flag column indicating missing values in 'Energy Consumption (MWh)'
energy_df['Missing Consumption (Mwh'] = energy_df['Energy Consumption (Mwh)'].isna().astype(int)

print("\nData with Missing Values Flagged:")
print(energy_df)


Data with Missing Values Flagged:
  Energy Source  Energy Consumption (Mwh)  Cost (Million $)  \
0         Solar                    1200.0             200.0   
1          Wind                    2450.0             400.0   
2    Hydropower                    2900.0             250.0   
3    Geothermal                    2450.0             150.0   
4       Biomass                    2500.0             250.0   
5       Nuclear                    3200.0             250.0   

   Missing Consumption (Mwh  
0                         0  
1                         0  
2                         0  
3                         0  
4                         0  
5                         0  


In [16]:
from sklearn.preprocessing import StandardScaler

# Normalize the 'Energy Consumption (Mwh)' and 'Cost (Million $)'
scaler = StandardScaler()
energy_df[["Energy Consumption (Mwh)", "Cost (Million $)"]] = scaler.fit_transform(
    energy_df[["Energy Consumption (Mwh)", "Cost (Million $)"]]
)
print(energy_df)

  Energy Source  Energy Consumption (Mwh)  Cost (Million $)  \
0         Solar                 -2.005893         -0.654654   
1          Wind                  0.000000          1.963961   
2    Hydropower                  0.722121          0.000000   
3    Geothermal                  0.000000         -1.309307   
4       Biomass                  0.080236          0.000000   
5       Nuclear                  1.203536          0.000000   

   Missing Consumption (Mwh  
0                         0  
1                         0  
2                         0  
3                         0  
4                         0  
5                         0  


In [18]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the 'Energy Consumption (MWh)' and 'Cost (Million $)'
scaler = MinMaxScaler()
energy_df[['Energy Consumption (Mwh)', 'Cost (Million $)']] = scaler.fit_transform(
    energy_df[['Energy Consumption (Mwh)', 'Cost (Million $)']]
)

print("\nData After Normalization (Min-Max Scaling):")
print(energy_df)


Data After Normalization (Min-Max Scaling):
  Energy Source  Energy Consumption (Mwh)  Cost (Million $)  \
0         Solar                     0.000               0.2   
1          Wind                     0.625               1.0   
2    Hydropower                     0.850               0.4   
3    Geothermal                     0.625               0.0   
4       Biomass                     0.650               0.4   
5       Nuclear                     1.000               0.4   

   Missing Consumption (Mwh  
0                         0  
1                         0  
2                         0  
3                         0  
4                         0  
5                         0  


In [20]:

energy_encoded_df = pd.get_dummies(energy_df, columns=["Energy Source"])

print("\nData After One-Hot Encoding Categorical Variables:")
print(energy_encoded_df)


Data After One-Hot Encoding Categorical Variables:
   Energy Consumption (Mwh)  Cost (Million $)  Missing Consumption (Mwh  \
0                     0.000               0.2                         0   
1                     0.625               1.0                         0   
2                     0.850               0.4                         0   
3                     0.625               0.0                         0   
4                     0.650               0.4                         0   
5                     1.000               0.4                         0   

   Energy Source_Biomass  Energy Source_Geothermal  Energy Source_Hydropower  \
0                  False                     False                     False   
1                  False                     False                     False   
2                  False                     False                      True   
3                  False                      True                     False   
4                   Tr

In [21]:

energy_encoded_df["Consumption per $Million"] = energy_encoded_df["Energy Consumption (Mwh)"] / energy_encoded_df["Cost (Million $)"]

print("Data with New Feature (Consumption per $Million):")
print(energy_encoded_df)

Data with New Feature (Consumption per $Million):
   Energy Consumption (Mwh)  Cost (Million $)  Missing Consumption (Mwh  \
0                     0.000               0.2                         0   
1                     0.625               1.0                         0   
2                     0.850               0.4                         0   
3                     0.625               0.0                         0   
4                     0.650               0.4                         0   
5                     1.000               0.4                         0   

   Energy Source_Biomass  Energy Source_Geothermal  Energy Source_Hydropower  \
0                  False                     False                     False   
1                  False                     False                     False   
2                  False                     False                      True   
3                  False                      True                     False   
4                   True