In [5]:

import pandas as pd
import numpy as np

data = {
    "Energy Source": ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"],
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan]
}

energy_df = pd.DataFrame(data)

print(energy_df)

  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                       NaN             400.0
2    Hydropower                    2900.0               NaN
3    Geothermal                       NaN             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0               NaN


In [6]:
#remove rows with any missing values
cleaned_df = energy_df.dropna()

print("\nData After Removing Rows with Missing Values:")
cleaned_df.head()


Data After Removing Rows with Missing Values:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
4,Biomass,2500.0,250.0


In [8]:
# Calculate the mean of 'Energy Consumption (MWh)'
mean_consumption = energy_df['Energy Consumption (MWh)'].mean()

# Fill NaN values with the mean
energy_df['Energy Consumption (MWh)'].fillna(mean_consumption, inplace=True)

energy_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df['Energy Consumption (MWh)'].fillna(mean_consumption, inplace=True)


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0
5,Nuclear,3200.0,


In [13]:
#forward fill missing values
forward_filled_df = energy_df.fillna(method='ffill')
print("\nData After Forward Filling:")
print(energy_df)
print("\nData After Forward Filling: ")
forward_filled_df.head()


Data After Forward Filling:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                    2450.0             400.0
2    Hydropower                    2900.0               NaN
3    Geothermal                    2450.0             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0               NaN

Data After Forward Filling: 


  forward_filled_df = energy_df.fillna(method='ffill')


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,400.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [14]:
#Create a flag column indicating missing values in 'Energy Consumption (MWh)'
energy_df['Missing_Consumption'] = energy_df['Energy Consumption (MWh)'].isnull().astype(int)
energy_df["Missing Cost"] = energy_df["Cost (Million $)"].isna().astype(int)

print("\nData with Missing Values Flagged:")
energy_df.head()


Data with Missing Values Flagged:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $),Missing_Consumption,Missing Cost
0,Solar,1200.0,200.0,0,0
1,Wind,2450.0,400.0,0,0
2,Hydropower,2900.0,,0,1
3,Geothermal,2450.0,150.0,0,0
4,Biomass,2500.0,250.0,0,0


In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]] = scaler.fit_transform(energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]])
print("\nData After Min-Max Scaling:")
print(energy_df)


Data After Min-Max Scaling:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0         Solar                     0.000               0.2   
1          Wind                     0.625               1.0   
2    Hydropower                     0.850               NaN   
3    Geothermal                     0.625               0.0   
4       Biomass                     0.650               0.4   
5       Nuclear                     1.000               NaN   

   Missing_Consumption  Missing Cost  
0                    0             0  
1                    0             0  
2                    0             1  
3                    0             0  
4                    0             0  
5                    0             1  


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]] = scaler.fit_transform(energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]])
print("\nData After Standard Scaling:")
print(energy_df)


Data After Standard Scaling:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0         Solar             -2.005893e+00         -0.534522   
1          Wind              3.563181e-16          1.603567   
2    Hydropower              7.221213e-01               NaN   
3    Geothermal              3.563181e-16         -1.069045   
4       Biomass              8.023570e-02          0.000000   
5       Nuclear              1.203536e+00               NaN   

   Missing_Consumption  Missing Cost  
0                    0             0  
1                    0             0  
2                    0             1  
3                    0             0  
4                    0             0  
5                    0             1  


In [18]:
 #One-hot encode the 'Energy Source' column

 energy_encoded_df = pd.get_dummies(energy_df, columns=['Energy Source'])
 print("\nData After One-Hot Encoding:")
 print(energy_encoded_df)


Data After One-Hot Encoding:
   Energy Consumption (MWh)  Cost (Million $)  Missing_Consumption  \
0             -2.005893e+00         -0.534522                    0   
1              3.563181e-16          1.603567                    0   
2              7.221213e-01               NaN                    0   
3              3.563181e-16         -1.069045                    0   
4              8.023570e-02          0.000000                    0   
5              1.203536e+00               NaN                    0   

   Missing Cost  Energy Source_Biomass  Energy Source_Geothermal  \
0             0                  False                     False   
1             0                  False                     False   
2             1                  False                     False   
3             0                  False                      True   
4             0                   True                     False   
5             1                  False                     False   

  