In [60]:
import pandas as pd
# Load the merged technical dataset from the notebook-relative data folder.
# index_col=None means no CSV column is promoted to the index, so any index
# columns saved in the file arrive as ordinary columns.
df = pd.read_csv("./data/RE_technical_merged_2.csv",index_col = None)

In [61]:
print(len(df))

6732


### Selecting only the wind records, identified by 'Onshore' in the `Technology Type` column

In [62]:
# Keep only the rows whose 'Technology Type' is 'Onshore'. The explicit .copy()
# makes df_wind an independent frame, so column assignments made on it in later
# cells cannot raise SettingWithCopyWarning or be confused with writes to df.
df_wind = df[df['Technology Type']=='Onshore'].copy()

In [63]:
df_wind.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Date Last Researched', 'Country/Area',
       'Project Name', 'Phase Name', 'Project Name in Local Language / Script',
       'Other Name(s)', 'Capacity (MW)', 'Capacity Rating', 'Technology Type',
       'Status', 'Start year', 'Retired year', 'Operator',
       'Operator Name in Local Language / Script', 'Owner',
       'Owner Name in Local Language / Script', 'Hydrogen', 'Latitude',
       'Longitude', 'Location accuracy', 'City', 'Local area (taluk, county)',
       'Major area (prefecture, district)', 'State/Province', 'Subregion',
       'Region', 'GEM location ID', 'GEM phase ID', 'Other IDs (location)',
       'Other IDs (unit/phase)', 'Wiki URL', 'PPA Tariff', 'PLF',
       'PPA Tariff_dq_1', 'PPA Offtaker', 'PPA Tenure', 'PLF_dq_1'],
      dtype='object')

In [64]:
len(df_wind)

745

### Checking for the mandatory columns

In [65]:
# Collect every mandatory column label that is absent from df_wind.
missing_columns = [
    name
    for name in ('Country/Area', 'Status', 'Technology Type')
    if name not in df_wind.columns
]

# Report the outcome: list the absent labels, or confirm that none are missing.
print(
    f"Warning: The following columns are missing from the DataFrame: {missing_columns}"
    if missing_columns
    else "No Missing Columns in the data"
)

No Missing Columns in the data


### Dropping columns based on either a list of columns to drop or a list of columns to keep

In [66]:
def _has_columns(column_names):
    """Return True when ``column_names`` is a non-empty selection of columns."""
    if column_names is None:
        return False
    try:
        # len() works for lists, tuples, pd.Index and numpy arrays alike, whereas
        # bool() on an Index/ndarray raises "truth value ... is ambiguous".
        return len(column_names) > 0
    except TypeError:
        # Objects without a length keep the plain truthiness behaviour.
        return bool(column_names)


def filter_columns(df, columns_to_drop=None, columns_to_keep=None):
    """
    Return a DataFrame with columns removed, selected by a drop list or a keep list.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        columns_to_drop (list-like, optional): Column names to remove. Names that
            are not present in ``df`` are ignored.
        columns_to_keep (list-like, optional): Column names to retain, in the
            order given. ``None`` or an empty selection means "not specified".

    Returns:
        pd.DataFrame: A new DataFrame without the dropped columns, or with only
        the kept columns. When neither selection is given (or both are empty),
        the input ``df`` object itself is returned unchanged.

    Raises:
        ValueError: If both columns_to_drop and columns_to_keep are non-empty.
        KeyError: If columns_to_keep names a column that ``df`` does not have.
    """
    drop_requested = _has_columns(columns_to_drop)
    keep_requested = _has_columns(columns_to_keep)

    if drop_requested and keep_requested:
        raise ValueError("Specify either 'columns_to_drop' or 'columns_to_keep', not both.")

    if drop_requested:
        # errors='ignore' lets one drop list be reused on frames that lack some of the columns.
        return df.drop(columns=columns_to_drop, errors='ignore')

    if keep_requested:
        # Keep only the specified columns.
        return df[columns_to_keep]

    # If neither is provided, return the DataFrame unchanged.
    return df


# Column labels to remove from the wind subset before further processing.
# Labels that are absent from the frame are ignored by filter_columns.
columns_to_drop = [
    'Unnamed: 0',
    'Unnamed: 0.1',
    'Date Last Researched',
    'Project Name in Local Language / Script',
    'Retired year',
    'Operator',
    'Operator Name in Local Language / Script',
    'Owner Name in Local Language / Script',
    'Hydrogen',
    'Other IDs (location)',
    'Other IDs (unit/phase)',
]

# Rebind df_wind to the reduced frame, then summarise the remaining columns.
df_wind = filter_columns(df_wind, columns_to_drop=columns_to_drop)
df_wind.info()

<class 'pandas.core.frame.DataFrame'>
Index: 745 entries, 0 to 6727
Data columns (total 28 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Country/Area                       745 non-null    object 
 1   Project Name                       745 non-null    object 
 2   Phase Name                         745 non-null    object 
 3   Other Name(s)                      101 non-null    object 
 4   Capacity (MW)                      745 non-null    float64
 5   Capacity Rating                    0 non-null      object 
 6   Technology Type                    745 non-null    object 
 7   Status                             745 non-null    object 
 8   Start year                         408 non-null    float64
 9   Owner                              741 non-null    object 
 10  Latitude                           745 non-null    float64
 11  Longitude                          745 non-null    float64
 12

### Manually renaming selected columns and converting the remaining column names to snake_case for consistency

In [67]:
# Trim leading/trailing whitespace from every column label so that the
# exact-match lookups against manual_adjustments in the next cell are not
# defeated by stray spaces.
df_wind.columns = df_wind.columns.str.strip()

In [68]:
# Dictionary for manual adjustments: maps an original column label (key) to the
# exact final name (value). Labels listed here bypass the automatic snake_case
# conversion, which would otherwise leave parentheses and commas in the name.
# NOTE(review): two values look misspelled relative to their keys —
# 'local_area_taluk_country' ("county" in the key) and
# 'major_area_perfecture_district' ("prefecture" in the key). Confirm whether
# downstream code already depends on these names before correcting them.
manual_adjustments = {
    'Other Name(s)': 'other_names',
    'Capacity (MW)': 'capacity_mw',
    'Local area (taluk, county)': 'local_area_taluk_country',
    'Major area (prefecture, district)':'major_area_perfecture_district'
    }

def _underscore(name):
    """
    Lower-case ``name`` and insert underscores at camel-case boundaries.

    Applies the same three rules as ``inflection.underscore`` (acronym boundary,
    lower-to-upper boundary, hyphen to underscore) using only the standard
    library, so the notebook no longer depends on a module it never imports.
    """
    import re  # function-scope import keeps this block runnable on a fresh kernel

    # "HTTPResponse" -> "HTTP_Response": split an acronym from the word that follows it.
    name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", name)
    # "deviceType" -> "device_Type": split where a lower-case letter or digit meets a capital.
    name = re.sub(r"([a-z\d])([A-Z])", r"\1_\2", name)
    return name.replace("-", "_").lower()


# Function to convert column names to snake_case with manual adjustments
def convert_to_snake_case(columns, adjustments):
    """
    Convert column labels to snake_case, honouring explicit overrides.

    Args:
        columns (iterable of str): The original column labels.
        adjustments (dict): Maps an original label to the exact name to use for
            it. A label found here is taken verbatim from the mapping.

    Returns:
        list of str: One new name per input label, in the same order. Labels
        without an override have "/" replaced by "_or_" and spaces replaced by
        "_", and are then lower-cased with camel-case boundaries underscored.
    """
    return [
        adjustments[col]
        if col in adjustments
        else _underscore(col.replace("/", "_or_").replace(" ", "_"))
        for col in columns
    ]

# Apply the function to the DataFrame columns
df_wind.columns = convert_to_snake_case(df_wind.columns, manual_adjustments)

# Display the DataFrame with new column names
print(df_wind.columns)

Index(['country_or_area', 'project_name', 'phase_name', 'other_names',
       'capacity_mw', 'capacity_rating', 'technology_type', 'status',
       'start_year', 'owner', 'latitude', 'longitude', 'location_accuracy',
       'city', 'local_area_taluk_country', 'major_area_perfecture_district',
       'state_or_province', 'subregion', 'region', 'gem_location_id',
       'gem_phase_id', 'wiki_url', 'ppa_tariff', 'plf', 'ppa_tariff_dq_1',
       'ppa_offtaker', 'ppa_tenure', 'plf_dq_1'],
      dtype='object')


### Adding Null Indicator columns to "start_year", "owner", "ppa_tenure", "ppa_offtaker" columns

In [71]:
import pandas as pd
import numpy as np

def add_null_indicator_columns(df, columns, suffix="_is_empty", null_value_label="Yes", not_null_value_label="No"):
    """
    Append a label column flagging missing values for each listed column.

    For every name in ``columns`` a companion column called ``<name><suffix>``
    is written onto ``df``. It holds ``null_value_label`` where the source value
    is null and ``not_null_value_label`` everywhere else. The columns are
    assigned directly on ``df``, so the caller's DataFrame is modified in place.

    Args:
        df (pd.DataFrame): The DataFrame to annotate (mutated in place).
        columns (list): Names of the columns to inspect for null values.
        suffix (str): Text appended to each column name to form the indicator
            column's name (default: "_is_empty").
        null_value_label (str): Value stored where the source is null (default: "Yes").
        not_null_value_label (str): Value stored where the source is not null (default: "No").

    Returns:
        pd.DataFrame: The same ``df`` object, now carrying the indicator columns.
    """
    for source_name in columns:
        is_missing = df[source_name].isnull()
        indicator_name = "{}{}".format(source_name, suffix)
        # np.where selects one of the two labels element-wise from the null mask.
        df[indicator_name] = np.where(is_missing, null_value_label, not_null_value_label)
    return df
columns_to_check = ["start_year", "owner", "ppa_tenure", "ppa_offtaker"]

# Add null indicator columns
df_wind = add_null_indicator_columns(df_wind, columns_to_check)

df_wind.info()

### Replacing the phase name value `--` with 1

In [77]:
# Replace the literal value '--' in phase_name with the integer 1.
# NOTE(review): the replacement is an int while the column was loaded as object
# dtype, so the column may end up holding a mix of strings and ints — confirm
# that is intended rather than the string '1'.
if 'phase_name' in df_wind.columns:
    df_wind['phase_name'] = df_wind['phase_name'].replace('--', 1)  
else:
    # Fallback: warn, then create the column filled with None so that later
    # cells that reference phase_name still find it.
    # NOTE(review): the message names 'Phase Name' although the label actually
    # checked is 'phase_name'.
    print("Warning: 'Phase Name' column not found in the dataset.")
    df_wind['phase_name'] = None

In [78]:
df_wind

Unnamed: 0,country_or_area,project_name,phase_name,other_names,capacity_mw,capacity_rating,technology_type,status,start_year,owner,...,ppa_tariff,plf,ppa_tariff_dq_1,ppa_offtaker,ppa_tenure,plf_dq_1,start_year_is_empty,owner_is_empty,ppa_tenure_is_empty,ppa_offtaker_is_empty
0,India,ABC Hybrid SECI III wind farm,,,190.0,,Onshore,pre-construction,,ABC Renewable Energy,...,,18.82,,,,,Yes,No,Yes,Yes
1,India,ACME Hybrid SECI III wind farm,,,45.0,,Onshore,pre-construction,,ACME Solar Holdings PVT LTD,...,,,,,,,Yes,No,Yes,Yes
2,India,AMP Hybrid SECI III wind farm,,,65.0,,Onshore,pre-construction,,AMP Energy,...,,18.82,,,,,Yes,No,Yes,Yes
3,India,AMP Hybrid SECI XII wind farm,,,60.0,,Onshore,pre-construction,,AMP Energy,...,,18.82,,,,,Yes,No,Yes,Yes
6,India,Aeolus 1 wind farm,,,24.0,,Onshore,operating,,Ecoren Energy,...,,29.40,,,,,Yes,No,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6715,India,Yelisirur wind farm,,,26.0,,Onshore,operating,2012.0,Bhoruka Power CORP LTD,...,,30.01,,,,,No,No,Yes,Yes
6716,India,Yelisirur wind farm,,,42.0,,Onshore,operating,,Bhoruka Power CORP LTD,...,,30.01,,,,,Yes,No,Yes,Yes
6720,India,Yermala wind farm,,,149.0,,Onshore,operating,,CLP India PVT LTD,...,2.41,24.62,,,,,Yes,No,Yes,Yes
6726,India,Zaheerabad wind farm,,,32.0,,Onshore,operating,,Hero Group,...,,32.99,,,,,Yes,No,Yes,Yes
