In [5]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re

### Data Collection ###

# Web Scraping
def fetch_web_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        return soup
    else:
        print(f"Error: {response.status_code}")
        return None

# CSV Data Collection
def fetch_csv_data(csv_url):
    """
    Fetches CSV data from a URL or local path.
    """
    df = pd.read_csv(csv_url)
    return df

### Data Transformation ###

def transform_dataframe(df):
    """
    This function handles:
    - Missing values
    - Renaming columns
    - Creating new features
    - Scaling numerical data
    - Aggregating data
    """
    # 1. Handle Missing Values
    df.fillna(method='ffill', inplace=True)  # Forward fill missing values
    df.fillna(0, inplace=True)  # Replace remaining NaNs with 0

    # Alternatively, drop rows/columns with missing values
    # df.dropna(axis=0, inplace=True)  # Drop rows with missing values
    
   
    # 3. Filtering Data
    # Example: Filter rows where a column value is greater than a threshold
    df = df[df['numeric_column'] > 100]
    
    # Example: Select specific columns
    df = df[['column_1', 'column_2', 'numeric_column']]
    
    # 4. Feature Engineering (Creating new columns)
    # Example: Create a new column based on existing columns
    df['new_feature'] = df['numeric_column'] * 1.5  # Simple transformation
    df['category'] = df['numeric_column'].apply(lambda x: 'High' if x > 200 else 'Low')  # Categorizing
    
    # 5. Aggregating Data
    # Example: Group by a column and calculate the sum of another column
    df_agg = df.groupby('category').agg({'numeric_column': 'sum'}).reset_index()

    # 6. Normalize or Scale Numerical Data
    # Example: Normalize numeric data using Min-Max Scaling
    df['scaled_numeric_column'] = (df['numeric_column'] - df['numeric_column'].min()) / (df['numeric_column'].max() - df['numeric_column'].min())

    return df

### Data Cleaning ###

def clean_data(df):
    """
    This function performs:
    - Removing duplicates
    - Handling invalid data with regex
    """
    # 1. Remove duplicates
    df.drop_duplicates(inplace=True)
    
    # 2. Remove rows with invalid data (e.g., rows with 'InvalidValue')
    df = df[df['column_name'] != "InvalidValue"]
    
    # 3. Cleaning text data with regex (e.g., removing special characters)
    df['text_column'] = df['text_column'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
    
    return df

### Data Pipeline ###

def data_pipeline(csv_url=None):
    """
    Executes the entire pipeline:
    - Data collection (CSV)
    - Data transformation
    - Data cleaning
    - Data aggregation and saving
    """
    
    # Step 1: Data Collection (CSV)
    df_csv = fetch_csv_data(csv_url)
    
    # Step 2: Data Transformation
    df_transformed = transform_dataframe(df_csv)
    
    # Step 3: Data Cleaning
    df_cleaned = clean_data(df_transformed)
    
    # Step 4: Save Cleaned Data
    df_cleaned.to_csv("cleaned_transformed_data.csv", index=False)
    
    return df_cleaned

### Example Usage ###
csv_url = "CSV/installed-solar-pv-capacity.csv"

# Run the pipeline
final_data = data_pipeline(csv_url=csv_url)

# Show first few rows of the final cleaned data
print(final_data.head())


  df.fillna(method='ffill', inplace=True)  # Forward fill missing values


KeyError: 'numeric_column'