In [13]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [14]:
# Read the scraped product/ingredient data from its CSV file into a DataFrame
ingredients_csv_path = 'C:/Users/celia/Documents/IronhackDataAnalysis/MidProject/combined_product_data.csv'
ingredients_df = pd.read_csv(ingredients_csv_path)

In [15]:
def handle_duplicates(df):
    """
    Drop fully duplicated rows from a dataframe, reporting the counts.

    The number of duplicated rows is printed before and after the cleanup.
    The input dataframe itself is left untouched.

    Args:
    df (pd.DataFrame): Dataframe that may contain repeated rows.

    Returns:
    pd.DataFrame: A new dataframe without duplicated rows, re-indexed from 0.
    """
    # How many rows repeat an earlier row?
    dup_count_before = df.duplicated().sum()
    print(f"Number of duplicated rows before cleaning: {dup_count_before}")

    # Keep the first occurrence of every row, then renumber the index
    deduplicated = df.drop_duplicates()
    deduplicated = deduplicated.reset_index(drop=True)

    # Sanity check: nothing should be flagged as duplicated any more
    dup_count_after = deduplicated.duplicated().sum()
    print(f"Number of duplicated rows after cleaning: {dup_count_after}")

    return deduplicated

In [16]:
def handle_null_values(df):
    """
    Report null counts and drop the rows in which every column is null.

    No values are filled in: a row that has at least one non-null column is
    kept unchanged, so partially missing rows still appear in the result.
    The per-column null counts are printed before and after the drop.

    Args:
    df (pd.DataFrame): The input dataframe with potential null values.

    Returns:
    pd.DataFrame: The dataframe without its fully-null rows.
    """
    # Per-column null tally of the incoming frame
    nulls_before = df.isna().sum()
    print("Number of null values in each column before handling:")
    print(nulls_before)

    # Discard only the rows that are empty in every column
    without_empty_rows = df.dropna(how='all')

    # Show the columns that still contain nulls after the drop
    nulls_after = without_empty_rows.isna().sum()
    still_missing = nulls_after[nulls_after > 0]
    print("\nNumber of null values in each column after handling:")
    print(still_missing)

    return without_empty_rows

In [17]:
# Shape of the dataframe obtained from web scraping
n_rows, n_columns = ingredients_df.shape
print((n_rows, n_columns))
print("Number of rows:", n_rows)  # Number of rows
print("Number of columns:", n_columns)  # Number of columns

(5146, 10)
Number of rows: 5146
Number of columns: 10


In [12]:
# Cleaning pipeline: first drop the fully-null rows, then remove duplicated rows
ingredients_df = ingredients_df.pipe(handle_null_values)
ingredients_df_cleaned = ingredients_df.pipe(handle_duplicates)

Number of null values in each column before handling:
subcategory1       1
subcategory2       5
subcategory3      37
product_name      98
brand_name         1
rating          1245
review_count      99
price             58
ingredients      274
product_url        0
dtype: int64

Number of null values in each column after handling:
subcategory1       1
subcategory2       5
subcategory3      37
product_name      98
brand_name         1
rating          1245
review_count      99
price             58
ingredients      274
dtype: int64
Number of duplicated rows before cleaning: 489
Number of duplicated rows after cleaning: 0


In [18]:

# Continue from the cleaned product data instead of re-reading the raw CSV:
# reloading the file would bring back the duplicated rows that
# handle_duplicates removed. Work on a copy so that the columns added below
# do not modify ingredients_df_cleaned.
ingredients_df = ingredients_df_cleaned.copy()

# Load the endocrine disruptors dataset from CSV
endocrine_disruptors_df = pd.read_csv('C:/Users/celia/Documents/IronhackDataAnalysis/MidProject/endocrine-disruptor-assessment-echa_ingredient_name.csv')

# The disruptor names are in the 'ingredient_name' column. Missing names are
# skipped and repeated names are kept only once (first occurrence), so a
# disruptor cannot be reported or counted twice for the same product.
endocrine_disruptors = endocrine_disruptors_df['ingredient_name'].dropna().unique().tolist()

# Function to find disruptors in the ingredients
def find_disruptors(ingredients, disruptors, threshold=90):
    """
    Return the disruptor names that fuzzy-match at least one listed ingredient.

    Args:
    ingredients (str): Ingredient list as one string, items separated by ', '.
        Anything that is not a string (e.g. NaN or None) or is blank is
        treated as "no ingredients".
    disruptors (iterable of str): Candidate disruptor names to look for.
    threshold (int): A disruptor is reported when its best
        fuzz.partial_ratio score against the ingredients is strictly greater
        than this value. Defaults to 90.

    Returns:
    list: The matching disruptors, in the order they appear in `disruptors`.
    """
    # Missing values and blank strings carry no ingredients to match against
    if not isinstance(ingredients, str) or not ingredients.strip():
        return []

    # Split once up front instead of once per disruptor
    ingredient_list = ingredients.split(', ')

    found = []
    for disruptor in disruptors:
        best_match = process.extractOne(disruptor, ingredient_list, scorer=fuzz.partial_ratio)
        # best_match is a (choice, score) pair; guard against None, which
        # extractOne returns when it has no match to report
        if best_match is not None and best_match[1] > threshold:
            found.append(disruptor)
    return found

# Make sure every value in 'ingredients' is a string. Missing values are turned
# into '' first: astype(str) alone would turn NaN into the text 'nan', which
# partial matching can then pair with any disruptor name containing "nan".
ingredients_df['ingredients'] = ingredients_df['ingredients'].fillna('').astype(str)

# Fuzzy matching is the slow step, so run it once per distinct ingredient list
# and reuse the result for every row that shares that list.
disruptors_by_ingredients = {
    ingredient_text: find_disruptors(ingredient_text, endocrine_disruptors)
    for ingredient_text in ingredients_df['ingredients'].unique()
}

# Apply the results to the DataFrame (each row gets its own list object)
ingredients_df['disruptors'] = ingredients_df['ingredients'].map(
    lambda ingredient_text: list(disruptors_by_ingredients[ingredient_text])
)
ingredients_df['has_disruptors'] = ingredients_df['disruptors'].map(bool)
ingredients_df['count_disruptors'] = ingredients_df['disruptors'].map(len)

# Check the result (first rows only, rather than the whole frame)
print(ingredients_df.head())

KeyboardInterrupt: 