In [10]:
import pandas as pd

# Step 1: Read the wine review CSV into a DataFrame.
file_path = '/Users/annelise/Documents/GitHub/Wine_tasting_KG/data_kaggle/'
wine_data = pd.read_csv(file_path + 'winemag-data-130k-v2.csv')

# Report how many rows and columns were loaded.
print("Original data shape:", wine_data.shape)

# Step 2: Data cleaning and preprocessing.

# Restrict the frame to the columns used by the rest of this cell.
columns_to_keep = [
    'country',
    'description',
    'points',
    'price',
    'title',
    'province',
    'variety',
    'winery',
    'taster_name',
]
wine_data_sel = wine_data.loc[:, columns_to_keep]
print("Selected data shape:", wine_data_sel.shape)

# Count the missing entries in each selected column.
missing_values = wine_data_sel.isna().sum()
print("Missing values in each column:\n", missing_values)

# Keep only the rows that are complete in every selected column.
wine_data_cleaned = wine_data_sel.dropna(axis=0, how='any')
print("Shape of cleaned data (after dropping NAs):", wine_data_cleaned.shape)

# Rows count as duplicates when they agree on all of these columns;
# 'description' and 'points' are deliberately not part of the key.
duplicate_criteria = [
    'country',
    'province',
    'variety',
    'winery',
    'price',
    'taster_name',
    'title',
]
# keep=False flags every member of a duplicate group, not just the repeats.
is_duplicate = wine_data_cleaned.duplicated(subset=duplicate_criteria, keep=False)
duplicates = wine_data_cleaned.loc[is_duplicate]

# Sort so members of the same group sit next to each other, then preview
# up to ten rows from each group.
duplicates_sorted = duplicates.sort_values(by=duplicate_criteria)
grouped_duplicates = duplicates_sorted.groupby(by=duplicate_criteria)
print('duplicates snapshot', grouped_duplicates.head(10))

# Step 3: Collapse every group of duplicate rows into a single row.
#
# Rows are duplicates when they agree on all columns in `duplicate_criteria`.
# Each group is replaced by exactly one row that:
#   * keeps the shared key columns (including 'title' and 'taster_name'),
#   * averages 'points' over the group,
#   * joins the group's distinct descriptions with a space.
# Groups whose descriptions are all identical are collapsed too, so the wine
# stays in the dataset instead of being dropped together with its duplicates.

# Number of surplus rows removed, split by whether the group's descriptions
# were all identical ("perfect match") or not.
perfect_match_count = 0
different_description_count = 0

# One dict per duplicate group; becomes the replacement row for that group.
to_merge = []

if not duplicates.empty:
    print("Found duplicates based on key wine characteristics:")
    # Sorting first fixes the order in which descriptions are concatenated.
    duplicates_sorted = duplicates.sort_values(by=duplicate_criteria)
    duplicates_grouped = duplicates_sorted.groupby(duplicate_criteria)

    # `duplicates` was built with keep=False, so every group has >= 2 rows.
    for _, group in duplicates_grouped:
        surplus_rows = len(group) - 1
        descriptions = group['description'].unique()
        if len(descriptions) == 1:
            perfect_match_count += surplus_rows
        else:
            different_description_count += surplus_rows

        # Key columns are identical within a group, so the first row's
        # values represent the whole group.
        merged_row = {
            column: group[column].iloc[0] for column in duplicate_criteria
        }
        merged_row['points'] = group['points'].mean()
        # Join only distinct descriptions so repeated text is not duplicated.
        merged_row['description'] = " ".join(descriptions)
        to_merge.append(merged_row)

    # Remove every member of every duplicate group from the cleaned data ...
    wine_data_cleaned = wine_data_cleaned.drop_duplicates(
        subset=duplicate_criteria, keep=False
    )

    # ... and add back one consolidated row per group. concat aligns on
    # column names, so the column order of the cleaned data is preserved.
    merged_df = pd.DataFrame(to_merge)
    wine_data_cleaned = pd.concat([wine_data_cleaned, merged_df], ignore_index=True)

    print("Shape of cleaned data (after combining duplicates):", wine_data_cleaned.shape)

else:
    print("No duplicates found based on the specified criteria.")

# Final shape of the cleaned dataset
print("Final shape of cleaned data:", wine_data_cleaned.shape)

# Report the number of perfect match duplicates and different description cases
print(f"Number of perfect match duplicates: {perfect_match_count}")
print(f"Number of different description duplicates (merged): {different_description_count}")

# Save the cleaned and processed data to a new CSV file
wine_data_cleaned.to_csv(file_path + 'cleaned_wine_data.csv', index=False)


Original data shape: (129971, 14)
Selected data shape: (129971, 9)
Missing values in each column:
 country           63
description        0
points             0
price           8996
title              0
province          63
variety            1
winery             0
taster_name    26244
dtype: int64
Shape of cleaned data (after dropping NAs): (96420, 9)
          country                                        description  points  \
79319   Argentina  Ripe aromas of blackberry and black currant co...      89   
82050   Argentina  Ripe aromas of blackberry and black currant co...      89   
35544   Argentina  Ripe raisiny aromas of spiced fruit cake are h...      87   
70721   Argentina  Ripe raisiny aromas of spiced fruit cake are h...      87   
700     Argentina  Intriguing aromas of lavender, hoja santa and ...      91   
...           ...                                                ...     ...   
72049     Uruguay  Rusty in color, this maturing Tannat-led blend...      87   
1110

ZeroDivisionError: division by zero