# Load Dataset

In [2]:
import pandas as pd

# Load the dataset from the CSV file
df = pd.read_csv("apple_quality.csv")

# Drop the row whose 'Acidity' cell holds the author-credit string instead of a
# number. The explicit .copy() makes the filtered frame independent of the
# original, so the column assignment below does not raise
# SettingWithCopyWarning.
df = df[df['Acidity'] != 'Created_by_Nidula_Elgiriyewithana'].copy()

# Convert the Acidity column to a numeric dtype
# (pd.to_numeric raises if any other non-numeric value is still present)
df['Acidity'] = pd.to_numeric(df['Acidity'])

# df.info() prints column names, non-null counts and data types;
# df.shape, as the last expression of the cell, shows (row count, column count)
df.info()
df.shape

FileNotFoundError: [Errno 2] No such file or directory: 'apple_quality.csv'

# Raw Data Distribution

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Draw one histogram (with a KDE overlay) per feature on a 3x3 subplot grid
features_to_plot = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']
plt.figure(figsize=(12, 8))
for panel, column in enumerate(features_to_plot, start=1):
    plt.subplot(3, 3, panel)
    sns.histplot(df[column], kde=True)
    plt.title(column)

plt.tight_layout()
plt.show()

# Summary statistics (mean, median, standard deviation) of the raw features
raw_features = df[features_to_plot]
print("\nmean:\n{} , \nmedian:\n{} , \nstd:\n {}".format(raw_features.mean(), raw_features.median(), raw_features.std()))

# Standardization without Removing Outliers

In [None]:
# X holds the seven numeric features that are standardized below
X = df[['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']]

# Per-column mean and standard deviation used for the z-score
data_mean, data_std = X.mean(), X.std()

# z-score normalization: centre each column on its mean, then scale by its std
centered = X - data_mean
normalized_X_zscore = centered / data_std

# Histogram (with KDE overlay) of every standardized feature on a 3x3 grid
plt.figure(figsize=(12, 8))
for panel, column in enumerate(features_to_plot, start=1):
    plt.subplot(3, 3, panel)
    sns.histplot(normalized_X_zscore[column], kde=True)
    plt.title(column)

plt.tight_layout()
plt.show()


# Summary statistics of the standardized features
zscore_mean = normalized_X_zscore.mean()
zscore_median = normalized_X_zscore.median()
zscore_std = normalized_X_zscore.std()
print("\nmean:\n{} , \nmedian:\n{} , \nstd:\n {}".format(zscore_mean, zscore_median, zscore_std))


# Finding Outliers

In [None]:
# Compute Q1, Q2 (the median), Q3, the IQR, and the minimum and maximum of
# every feature column in X
Q1 = X.quantile(0.25)
Q2 = X.quantile(0.5)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1
# Deliberately NOT named `min` / `max`: assigning to those names would shadow
# Python's built-in min() and max() for every later cell run in this kernel.
feature_min = X.min()
feature_max = X.max()

print("\nQ1:\n{} , \nQ2:\n{} , \nQ3:\n{} , \nIQR:\n{} , \nmin:\n{} , \nmax:\n{}".format(Q1, Q2, Q3, IQR, feature_min, feature_max))

# A value is flagged as an outlier when it lies below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR; summing the boolean frame gives the outlier count per feature
outliers = (X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))
print("\nOutliers:\n", outliers.sum())

In [None]:
# Box-and-whisker plot of every numeric feature; the points drawn beyond the
# whiskers are the outliers of that feature
boxplot_columns = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']
feature_frame = df[boxplot_columns]
feature_frame.boxplot()
plt.title("Boxplot of All Features")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Calculating Lower and Upper Limits for Determining Outliers

In [None]:
# Calculate the boundaries for each feature
def boundaries(df, feature):
    """Print and return the IQR-based outlier limits of one column.

    Values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are the ones
    treated as outliers elsewhere in this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    feature : str
        Name of the numeric column to analyse.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)`` for the column. Existing callers that
        ignore the return value are unaffected.
    """
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    print(feature)
    print(lower_limit, upper_limit)
    # Hand the limits back so they can be reused instead of only being printed
    return lower_limit, upper_limit

# Report the outlier limits of each of the 7 features
for column_name in ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']:
    boundaries(df, column_name)

# Removing Outliers

In [None]:
def remove_outliers(df, features):
    """Return the rows of ``df`` that are not IQR outliers in any given column.

    A row is kept only if, for every column in ``features``, its value lies
    within ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of that column (limits
    inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    features : iterable of str
        Names of the numeric columns to screen.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` with outlier rows removed; original index labels
        are preserved.
    """
    # Build the mask on df's own index. A mask with a fresh 0..n-1 index would
    # be misaligned whenever df's index is not a default RangeIndex (e.g. after
    # rows were filtered out), which breaks the boolean selection below.
    mask = pd.Series(True, index=df.index)

    for feature in features:
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR

        # Update the mask to exclude outliers for the current feature
        mask &= (df[feature] >= lower_limit) & (df[feature] <= upper_limit)

    # Return the DataFrame without outliers
    return df[mask]

# Columns that are screened for outliers
features = [
    'Size', 'Weight', 'Sweetness', 'Crunchiness',
    'Juiciness', 'Ripeness', 'Acidity',
]

# Keep only the rows that are not an outlier in any of those columns
cleaned_df = remove_outliers(df=df, features=features)

# Summary of the cleaned frame (columns, non-null counts, dtypes)
cleaned_df.info()


# Boxplot after Removing Outliers

In [None]:
# Box-and-whisker plot of every numeric feature of the cleaned data; points
# beyond the whiskers are the outliers that remain for that feature
cleaned_boxplot_columns = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']
cleaned_feature_frame = cleaned_df[cleaned_boxplot_columns]
cleaned_feature_frame.boxplot()
plt.title("Boxplot of All Features after Removing Outliers")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Cross-check: Marina's version of the IQR filter gives similar results

# Row count before any outliers are dropped
print(f"Original number of rows: {len(df)}")

# Accumulate a keep-mask across all features; it starts all True on df's index
features = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']
mask = pd.Series(True, index=df.index)

for feature in features:
    column = df[feature]
    Q1, Q3 = column.quantile(0.25), column.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Rows whose value for this feature falls outside the bounds
    is_outlier = (column < lower_bound) | (column > upper_bound)
    feature_outliers = df[is_outlier]
    print(f"\n{feature}:")
    print(f"Number of outliers: {len(feature_outliers)}")
    print(f"Outlier boundaries: [{lower_bound:.2f}, {upper_bound:.2f}]")

    # A row stays only while it is inside the (inclusive) bounds of every feature
    mask = mask & column.between(lower_bound, upper_bound)

# New dataframe without outliers, kept separate so the difference can be inspected
df_cleaned = df[mask]

print(f"\nNumber of rows after removing outliers: {len(df_cleaned)}")
print(f"Total rows removed: {len(df) - len(df_cleaned)}")

# Boxplots of the cleaned data
plt.figure(figsize=(12, 6))
cleaned_features = df_cleaned[features]
cleaned_features.boxplot()
plt.title("Boxplot of All Features After Removing Outliers")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Standardization after Removing Outliers

In [None]:
# X_without_outlier holds the seven numeric features of the outlier-free data
X_without_outlier = cleaned_df[['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']]

# Per-column mean and standard deviation used for the z-score
data_mean_without_outlier, data_std_without_outlier = X_without_outlier.mean(), X_without_outlier.std()

# z-score normalization: centre each column on its mean, then scale by its std
centered_without_outlier = X_without_outlier - data_mean_without_outlier
normalized_X_zscore_without_outlier = centered_without_outlier / data_std_without_outlier

# Histogram (with KDE overlay) of every standardized feature on a 3x3 grid
plt.figure(figsize=(12, 8))
for panel, column in enumerate(features_to_plot, start=1):
    plt.subplot(3, 3, panel)
    sns.histplot(normalized_X_zscore_without_outlier[column], kde=True)
    plt.title(column)

plt.tight_layout()
plt.show()


# Summary statistics of the standardized, outlier-free features
clean_mean = normalized_X_zscore_without_outlier.mean()
clean_median = normalized_X_zscore_without_outlier.median()
clean_std = normalized_X_zscore_without_outlier.std()
print("\nmean:\n{} , \nmedian:\n{} , \nstd:\n {}".format(clean_mean, clean_median, clean_std))


# EDA

## Pairplot

In [None]:
# Drop the first column from df (A_id, per the original note) because it is
# not relevant for the pairplot
df_eda = df.iloc[:, 1:]

# Preview the new dataframe. A bare `df_eda.head()` is only rendered when it is
# the last expression of a cell, so display() is called explicitly to make the
# preview actually appear from the middle of this cell.
display(df_eda.head())


# Hue is set to Quality, so the points in the scatterplots and the histograms
# are color-coded by quality: good quality = blue, bad quality = orange
custom_colors = {
    'good': 'blue',
    'bad': 'orange'
}

sns.pairplot(data=df_eda, hue='Quality', palette=custom_colors)

plt.show()

## Correlation Matrix

In [3]:
# Keep only the int64 / float64 columns: the correlation matrix needs numeric input
numeric_dtypes = ['int64', 'float64']
corr_features = df_eda.select_dtypes(include=numeric_dtypes)

# Pearson correlation is used because the data is continuous and appears
# normally distributed
corr_matrix = corr_features.corr(method='pearson')

print("Correlation Matrix\n", corr_matrix)

# Annotated square heatmap of the correlation matrix
heatmap_options = dict(cmap='coolwarm', square=True, annot=True)
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix Heatmap')
plt.show()

NameError: name 'df_eda' is not defined

## Skewness Analysis using Histograms

In [None]:

# Skewness of each feature, as a measure of how asymmetric its distribution is
skew_input = df.loc[:, features_to_plot]
skewness = skew_input.skew()
print("Skewness for each feature:\n", skewness)


In [None]:
# One histogram (with KDE overlay) per feature on a 3x3 grid, each titled with
# the feature's skewness so shape and statistic can be read together
plt.figure(figsize=(12, 8))
for panel, feature in enumerate(features_to_plot, start=1):
    plt.subplot(3, 3, panel)
    values = df[feature]
    sns.histplot(values, kde=True)
    plt.title(f"{feature} (Skewness: {skewness[feature]:.2f})")

plt.tight_layout()
plt.show()


## Variable Type Counts

In [None]:

# Number of distinct values per column; a low count hints at a categorical variable
unique_counts = df.nunique()
print("\nUnique counts for each feature (number of distinct values):\n", unique_counts)

# Columns with object dtype are treated as categorical (there may be none)
object_columns = df.select_dtypes(include=['object'])
categorical_features = object_columns.columns

# One count plot per categorical column, showing how rows spread across categories
for feature in categorical_features:
    plt.figure(figsize=(6, 4))
    sns.countplot(x=feature, data=df)
    plt.title(f"Count Plot for {feature}")
    plt.xticks(rotation=45)
    plt.show()

## Q-Q Plot

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt

# One Q-Q plot per feature, comparing its values against a normal distribution
for feature in features:
    sample = df[feature]
    plt.figure()
    stats.probplot(sample, dist="norm", plot=plt)
    plt.title(f'Q-Q plot for {feature}')
    plt.show()