**Step 1: Import libraries and load the dataset for preprocessing and analysis.**

In [158]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Load the CSV file into a pandas DataFrame
file_path = "D:\\archive\\df_arabica_clean.csv"
df = pd.read_csv(file_path)

**Step 2: One-hot encoding categorical features and updating the dataset**

In [161]:
data = pd.read_csv(file_path)

categorical_features = ["Country of Origin", "Region", "Variety", "Processing Method","Color","Altitude"]
categorical_features = [col for col in categorical_features if col in data.columns]


onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_encoded = onehot_encoder.fit_transform(data[categorical_features]) # applying one-hot encoding to the categorical features

onehot_columns = onehot_encoder.get_feature_names_out(categorical_features)
onehot_df = pd.DataFrame(onehot_encoded.toarray(), columns=onehot_columns)   # creates a dataframe with the encoded columns


data_encoded = pd.concat([data, onehot_df], axis=1)   # tries to concatenate the encoded columns with the original dataframe



print(data_encoded.head())
data_encoded.to_csv("coffee_quality_encoded.csv", index=False) 

   Unnamed: 0  ID Country of Origin                 Farm Name  \
0           0   0          Colombia          Finca El Paraiso   
1           1   1            Taiwan  Royal Bean Geisha Estate   
2           2   2              Laos        OKLAO coffee farms   
3           3   3        Costa Rica                 La Cumbre   
4           4   4          Colombia           Finca Santuario   

                                Lot Number                           Mill  \
0                               CQU2022015               Finca El Paraiso   
1  The 2022 Pacific Rim Coffee Summit,T037       Royal Bean Geisha Estate   
2  The 2022 Pacific Rim Coffee Summit,LA01  oklao coffee processing plant   
3                               CQU2022017        La Montana Tarrazu MIll   
4                               CQU2023002                Finca Santuario   

  ICO Number                   Company   Altitude                Region  ...  \
0        NaN      Coffee Quality Union  1700-1930        Piendamo,

**Step 3: Applying Standard Scaling to Numeric Features**

In [164]:

data_encoded = pd.read_csv("coffee_quality_encoded.csv")
numeric_features = ["Aroma", "Flavor", "Aftertaste", "Acidity", "Body", "Balance", "Sweetness", "Moisture Percentage","Total Cup Points","Overall"]


scaler = StandardScaler()
data_encoded[numeric_features] = scaler.fit_transform(data_encoded[numeric_features])   # applying scaling to the numeric features


print(data_encoded.head())
data_encoded.to_csv("coffee_quality_scaled.csv", index=False)

   Unnamed: 0  ID Country of Origin                 Farm Name  \
0           0   0          Colombia          Finca El Paraiso   
1           1   1            Taiwan  Royal Bean Geisha Estate   
2           2   2              Laos        OKLAO coffee farms   
3           3   3        Costa Rica                 La Cumbre   
4           4   4          Colombia           Finca Santuario   

                                Lot Number                           Mill  \
0                               CQU2022015               Finca El Paraiso   
1  The 2022 Pacific Rim Coffee Summit,T037       Royal Bean Geisha Estate   
2  The 2022 Pacific Rim Coffee Summit,LA01  oklao coffee processing plant   
3                               CQU2022017        La Montana Tarrazu MIll   
4                               CQU2023002                Finca Santuario   

  ICO Number                   Company   Altitude                Region  ...  \
0        NaN      Coffee Quality Union  1700-1930        Piendamo,

**Step 4: Computing Mutual Information Scores**

In [167]:
df = pd.read_csv(file_path)

selected_features = [
    'Unnamed: 0', 'ID', 'Number of Bags', 'Aroma', 'Flavor', 'Aftertaste',
    'Acidity', 'Body', 'Balance', 'Uniformity', 'Clean Cup', 'Sweetness',
    'Overall', 'Defects', 'Total Cup Points', 'Moisture Percentage',
    'Category One Defects', 'Quakers', 'Category Two Defects'
]
df_selected = df[selected_features]
# Compute Mutual Information Scores for All Features
from sklearn.feature_selection import mutual_info_regression

# Compute mutual information scores for all features with the target variable 'Total Cup Points'
mi_scores = mutual_info_regression(df_selected.drop(columns=['Total Cup Points']), df_selected['Total Cup Points'])

mi_scores_df = pd.DataFrame({
    'Feature': df_selected.drop(columns=['Total Cup Points']).columns,
    'MI Score': mi_scores
}).sort_values(by='MI Score', ascending=False)

# Display the top features based on mutual information scores
mi_scores_df.head(10)

Unnamed: 0,Feature,MI Score
1,ID,3.238321
0,Unnamed: 0,3.234456
12,Overall,1.41648
8,Balance,1.13394
5,Aftertaste,1.114341
4,Flavor,1.037029
6,Acidity,0.823133
3,Aroma,0.78572
7,Body,0.631359
14,Moisture Percentage,0.132724


**Step 5: Feature Selection Based on Mutual Information Score & Mapping Continents, Categorizing Processing Methods and Colors, Classifying Altitudes**

In [172]:
df = pd.read_csv("coffee_quality_scaled.csv")

# Define the columns to keep
columns_to_keep = [
    'ID', 'Country of Origin', 'Processing Method', 
    'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 
    'Overall', 'Color', 'Altitude', 'Total Cup Points'
]

# Filter the DataFrame to keep only the specified columns
df_filtered = df[columns_to_keep].copy()

# Map countries to their respective continents
continent_mapping = {
    'Colombia': 'America', 'Taiwan': 'Asia', 'Laos': 'Asia', 'Costa Rica': 'America', 
    'Guatemala': 'America', 'Tanzania, United Republic Of': 'Africa', 'Ethiopia': 'Africa', 
    'Thailand': 'Asia', 'Brazil': 'America', 'United States (Hawaii)': 'America', 'Kenya': 'Africa', 
    'Uganda': 'Africa', 'Indonesia': 'Asia', 'Peru': 'America', 'Panama': 'America', 
    'Nicaragua': 'America', 'Vietnam': 'Asia', 'Honduras': 'America', 'El Salvador': 'America', 
    'Madagascar': 'Africa', 'Mexico': 'America', 'Myanmar': 'Asia'
}

# Map the 'Continent of Origin' column based on 'Country of Origin'
df_filtered['Continent of Origin'] = df_filtered['Country of Origin'].map(continent_mapping).fillna('Unknown')

# Define a function to categorize processing methods
def categorize_processing_method(method):
    method = str(method).strip().lower()
    if method in ['washed / wet', 'natural / dry', 'pulped natural / honey']:
        return method.title()
    else:
        return 'Other'

df_filtered['Processing Method'] = df_filtered['Processing Method'].apply(categorize_processing_method)

# Define a function to categorize colors into four groups
def final_categorize_color(color):
    if pd.isna(color):
        return 'Multicolored'
    color_lower = color.lower()
    if 'greenish' in color_lower and 'yellow' not in color_lower and 'brown' not in color_lower:
        return 'Green'
    elif 'yellowish' in color_lower and 'green' not in color_lower and 'brown' not in color_lower:
        return 'Yellow'
    elif 'brownish' in color_lower and 'green' not in color_lower and 'yellow' not in color_lower:
        return 'Brown'
    else:
        # All other cases, including combinations, will be categorized as 'Multicolored'
        return 'Multicolored'
df_filtered['Color'] = df_filtered['Color'].apply(final_categorize_color)

# Define a function to classify altitudes into specific ranges
def classify_altitude(altitude):
    try:
        if '-' in str(altitude):
            # Take the average of the range
            alt_values = [float(x) for x in altitude.split('-')]
            avg_altitude = np.mean(alt_values)
        else:
            avg_altitude = float(altitude)
        
        # Classify based on the given ranges
        if avg_altitude < 800:
            return 'Low Altitude (<800m)'
        elif 800 <= avg_altitude < 1200:
            return 'Medium-Low Altitude (800-1200m)'
        elif 1200 <= avg_altitude < 1600:
            return 'Medium Altitude (1200-1600m)'
        elif 1600 <= avg_altitude <= 2000:
            return 'High Altitude (1600-2000m)'
        else:
            return 'Very High Altitude (>2000m)'
    except (ValueError, TypeError):
        return 'Unknown'
df_filtered['Altitude'] = df_filtered['Altitude'].apply(classify_altitude)

# Export the filtered DataFrame to a CSV file
df_filtered.to_csv("preprocessed.csv", index=False)

print("Sucessfully preprocessed Data.")


Sucessfully preprocessed Data.
