In [1]:
import os
import pandas as pd
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

In [2]:
# Start (or reuse) a Spark session named "CoffeeData" on the local master,
# using all available cores ("local[*]").
spark = (
    SparkSession.builder
    .appName("CoffeeData")
    .master("local[*]")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [3]:
# Location of the cleaned coffee dataset. The original absolute path is kept as
# the default so existing runs behave the same; set COFFEE_DATA_PATH to point
# at the file on another machine.
data_path = os.environ.get(
    "COFFEE_DATA_PATH",
    r"C:\Users\HP\Desktop\Coffee\data\cleaned\coffee-cleaned.csv",
)

# The file is semicolon-delimited, UTF-8 encoded, with column names on row 0.
df = pd.read_csv(
    data_path,
    sep=';',
    encoding='utf-8',
    header=0,
)
print(df.shape)
print(type(df))
# Must be the last expression of the cell: a mid-cell `df.head()` is evaluated
# and discarded, so the preview never rendered.
df.head()

(2188, 8)
<class 'pandas.core.frame.DataFrame'>


In [5]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer

In [6]:
# Columns selected from the loaded frame to build the imputation input.
features = ["price_per_100g", "total_score", "agtron_roast", "agtron_ground"]

# Keep every row, but only the columns named in `features`.
df_subset = df.loc[:, features]

In [7]:
# Columns routed through the "num" branch of the ColumnTransformer.
num_cols = [
    "price_per_100g",
    "total_score",
    "agtron_roast",
    "agtron_ground",
]

In [9]:
# Single "num" branch that forwards the numeric columns untouched
# ("passthrough"); no scaling or encoding is applied before imputation.
numeric_passthrough = ("num", "passthrough", num_cols)
preprocess = ColumnTransformer(transformers=[numeric_passthrough])

In [10]:
# Two stages: column selection via `preprocess`, then KNN imputation that
# fills missing values from the 3 nearest neighbours.
pipeline_steps = [
    ("preprocess", preprocess),
    ("imputer", KNNImputer(n_neighbors=3)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [11]:
# Fit the preprocess + KNN-imputer pipeline on the selected columns and get the
# transformed values back. The result is wrapped in a DataFrame in the next
# cell, so it is presumably a plain array without column labels.
imputed_array = pipeline.fit_transform(df_subset)

In [12]:
# Wrap the pipeline output in a labelled frame.
# - columns: the names reported by the fitted "preprocess" step.
# - index: reuse the input's index. The imputed price is later written back to
#   `df` by index alignment, so a default 0..n-1 index would silently misalign
#   (or produce NaN) as soon as `df` is filtered, sorted or re-indexed.
imputed_df = pd.DataFrame(
    imputed_array,
    columns=pipeline.named_steps["preprocess"].get_feature_names_out(),
    index=df_subset.index,
)

In [13]:
# The imputed frame's column names come from the ColumnTransformer and may
# carry a transformer prefix, so the price column is found by pattern rather
# than exact name; the first match is written back into `df`.
price_matches = imputed_df.filter(regex="price_per_100g")
df["price_per_100g"] = price_matches.iloc[:, 0]

In [14]:
# Sanity check: number of prices still missing after the write-back.
remaining_missing = df["price_per_100g"].isna().sum()
print(remaining_missing)

0


In [15]:
# Preview the last 20 rows to inspect the filled-in prices before rounding.
df.iloc[-20:]

Unnamed: 0,coffee_name,total_score,roast_level,agtron_ground,agtron_roast,price_per_100g,roaster_country,origin_country
2168,Nicaragua El Progresso COE Lot #1,95,Medium Light,55,71,11.126667,USA,Nicaragua
2169,Kenya Mamuto Kirinyaga,96,Medium,52,65,7.39,USA,Kenya
2170,Sumatra Lake Tawar,94,Medium Dark,46,54,134.6,USA,Indonesia
2171,Sumatra Lake Tawar,94,Medium Dark,42,51,76.726667,USA,Indonesia
2172,Belle Espresso,94,Medium Dark,43,49,4.876667,USA,Ethiopia
2173,Colombia Pitalito Estate,94,Medium,56,63,7.52,USA,Colombia
2174,"Hamma Cooperative Yirgacheffe, Fair-Trade Organic",95,Medium Light,54,73,51.333333,USA,Ethiopia
2175,Sumatra Golden Mandheling,94,Medium,51,68,122.503333,USA,Indonesia
2176,Guatemala Antigua – Finca La Tacita,94,Medium,51,64,75.603333,USA,Guatemala
2177,Kenya AA Nyeri Fine Cup,94,Dark,35,43,205.91,USA,Kenya


In [16]:
# Round prices to two decimal places (imputed values are neighbour averages
# and can carry long fractional parts).
df["price_per_100g"] = df["price_per_100g"].round(decimals=2)

In [17]:
# Preview the last 20 rows again to confirm the rounding took effect.
df.iloc[-20:]

Unnamed: 0,coffee_name,total_score,roast_level,agtron_ground,agtron_roast,price_per_100g,roaster_country,origin_country
2168,Nicaragua El Progresso COE Lot #1,95,Medium Light,55,71,11.13,USA,Nicaragua
2169,Kenya Mamuto Kirinyaga,96,Medium,52,65,7.39,USA,Kenya
2170,Sumatra Lake Tawar,94,Medium Dark,46,54,134.6,USA,Indonesia
2171,Sumatra Lake Tawar,94,Medium Dark,42,51,76.73,USA,Indonesia
2172,Belle Espresso,94,Medium Dark,43,49,4.88,USA,Ethiopia
2173,Colombia Pitalito Estate,94,Medium,56,63,7.52,USA,Colombia
2174,"Hamma Cooperative Yirgacheffe, Fair-Trade Organic",95,Medium Light,54,73,51.33,USA,Ethiopia
2175,Sumatra Golden Mandheling,94,Medium,51,68,122.5,USA,Indonesia
2176,Guatemala Antigua – Finca La Tacita,94,Medium,51,64,75.6,USA,Guatemala
2177,Kenya AA Nyeri Fine Cup,94,Dark,35,43,205.91,USA,Kenya


In [19]:
# Write the completed frame to the working directory, omitting the row index.
# NOTE(review): the input was read with sep=';' but this writes with the
# default comma separator; confirm downstream readers expect that.
output_path = "filled_dataset.csv"
df.to_csv(output_path, index=False)