<a href="https://colab.research.google.com/github/adhithyyaa/23BCS050_DATA_PROCESSING_CHALLENGE/blob/main/23BCS050_DATA_PROCESSING_CHALLENGE_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark kagglehub



In [None]:
import kagglehub
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, mean, when


# Fetch the Mall Customers dataset via kagglehub; `path` is the local directory
# that holds the downloaded files and is reused by the loading cell below.
path = kagglehub.dataset_download(
    "shwetabh123/mall-customers"
)
print("Path to dataset files:", path)


Using Colab cache for faster access to the 'mall-customers' dataset.
Path to dataset files: /kaggle/input/mall-customers


In [None]:
# Create the Spark session used by every later cell, or reuse the one that
# already exists in this kernel.
spark = (
    SparkSession.builder
    .appName("DataPreprocessingChallenge")
    .getOrCreate()
)

print("✅ Spark session created successfully!")


✅ Spark session created successfully!


In [None]:
# Location of the CSV inside the downloaded dataset directory.
file_path = f"{path}/Mall_Customers.csv"

# Read the CSV with a header row and let Spark infer the column types.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

print("✅ Dataset loaded successfully.")
# Quick sanity check: inferred schema plus the first few rows.
data.printSchema()
data.show(5)

✅ Dataset loaded successfully.
root
 |-- CustomerID: integer (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Annual Income (k$): integer (nullable = true)
 |-- Spending Score (1-100): integer (nullable = true)

+----------+------+---+------------------+----------------------+
|CustomerID| Genre|Age|Annual Income (k$)|Spending Score (1-100)|
+----------+------+---+------------------+----------------------+
|         1|  Male| 19|                15|                    39|
|         2|  Male| 21|                15|                    81|
|         3|Female| 20|                16|                     6|
|         4|Female| 23|                16|                    77|
|         5|Female| 31|                17|                    40|
+----------+------+---+------------------+----------------------+
only showing top 5 rows



In [None]:

# --- Report missing values -------------------------------------------------
# NaN can only occur in floating-point columns, so isnan() is applied to those
# alone; every other column is checked for NULL only. This avoids relying on
# implicit string/int -> double casts inside isnan().
float_cols = {c for c, t in data.dtypes if t in ('float', 'double')}

missing_counts = data.select([
    count(
        when(
            (col(c).isNull() | isnan(col(c))) if c in float_cols else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in data.columns
])
print("🔍 Missing value count per column:")
missing_counts.show()

# --- Numeric columns: fill NULL/NaN with the column mean --------------------
# All integral and floating-point Spark dtypes are treated as numeric
# (previously only 'int' and 'double' were, so e.g. 'bigint' was skipped).
NUMERIC_TYPES = ('tinyint', 'smallint', 'int', 'bigint', 'float', 'double')
numeric_cols = [c for c, t in data.dtypes if t in NUMERIC_TYPES]

if numeric_cols:
    # One aggregation job for all means instead of one job per column.
    means_row = data.select([mean(col(c)).alias(c) for c in numeric_cols]).first()
    # A mean of None means the column has no non-null values; there is nothing
    # sensible to fill with, so such columns are left untouched.
    fill_values = {
        c: mean_value
        for c, mean_value in zip(numeric_cols, means_row)
        if mean_value is not None
    }
    if fill_values:
        data = data.fillna(fill_values)

# --- Categorical columns: fill NULL with the 'Unknown' placeholder ----------
categorical_cols = [c for c, t in data.dtypes if t == 'string']
if categorical_cols:
    data = data.fillna('Unknown', subset=categorical_cols)

print("✅ Missing values handled.")

🔍 Missing value count per column:
+----------+-----+---+------------------+----------------------+
|CustomerID|Genre|Age|Annual Income (k$)|Spending Score (1-100)|
+----------+-----+---+------------------+----------------------+
|         0|    0|  0|                 0|                     0|
+----------+-----+---+------------------+----------------------+

✅ Missing values handled.


In [None]:
# Cast every column listed in `numeric_cols` to double in a single projection,
# keeping the original column order and leaving all other columns untouched.
data = data.select([
    col(name).cast("double").alias(name) if name in numeric_cols else col(name)
    for name in data.columns
])

data.printSchema()
print("✅ Data types standardized.")


root
 |-- CustomerID: double (nullable = true)
 |-- Genre: string (nullable = false)
 |-- Age: double (nullable = true)
 |-- Annual Income (k$): double (nullable = true)
 |-- Spending Score (1-100): double (nullable = true)

✅ Data types standardized.


In [None]:
# Row count before de-duplication.
before = data.count()

# Drop rows that are identical across all columns, then recount.
data = data.dropDuplicates()
after = data.count()

removed = before - after
print(f"🧹 Removed {removed} duplicate rows.")


🧹 Removed 0 duplicate rows.


In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler


# --- Encode string columns ---------------------------------------------------
string_cols = [f.name for f in data.schema.fields if f.dataType.simpleString() == 'string']
print("🧩 String columns:", string_cols)

# The loop variable is deliberately NOT named `col`: that would rebind the
# notebook-global `col` (pyspark.sql.functions.col) to a plain string and break
# any cell that calls col(...) afterwards.
for string_col in string_cols:
    indexed_col = f"{string_col}_indexed"
    # Skip columns that were already indexed so this cell can be re-run safely
    # (StringIndexer fails if its output column already exists).
    if indexed_col in data.columns:
        continue
    indexer = StringIndexer(inputCol=string_col, outputCol=indexed_col)
    data = indexer.fit(data).transform(data)

# --- Select feature columns --------------------------------------------------
# Every non-string column becomes a feature, except the names listed below.
# For the schema printed earlier in this notebook neither 'Time' nor 'Class'
# exists, so the exclusion is a no-op here.
# NOTE(review): 'CustomerID' is an identifier but is still assembled and scaled
# as a feature — confirm whether it should be added to the exclusion list.
EXCLUDED_FEATURE_COLS = ['Time', 'Class']
numeric_cols = [
    f.name
    for f in data.schema.fields
    if f.dataType.simpleString() != 'string' and f.name not in EXCLUDED_FEATURE_COLS
]

print("🔢 Numeric columns for features:", numeric_cols)

# --- Assemble and standardize ------------------------------------------------
# Pack the feature columns into a single vector column.
assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features")
assembled = assembler.transform(data)

# Standardize each feature (centering with withMean, unit std with withStd).
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
scaler_model = scaler.fit(assembled)
scaled_data = scaler_model.transform(assembled)

print("✅ Data normalization completed successfully.")
scaled_data.select("scaled_features").show(5, truncate=False)


🧩 String columns: ['Genre']
🔢 Numeric columns for features: ['CustomerID', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Genre_indexed']
✅ Data normalization completed successfully.
+-------------------------------------------------------------------------------------------------------+
|scaled_features                                                                                        |
+-------------------------------------------------------------------------------------------------------+
|[-0.12958026383720403,0.6550214902764239,-0.021321376171336143,-0.04646926222575693,1.1253282349446478]|
|[-1.1662223745348363,1.012956730864634,-1.049316298717896,-1.7890665956916274,1.1253282349446478]      |
|[-1.0971129004883273,0.22549920157057177,-1.011242412697653,-1.2856495882459316,-0.8841864703136518]   |
|[-0.6306239506743929,1.084543778982276,-0.5162818944344946,0.34077458965554763,-0.8841864703136518]    |
|[-0.8033976357906649,1.4424790195704862,-0.6685774385154664,0.379

In [None]:

# Vector columns cannot be written by the CSV writer, so drop the assembled and
# scaled feature vectors before saving.
vector_cols = ["features", "scaled_features"]
clean_data = scaled_data.drop(*vector_cols)

# NOTE(review): the file name mentions "creditcard" although this notebook
# processes the mall-customers dataset — looks like a leftover; confirm before renaming.
output_path = "/content/cleaned_creditcard_data.csv"

# Write with a header row, replacing any previous output at the same path.
(
    clean_data.write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)

print(f"✅ Cleaned dataset saved successfully at: {output_path}")


✅ Cleaned dataset saved successfully at: /content/cleaned_creditcard_data.csv


In [None]:
# Human-readable recap of the preprocessing steps performed by the cells above.
# Each bullet corresponds to a step this notebook actually runs; the previous
# "Engineered features: Transaction_Hour, Amount_Category" line was removed
# because no cell creates those columns.
summary_lines = [
    "📊 Data Preprocessing Summary",
    "- Missing values handled (numeric: mean, categorical: 'Unknown')",
    "- Data types standardized to double",
    "- Duplicates removed",
    "- Categorical columns encoded with StringIndexer (<column>_indexed)",
    "- Features normalized using StandardScaler",
    "✅ Dataset ready for downstream analytics or ML tasks.",
]
for line in summary_lines:
    print(line)


📊 Data Preprocessing Summary
- Missing values handled (numeric: mean, categorical: 'Unknown')
- Data types standardized to double
- Duplicates removed
- Features normalized using StandardScaler
- Engineered features: Transaction_Hour, Amount_Category
✅ Dataset ready for downstream analytics or ML tasks.
