In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import pyspark

In [2]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('AML')
    .getOrCreate()
)

24/03/26 21:13:50 WARN Utils: Your hostname, flame-ASUS-TUF-Gaming-A15-FA506IC-FA506IC resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp3s0)
24/03/26 21:13:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/26 21:13:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/26 21:13:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Display the session object to confirm it was created.
spark

In [5]:
# Sanity-check that Spark can run a small distributed job.
# NOTE: the value computed here is the number of partitions of a throwaway RDD
# that is explicitly split into 10 slices, so it mirrors the requested slice
# count; it is not a measurement of how many clusters or executors exist.
sc = spark.sparkContext
num_clusters = len(
    sc.parallelize(range(0, 10), 10)
    .glom()      # one list per partition
    .collect()   # bring the per-partition lists back to the driver
)

# Report the count computed above
print("Number of clusters:", num_clusters)

[Stage 0:>                                                        (0 + 10) / 10]

Number of clusters: 10


                                                                                

# Read the Files

In [6]:
# Load the raw training features and labels. Schema inference is skipped for
# now to keep the load fast, so columns are not type-inferred at this stage.
df_train_data = spark.read.csv(
    'train_data.csv',
    header=True,
)
df_train_labels = spark.read.csv(
    'train_labels.csv',
    header=True,
)

In [8]:
# helper: report the dimensions of a dataset
def get_shape(df):
    """Return the ``(row count, column count)`` pair for ``df``.

    ``df`` must expose a ``count()`` method and a ``columns`` sequence.
    In this notebook it is called with Spark DataFrames, where ``count()``
    is an action that has to scan the data.
    """
    return df.count(), len(df.columns)

In [9]:
# Shape of the raw training features (the row count requires a pass over the CSV).
get_shape(df_train_data)

                                                                                

(5531451, 190)

In [10]:
# Shape of the raw training labels.
get_shape(df_train_labels)

(458913, 2)

In [15]:
# List the column names of the training features.
print(df_train_data.columns)

['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5', 'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_63', 'D_64', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19', 'D_66', 'B_20', 'D_68', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4', 'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_29', 'B_30', 'S_18', 'D_86', 'D_87', 'R_17', 'R_18', 'D_88', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D_94', 'R_2

In [16]:
# List the column names of the training labels.
print(df_train_labels.columns)

['customer_ID', 'target']


# Sample 20% of the Train Labels and Merge with the Train Data to Build the Dev Sample

In [13]:
# Randomly keep roughly 20% of the label rows. `fraction` is a per-row sampling
# probability, so the sampled size is approximate rather than exactly 20%.
# The fixed seed is there so the same draw can be repeated.
df_labels_sample = df_train_labels.sample(fraction=0.2, seed=10)

In [14]:
# check shape to make sure we have roughly 20% of the original labels
# (sampling is probabilistic, so expect a count near, not exactly at, 20%)
get_shape(df_labels_sample)

(91633, 2)

In [17]:
# now, we merge this with the training data
# The inner join on customer_ID keeps only the feature rows that belong to the
# sampled customers. The training data appears to hold several rows per
# customer_ID (far more feature rows than label rows), so the joined frame has
# more rows than the label sample.
df = df_labels_sample.join(df_train_data, on='customer_ID', how='inner')

In [18]:
# check the shape of the dev sample
# (columns = the training-data columns plus `target` from the labels)
get_shape(df)

                                                                                

(1103628, 191)

In [19]:
# Confirm the joined frame carries customer_ID and target ahead of the feature columns.
print(df.columns)

['customer_ID', 'target', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5', 'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_63', 'D_64', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19', 'D_66', 'B_20', 'D_68', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4', 'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_29', 'B_30', 'S_18', 'D_86', 'D_87', 'R_17', 'R_18', 'D_88', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D

In [20]:
# sort the dataframe by customer_ID
# (orderBy only defines the sort; Spark performs it when an action such as the
# export below runs)
df = df.orderBy('customer_ID')

# Export the DataFrame

In [22]:
# Raise the limit on how many fields Spark includes when it renders a plan as a
# string; the dev frame has 191 columns.
# NOTE(review): presumably added to silence the plan-truncation warning during
# the write below — confirm.
spark.conf.set("spark.sql.debug.maxToStringFields", 200)

In [23]:
# Spark writes `output_file` as a directory that contains the part file(s), not
# as a single flat file; coalesce(1) funnels everything into one partition so
# the directory ends up with a single CSV part file.
output_file = "dev.csv"

# mode("overwrite") lets this cell be re-run (e.g. Restart & Run All): the
# default save mode raises an error when the output path already exists.
df.coalesce(1).write.mode("overwrite").csv(output_file, header=True)

                                                                                