<a href="https://colab.research.google.com/github/alexandergribenchenko/Data_Science_Self_Study/blob/main/01_Prog_Ori_Obj_Pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformador personalizado multicolumna para Pyspark

# A. Instalación y carga de Pyspark

In [1]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get the active Spark session, or start one named 'test_pyspark' if none exists.
spark = (
    SparkSession.builder
    .appName('test_pyspark')
    .getOrCreate()
)

# B. Librerías de trabajo

In [4]:
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasInputCols, HasOutputCols, Param, Params, TypeConverters
from pyspark import keyword_only
from pyspark.ml import Pipeline, PipelineModel

import pyspark.sql.functions as F
from pyspark.sql.types import StringType,BooleanType,DateType, FloatType

# C. Montaje de Google Drive

In [5]:
# Colab-only step: make Google Drive available under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 01. Carga del dataset raw

In [6]:
# Drive folder holding the input data. The trailing slash matters: the file
# name is appended by plain string concatenation when the CSV is loaded.
location_input = '/content/drive/MyDrive/01_Code/16_Pipelines/'
# Name of the CSV file to read from that folder.
input_name = 'titanic_train.csv'

In [7]:
# Load the CSV without a schema (every column is read as a string), drop
# malformed rows, and keep at most 10 rows for the demo.
df_raw = (
    spark.read
    .format('csv')
    .options(header='true', sep=',', mode='DROPMALFORMED')
    .load(location_input + input_name)
    .limit(10)
)

In [8]:
df_raw.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|  38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|  26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|  35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|  35|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [9]:
df_raw.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



# 02. Transformadores

## 02.01. FeatureSelector

In [10]:
class FeatureSelector(Transformer, HasInputCols):
  """Pipeline stage that keeps only the columns named in ``inputCols``.

  All other columns of the incoming DataFrame are dropped; the kept columns
  appear in the order given by ``inputCols``.

  NOTE(review): this class does not mix in DefaultParamsReadable /
  DefaultParamsWritable, so a Pipeline containing it presumably cannot be
  saved with ``save()`` -- confirm before persisting pipelines.
  """

  @keyword_only
  def __init__(self, inputCols=None):
    super().__init__()
    # @keyword_only exposes the keyword arguments of this call as
    # self._input_kwargs; forward them unchanged to setParams.
    kwargs = self._input_kwargs
    self.setParams(**kwargs)

  @keyword_only
  def setParams(self, inputCols=None):
    """Set the params of this stage from keyword arguments."""
    kwargs = self._input_kwargs
    return self._set(**kwargs)

  def setInputCols(self, value):
    """Set ``inputCols``, the list of column names to keep."""
    return self.setParams(inputCols=value)

  def setInputCol(self, new_inputCols):
    """Backward-compatible alias of ``setInputCols`` (historical misnomer)."""
    return self.setInputCols(new_inputCols)

  def _transform(self, dataset):
    """Return ``dataset`` restricted to the configured columns.

    Raises:
      ValueError: if ``inputCols`` was never set or was set to ``None``.
    """
    # Fail with an explicit message instead of an opaque lookup/unpacking
    # error when the stage has not been configured.
    selected = self.getInputCols() if self.isDefined(self.inputCols) else None
    if selected is None:
      raise ValueError(
          "FeatureSelector requires 'inputCols' to be set to a list of column names.")
    return dataset.select(*selected)

In [11]:
columns_selected = ['PassengerId', 'Pclass', 'Sex', 'Age', 'Fare','Survived']

In [12]:
Transformer_FeatureSelector = FeatureSelector(inputCols=columns_selected)

In [13]:
df_transformed_01 = Transformer_FeatureSelector.transform(df_raw)
df_transformed_01.show()

+-----------+------+------+----+-------+--------+
|PassengerId|Pclass|   Sex| Age|   Fare|Survived|
+-----------+------+------+----+-------+--------+
|          1|     3|  male|  22|   7.25|       0|
|          2|     1|female|  38|71.2833|       1|
|          3|     3|female|  26|  7.925|       1|
|          4|     1|female|  35|   53.1|       1|
|          5|     3|  male|  35|   8.05|       0|
|          6|     3|  male|null| 8.4583|       0|
|          7|     1|  male|  54|51.8625|       0|
|          8|     3|  male|   2| 21.075|       0|
|          9|     3|female|  27|11.1333|       1|
|         10|     2|female|  14|30.0708|       1|
+-----------+------+------+----+-------+--------+



In [14]:
df_raw.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|  38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|  26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|  35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|  35|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

## 02.02. TypeAssignatorFloat

In [15]:
class TypeAssignatorFloat(Transformer, HasInputCols):
  """Pipeline stage that casts every column in ``inputCols`` to ``FloatType``.

  Columns not listed in ``inputCols`` are passed through untouched.

  NOTE(review): this class does not mix in DefaultParamsReadable /
  DefaultParamsWritable, so a Pipeline containing it presumably cannot be
  saved with ``save()`` -- confirm before persisting pipelines.
  """

  @keyword_only
  def __init__(self, inputCols=None):
    super().__init__()
    # @keyword_only exposes the keyword arguments of this call as
    # self._input_kwargs; forward them unchanged to setParams.
    kwargs = self._input_kwargs
    self.setParams(**kwargs)

  @keyword_only
  def setParams(self, inputCols=None):
    """Set the params of this stage from keyword arguments."""
    kwargs = self._input_kwargs
    return self._set(**kwargs)

  def setInputCols(self, value):
    """Set ``inputCols``, the list of column names to cast."""
    return self.setParams(inputCols=value)

  def setInputCol(self, new_inputCols):
    """Backward-compatible alias of ``setInputCols`` (historical misnomer)."""
    return self.setInputCols(new_inputCols)

  def _transform(self, dataset):
    """Return ``dataset`` with each configured column cast to float.

    Raises:
      ValueError: if ``inputCols`` was never set or was set to ``None``.
    """
    # Fail with an explicit message instead of an opaque lookup/iteration
    # error when the stage has not been configured.
    targets = self.getInputCols() if self.isDefined(self.inputCols) else None
    if targets is None:
      raise ValueError(
          "TypeAssignatorFloat requires 'inputCols' to be set to a list of column names.")
    result = dataset
    for name in targets:
      # Reusing the existing name overwrites the column of the same name.
      result = result.withColumn(name, F.col(name).cast(FloatType()))
    return result

In [16]:
df_transformed_01.show()

+-----------+------+------+----+-------+--------+
|PassengerId|Pclass|   Sex| Age|   Fare|Survived|
+-----------+------+------+----+-------+--------+
|          1|     3|  male|  22|   7.25|       0|
|          2|     1|female|  38|71.2833|       1|
|          3|     3|female|  26|  7.925|       1|
|          4|     1|female|  35|   53.1|       1|
|          5|     3|  male|  35|   8.05|       0|
|          6|     3|  male|null| 8.4583|       0|
|          7|     1|  male|  54|51.8625|       0|
|          8|     3|  male|   2| 21.075|       0|
|          9|     3|female|  27|11.1333|       1|
|         10|     2|female|  14|30.0708|       1|
+-----------+------+------+----+-------+--------+



In [17]:
df_transformed_01.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Survived: string (nullable = true)



In [18]:
Transformer_TypeAssignatorFloat = TypeAssignatorFloat(inputCols=['Age','Fare'])

In [19]:
df_transformed_02 = Transformer_TypeAssignatorFloat.transform(df_transformed_01)
df_transformed_02.show()

+-----------+------+------+----+-------+--------+
|PassengerId|Pclass|   Sex| Age|   Fare|Survived|
+-----------+------+------+----+-------+--------+
|          1|     3|  male|22.0|   7.25|       0|
|          2|     1|female|38.0|71.2833|       1|
|          3|     3|female|26.0|  7.925|       1|
|          4|     1|female|35.0|   53.1|       1|
|          5|     3|  male|35.0|   8.05|       0|
|          6|     3|  male|null| 8.4583|       0|
|          7|     1|  male|54.0|51.8625|       0|
|          8|     3|  male| 2.0| 21.075|       0|
|          9|     3|female|27.0|11.1333|       1|
|         10|     2|female|14.0|30.0708|       1|
+-----------+------+------+----+-------+--------+



In [20]:
df_transformed_02.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: float (nullable = true)
 |-- Fare: float (nullable = true)
 |-- Survived: string (nullable = true)



## 02.03. LogScaler

In [21]:
class LogScaler(Transformer, HasInputCols):
  """Pipeline stage that replaces every column in ``inputCols`` by its natural log.

  Columns not listed in ``inputCols`` are passed through untouched.

  NOTE(review): Spark's ``log`` presumably yields null for values <= 0
  (e.g. a fare of 0) -- confirm that this is acceptable for the data.
  NOTE(review): this class does not mix in DefaultParamsReadable /
  DefaultParamsWritable, so a Pipeline containing it presumably cannot be
  saved with ``save()`` -- confirm before persisting pipelines.
  """

  @keyword_only
  def __init__(self, inputCols=None):
    super().__init__()
    # @keyword_only exposes the keyword arguments of this call as
    # self._input_kwargs; forward them unchanged to setParams.
    kwargs = self._input_kwargs
    self.setParams(**kwargs)

  @keyword_only
  def setParams(self, inputCols=None):
    """Set the params of this stage from keyword arguments."""
    kwargs = self._input_kwargs
    return self._set(**kwargs)

  def setInputCols(self, value):
    """Set ``inputCols``, the list of column names to log-transform."""
    return self.setParams(inputCols=value)

  def setInputCol(self, new_inputCols):
    """Backward-compatible alias of ``setInputCols`` (historical misnomer)."""
    return self.setInputCols(new_inputCols)

  def _transform(self, dataset):
    """Return ``dataset`` with each configured column replaced by its log.

    Raises:
      ValueError: if ``inputCols`` was never set or was set to ``None``.
    """
    # Fail with an explicit message instead of an opaque lookup/iteration
    # error when the stage has not been configured.
    targets = self.getInputCols() if self.isDefined(self.inputCols) else None
    if targets is None:
      raise ValueError(
          "LogScaler requires 'inputCols' to be set to a list of column names.")
    result = dataset
    for name in targets:
      # Reusing the existing name overwrites the column of the same name.
      result = result.withColumn(name, F.log(F.col(name)))
    return result

In [22]:
df_transformed_02.show()

+-----------+------+------+----+-------+--------+
|PassengerId|Pclass|   Sex| Age|   Fare|Survived|
+-----------+------+------+----+-------+--------+
|          1|     3|  male|22.0|   7.25|       0|
|          2|     1|female|38.0|71.2833|       1|
|          3|     3|female|26.0|  7.925|       1|
|          4|     1|female|35.0|   53.1|       1|
|          5|     3|  male|35.0|   8.05|       0|
|          6|     3|  male|null| 8.4583|       0|
|          7|     1|  male|54.0|51.8625|       0|
|          8|     3|  male| 2.0| 21.075|       0|
|          9|     3|female|27.0|11.1333|       1|
|         10|     2|female|14.0|30.0708|       1|
+-----------+------+------+----+-------+--------+



In [23]:
df_transformed_02.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: float (nullable = true)
 |-- Fare: float (nullable = true)
 |-- Survived: string (nullable = true)



In [24]:
Transformer_LogScaler = LogScaler(inputCols=['Age','Fare'])

In [25]:
df_transformed_03 = Transformer_LogScaler.transform(df_transformed_02)
df_transformed_03.show()

+-----------+------+------+------------------+------------------+--------+
|PassengerId|Pclass|   Sex|               Age|              Fare|Survived|
+-----------+------+------+------------------+------------------+--------+
|          1|     3|  male| 3.091042453358316|1.9810014688665833|       0|
|          2|     1|female|3.6375861597263857| 4.266662110752839|       1|
|          3|     3|female| 3.258096538021482|2.0700223438308347|       1|
|          4|     1|female|3.5553480614894135|3.9721768995119433|       1|
|          5|     3|  male|3.5553480614894135|2.0856721151242437|       0|
|          6|     3|  male|              null|2.1351481648641313|       0|
|          7|     1|  male|3.9889840465642745| 3.948595970948175|       0|
|          8|     3|  male|0.6931471805599453| 3.048087540089079|       0|
|          9|     3|female| 3.295836866004329|2.4099406018108263|       1|
|         10|     2|female|2.6390573296152584|3.4035546272161863|       1|
+-----------+------+-----

# 03. Pipelines

## 03.01. Pipeline

In [26]:
# Chain the three custom stages; they are listed in the order they must run:
# column selection -> cast Age/Fare to float -> log-transform Age/Fare.
pipeline_my = Pipeline(stages=[
  Transformer_FeatureSelector,
  Transformer_TypeAssignatorFloat, 
  Transformer_LogScaler
])

In [27]:
type(pipeline_my)

pyspark.ml.pipeline.Pipeline

In [28]:
# Every stage is a plain Transformer, so there is nothing to learn here;
# fit() returns a PipelineModel wrapping the stages.
pipelineModel = pipeline_my.fit(df_raw)

In [29]:
type(pipelineModel)

pyspark.ml.pipeline.PipelineModel

In [30]:
dir(pipeline_my)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_copyValues',
 '_copy_params',
 '_defaultParamMap',
 '_dummy',
 '_fit',
 '_from_java',
 '_input_kwargs',
 '_paramMap',
 '_params',
 '_randomUID',
 '_resetUid',
 '_resolveParam',
 '_set',
 '_setDefault',
 '_shouldOwn',
 '_testOwnParam',
 '_to_java',
 'clear',
 'copy',
 'explainParam',
 'explainParams',
 'extractParamMap',
 'fit',
 'fitMultiple',
 'getOrDefault',
 'getParam',
 'getStages',
 'hasDefault',
 'hasParam',
 'isDefined',
 'isSet',
 'load',
 'params',
 'read',
 'save',
 'set',
 'setPa

In [31]:
dir(pipelineModel)

['__abstractmethods__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_copyValues',
 '_copy_params',
 '_defaultParamMap',
 '_dummy',
 '_from_java',
 '_paramMap',
 '_params',
 '_randomUID',
 '_resetUid',
 '_resolveParam',
 '_set',
 '_setDefault',
 '_shouldOwn',
 '_testOwnParam',
 '_to_java',
 '_transform',
 'clear',
 'copy',
 'explainParam',
 'explainParams',
 'extractParamMap',
 'getOrDefault',
 'getParam',
 'hasDefault',
 'hasParam',
 'isDefined',
 'isSet',
 'load',
 'params',
 'read',
 'save',
 'set',
 'stages',
 'transform',
 'uid',
 'write']

In [32]:
# Apply the whole pipeline to the raw data in a single call.
transformedDF = pipelineModel.transform(df_raw)

# The displayed result matches df_transformed_03, which was built stage by stage.
transformedDF.show()

+-----------+------+------+------------------+------------------+--------+
|PassengerId|Pclass|   Sex|               Age|              Fare|Survived|
+-----------+------+------+------------------+------------------+--------+
|          1|     3|  male| 3.091042453358316|1.9810014688665833|       0|
|          2|     1|female|3.6375861597263857| 4.266662110752839|       1|
|          3|     3|female| 3.258096538021482|2.0700223438308347|       1|
|          4|     1|female|3.5553480614894135|3.9721768995119433|       1|
|          5|     3|  male|3.5553480614894135|2.0856721151242437|       0|
|          6|     3|  male|              null|2.1351481648641313|       0|
|          7|     1|  male|3.9889840465642745| 3.948595970948175|       0|
|          8|     3|  male|0.6931471805599453| 3.048087540089079|       0|
|          9|     3|female| 3.295836866004329|2.4099406018108263|       1|
|         10|     2|female|2.6390573296152584|3.4035546272161863|       1|
+-----------+------+-----

In [33]:
type(df_raw)

pyspark.sql.dataframe.DataFrame

In [34]:
dir(df_raw)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_collect_as_arrow',
 '_jcols',
 '_jdf',
 '_jmap',
 '_joinAsOf',
 '_jseq',
 '_lazy_rdd',
 '_repr_html_',
 '_sc',
 '_schema',
 '_session',
 '_sort_cols',
 '_sql_ctx',
 '_support_repr_html',
 '_to_corrected_pandas_type',
 'agg',
 'alias',
 'approxQuantile',
 'cache',
 'checkpoint',
 'coalesce',
 'colRegex',
 'collect',
 'columns',
 'corr',
 'count',
 'cov',
 'createGlobalTempView',
 'createOrReplaceGlobalTempView',
 'createOrReplaceTempView',
 'createTempView',
 'crossJoin',
 'crosstab',
 'cube',
 'describe',
 'distinct',
 'drop',
 'dropDuplicates',
 'drop_duplicates',
 'dropna',
 'dtypes',
 