In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from sklearn import svm

# Eager pandas copies of the two source tables (credits first, then demographics).
creditos, demografico = (
    pd.read_parquet(path, engine='pyarrow')
    for path in ('creditos.parquet', 'demografico.parquet')
)

# Obtain (or reuse) the Spark session used for the SQL join below.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting —
# confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .appName("Challenge")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Inner-join both parquet files on customerid, keeping the transaction and
# demographic columns, sorted by transaction amount in descending order.
# The adjacent literals concatenate into a single SQL statement.
df = spark.sql(
    "SELECT cr.customerid, cr.nombre_transaccion, cr.monto_transaccion, "
    "de.maritalstatus, de.gender, de.edad "
    "FROM parquet.`creditos.parquet` as cr "
    "INNER JOIN parquet.`demografico.parquet` as de "
    "ON cr.customerid == de.customerid "
    "ORDER BY cr.monto_transaccion DESC"
)
df.show()

+----------+------------------+-----------------+-------------+------+----+
|customerid|nombre_transaccion|monto_transaccion|maritalstatus|gender|edad|
+----------+------------------+-----------------+-------------+------+----+
|      5120|      SUPER AVANCE|       5370569.00|       CASADO|     M|72.0|
|     40214|      SUPER AVANCE|       5370569.00|       CASADO|     M|74.0|
|     87958|      SUPER AVANCE|       5370569.00|      SOLTERO|     F|72.0|
|     40214|      SUPER AVANCE|       5370569.00|       CASADO|     M|74.0|
|     23189|       AVANCE CAJA|       5370569.00|       CASADO|     F|71.0|
|     11271|      SUPER AVANCE|       5005371.00|       CASADO|     F|75.0|
|     92959|      SUPER AVANCE|       5000000.00|       CASADO|     M|75.0|
|     81587|      SUPER AVANCE|       5000000.00|       CASADO|     F|75.0|
|     93189|      SUPER AVANCE|       5000000.00|       CASADO|     M|73.0|
|     16227|      SUPER AVANCE|       5000000.00|       CASADO|     F|71.0|
|     60013|

In [82]:
# Materialise the joined Spark result as a pandas DataFrame.
pdf = df.toPandas()

# Integer code assigned to each known label, per source column.
# Kept as an ordered sequence (not a dict) so the derived "<column>_map"
# columns are always appended in the same order, which keeps the column order
# of the frames built from `pdf` stable.
CATEGORY_CODES = (
    ('maritalstatus', {'SOLTERO': 0, 'CASADO': 1, 'VIUDO': 2, 'NO INF': 3}),
    ('gender', {'F': 0, 'M': 1}),
    ('nombre_transaccion', {'AVANCE CAJA': 0, 'SUPER AVANCE': 1}),
)

for column, codes in CATEGORY_CODES:
    encoded = pdf[column].map(codes)
    # A label without a code (or a null) maps to NaN. Fail here with the
    # offending values instead of letting NaN leak into the encoded column.
    unmapped = pdf.loc[encoded.isna(), column].unique().tolist()
    if unmapped:
        raise ValueError(
            "Unmapped values in column {!r}: {!r}".format(column, unmapped)
        )
    # Stored as float, the dtype these encoded columns have always had at
    # this stage.
    pdf[column + '_map'] = encoded.astype(float)

# Drop the identifier and the raw categorical columns now that their encoded
# counterparts exist.
drop_elements = ['customerid', 'nombre_transaccion', 'maritalstatus', 'gender']
pdf_map = pdf.drop(drop_elements, axis = 1)
pdf_map

Unnamed: 0,monto_transaccion,edad,maritalstatus_map,gender_map,nombre_transaccion_map
0,5370569.00,71.0,1.0,0.0,0.0
1,5370569.00,72.0,0.0,0.0,1.0
2,5370569.00,72.0,1.0,1.0,1.0
3,5370569.00,74.0,1.0,1.0,1.0
4,5370569.00,74.0,1.0,1.0,1.0
5,5005371.00,75.0,1.0,0.0,1.0
6,5000000.00,70.0,1.0,0.0,1.0
7,5000000.00,71.0,1.0,0.0,0.0
8,5000000.00,75.0,1.0,1.0,0.0
9,5000000.00,71.0,1.0,0.0,1.0


In [111]:
# Cast every column of pdf_map to int in one step instead of one cell per
# column, so a partial run cannot leave the frame with mixed dtypes.
# Note: the int cast discards any fractional part and raises if a column
# still contains NaN.
pdf_map = pdf_map.astype({
    'monto_transaccion': int,
    'edad': int,
    'maritalstatus_map': int,
    'gender_map': int,
    'nombre_transaccion_map': int,
})

In [116]:
# Display the frame to check its columns after the integer casts above.
pdf_map

Unnamed: 0,monto_transaccion,edad,maritalstatus_map,gender_map,nombre_transaccion_map
0,5370569,71,1,0,0
1,5370569,72,0,0,1
2,5370569,72,1,1,1
3,5370569,74,1,1,1
4,5370569,74,1,1,1
5,5005371,75,1,0,1
6,5000000,70,1,0,1
7,5000000,71,1,0,0
8,5000000,75,1,1,0
9,5000000,71,1,0,1


In [89]:
# Feature matrix: every column of pdf_map except monto_transaccion,
# which is removed here.
drop_elements = ['monto_transaccion']
X = pdf_map.drop(columns=drop_elements)

In [90]:
# Display the feature matrix (pdf_map without the monto_transaccion column).
X

Unnamed: 0,edad,maritalstatus_map,gender_map,nombre_transaccion_map
0,71,1,0,0
1,72,0,0,1
2,72,1,1,1
3,74,1,1,1
4,74,1,1,1
5,75,1,0,1
6,70,1,0,1
7,71,1,0,0
8,75,1,1,0
9,71,1,0,1


In [91]:
# Regression target: the monto_transaccion column of pdf_map.
y = pdf_map.loc[:, 'monto_transaccion']

In [92]:
# Support-vector regressor created with scikit-learn's default hyperparameters.
# NOTE(review): X and y are passed unscaled (the previewed rows show ages in
# the 70s and amounts in the millions), and the two recorded predictions in
# this notebook differ by only 5. The model may be collapsing to a
# near-constant output — TODO confirm, and consider scaling features/target
# or tuning C and epsilon.
clf = svm.SVR()
# Fit on feature matrix X and target y. As the last expression of the cell,
# the value returned by fit() is what the notebook displays.
clf.fit(X, y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [142]:
# Predicted amount for a customer with edad = 66, CASADO (1), gender M (1),
# requesting a SUPER AVANCE (1) credit.
# NOTE(review): the previous description of this cell said "AC(0)" and an
# amount of 1.000.005, but the input has always carried
# nombre_transaccion_map = 1 (SUPER AVANCE) and the recorded output is 100005.
# Confirm which scenario was intended.
# The input is built by column name and aligned to X's columns, so the values
# cannot be placed in the wrong feature position.
customer = pd.DataFrame(
    [{
        'edad': 66,
        'maritalstatus_map': 1,
        'gender_map': 1,
        'nombre_transaccion_map': 1,
    }],
    columns=X.columns,
)
r = clf.predict(customer)
int(r[0])

100005

In [141]:
# Predicted amount for a customer with edad = 66, CASADO (1), gender M (1),
# requesting an AVANCE CAJA (0) credit.
# NOTE(review): the previous description of this cell said "FEMENINO(0)" and
# an amount of 1.000.000, but the input has always carried gender_map = 1 (M)
# and the recorded output is 100000. For a female customer gender_map must be
# 0 — confirm which scenario was intended.
# The input is built by column name and aligned to X's columns, so the values
# cannot be placed in the wrong feature position.
customer = pd.DataFrame(
    [{
        'edad': 66,
        'maritalstatus_map': 1,
        'gender_map': 1,
        'nombre_transaccion_map': 0,
    }],
    columns=X.columns,
)
r = clf.predict(customer)
int(r[0])

100000