In [1]:
import warnings
warnings.filterwarnings('ignore')

import datetime
import pandas as pd
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
spark = SparkSession.builder \
    .master("local") \
    .appName("MoveoutPrediction") \
    .getOrCreate()

In [4]:
pandasDF = pd.read_excel( "./../data/714314_Avana_Rancho_Cucamonga.xlsx" )
moveoutSchema = StructType( [ StructField("id", LongType() )\
                       ,StructField( "name", StringType() )\
                       ,StructField( "start_date", StringType() )\
                       ,StructField( "move_out_date", StringType(), True )\
                       ,StructField( "end_date", StringType(), True )\
                       ,StructField( "moved_out", ShortType() ) ] )

In [5]:
sdf = spark.createDataFrame( pandasDF, schema = moveoutSchema )
sdf = sdf.withColumn( "months", round( months_between( col( "move_out_date" ), col( "start_date" ) ) ) )


sdf = sdf.replace('NaN', None)
# show only 5 rows
sdf.show(5)

+--------+-----------------+----------+-------------+----------+---------+------+
|      id|             name|start_date|move_out_date|  end_date|moved_out|months|
+--------+-----------------+----------+-------------+----------+---------+------+
|13502853|     Carol Newman|2021-07-23|         null|2023-07-19|        1|  null|
|13026843|Anthony Hernandez|2019-03-28|   2019-06-06|2020-02-21|        0|   2.0|
|13132170|Nuha Abu Mayyaleh|2019-09-06|   2020-10-12|2020-10-12|        0|  13.0|
|13620080|Anjanette Ballard|2022-01-05|         null|2023-05-04|        1|  null|
|13481247|       Joseph Kim|2021-07-21|   2022-06-24|2022-06-24|        0|  11.0|
+--------+-----------------+----------+-------------+----------+---------+------+
only showing top 5 rows



In [6]:
approxQuantile = sdf.approxQuantile("months", [ 0.5 ], 0.25)
print(approxQuantile)
sdf = sdf.na.fill

[12.0]


In [9]:
# survival analysis
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

assembler = VectorAssembler(
    inputCols=[ "months" ],
    outputCol="features" )

modelDf = assembler.transform( sdf ).select( col("id").alias("label"), col("features"), col("moved_out").alias("censor") )

aftsr = AFTSurvivalRegression()
aftsr.setMaxIter(10)
model = aftsr.fit(modelDf)



model.transform(modelDf).show(5)

+--------+--------+------+--------------------+
|   label|features|censor|          prediction|
+--------+--------+------+--------------------+
|13026843|   [2.0]|     0|3.165306948511722E16|
|13132170|  [13.0]|     0|3.069828887084966E16|
|13481247|  [11.0]|     0|3.086971732142145...|
|13304982|  [18.0]|     0| 3.02738711513746E16|
|13310235|  [16.0]|     0|3.044292952613040...|
+--------+--------+------+--------------------+
only showing top 5 rows

