# Preprocessing

## Removing the rows that contain null values

In [1]:
import findspark

In [2]:
import os

# Locate the Spark installation. An exported SPARK_HOME takes precedence so the
# notebook is not tied to one machine; the original local install path is kept
# as the fallback when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/abhi/spark-2.2.1-bin-hadoop2.7'))

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Reuse the active session if there is one, otherwise start one named 'preprocess'.
spark = (
    SparkSession.builder
    .appName('preprocess')
    .getOrCreate()
)

# ORIGINAL DATA

In [5]:
# Tab-separated feature table: the first line holds the column names and the
# column types are inferred by Spark from the data.
dataframe = spark.read.csv(
    '/home/abhi/project/orange_small_train.data',
    sep='\t',
    header=True,
    inferSchema=True,
)

# Number Of Rows in Original Dataset

In [6]:
# Action: runs a Spark job over the raw feature table and returns its row count.
dataframe.count()

50000

# GROUND TRUTH

In [7]:
# Read with Spark's CSV defaults (no header row, no schema inference), so the
# column gets the auto-generated name `_c0`, which a later cell renames.
churn_label=spark.read.csv('/home/abhi/project/orange_small_train_churn.labels')

# Number of rows in Ground Truth

In [8]:
# Row count of the label table. It has to equal the feature table's row count,
# because the two frames are later paired row by row.
churn_label.count()

50000

# Number of Attributes (Columns) in Original dataset

In [9]:
# Number of columns read from the tab-separated feature file.
len(dataframe.columns)

230

# Column-wise Null count

In [10]:
from pyspark.sql.functions import isnull, when, count, col

# One aggregate per column, producing a single-row frame of null counts.
# `when` is given the column name as a plain Python string, so it emits that
# string as a non-null literal wherever the column is null (and null elsewhere);
# `count` then tallies the non-null markers, i.e. the null rows of the column.
per_column_nulls = [
    count(when(isnull(name), name)).alias(name)
    for name in dataframe.columns
]
null_count = dataframe.select(per_column_nulls)

# Selecting the columns in which fewer than 60% of the rows are null

In [11]:
# A column is kept only if fewer than this fraction of its rows are null.
MAX_NULL_FRACTION = 0.60

nRows = dataframe.count()

# collect() is an action: call it once up front. Calling it inside the loop would
# re-run the whole null-count aggregation as a separate Spark job per column.
null_totals = null_count.collect()[0]
null_limit = MAX_NULL_FRACTION * nRows

# Names of the columns whose null count is strictly below the limit, in the
# original column order.
my_list = [
    name
    for name, n_null in zip(null_count.columns, null_totals)
    if n_null < null_limit
]

# Number of columns that satisfy the given threshold

In [12]:
# How many columns passed the null-ratio filter (assigned only; nothing is displayed).
column_len = len(my_list)

# Creating a new dataframe for the columns that satisfy threshold

In [13]:
# Keep only the columns collected in my_list, i.e. those under the null threshold.
final_data=dataframe.select(my_list)

# Number of rows in new dataframe (final_data)

In [14]:
# Selecting a subset of columns does not remove rows, so this should match the
# row count of the original table.
final_data.count()

50000

# Number of rows that don't contain any NULL value

In [15]:
# Rows left when every row holding at least one null is dropped (dropna defaults
# to how='any'). The result is only counted here; final_data itself is unchanged.
final_data.dropna().count()

3238

# Using lit() to add a new column that contains only the value 1 in each row

In [16]:
from pyspark.sql.functions import lit

# Attach the same constant column to both frames; it serves as the single
# partition key of the window used to number the rows.
constant_one = lit(1)
final_data = final_data.withColumn("order", constant_one)
churn_label = churn_label.withColumn("order", constant_one)

# creating another column rowNum that contains consecutive values

In [17]:
from pyspark.sql.types import *
from pyspark.sql import Row, functions as F
from pyspark.sql.window import Window

# Number the rows of each frame 1..N so features and labels can be paired by position.
#
# The window must not be ordered by the constant "order" column: every row ties,
# so Spark may hand out the numbers in any order and a feature row could end up
# paired with the wrong label. Ordering by monotonically_increasing_id() follows
# the partition / record order in which each frame is produced instead.
# NOTE(review): this assumes each file's rows are read in file order -- confirm
# for the deployment (single input file per frame).
#
# withColumn (rather than select("*", ...)) replaces an existing rowNum, so
# re-running this cell does not add a duplicate column.
position_window = Window.partitionBy("order").orderBy(F.monotonically_increasing_id())

final_data = final_data.withColumn("rowNum", F.row_number().over(position_window))
churn_label = churn_label.withColumn("rowNum", F.row_number().over(position_window))

# schema of final_data

In [18]:
# Print column names and inferred types, including the helper columns added above.
final_data.printSchema()

root
 |-- Var6: integer (nullable = true)
 |-- Var7: integer (nullable = true)
 |-- Var13: integer (nullable = true)
 |-- Var21: integer (nullable = true)
 |-- Var22: integer (nullable = true)
 |-- Var24: integer (nullable = true)
 |-- Var25: integer (nullable = true)
 |-- Var28: double (nullable = true)
 |-- Var35: integer (nullable = true)
 |-- Var38: integer (nullable = true)
 |-- Var44: integer (nullable = true)
 |-- Var57: double (nullable = true)
 |-- Var65: integer (nullable = true)
 |-- Var72: integer (nullable = true)
 |-- Var73: integer (nullable = true)
 |-- Var74: integer (nullable = true)
 |-- Var76: integer (nullable = true)
 |-- Var78: integer (nullable = true)
 |-- Var81: double (nullable = true)
 |-- Var83: integer (nullable = true)
 |-- Var85: integer (nullable = true)
 |-- Var94: integer (nullable = true)
 |-- Var109: integer (nullable = true)
 |-- Var112: integer (nullable = true)
 |-- Var113: double (nullable = true)
 |-- Var119: integer (nullable = true)
 |-- Var1

# Joining the two dataframes, final_data and churn_label (ground truth)

In [19]:
# Inner join (the default join type): pair each feature row with the label row
# that carries the same rowNum.
row_match = final_data["rowNum"] == churn_label["rowNum"]
result = final_data.join(churn_label, on=row_match, how="inner")

# schema of result dataframe

In [20]:
# Inspect the joined schema: the feature frame's columns followed by the label
# frame's columns.
result.printSchema()

root
 |-- Var6: integer (nullable = true)
 |-- Var7: integer (nullable = true)
 |-- Var13: integer (nullable = true)
 |-- Var21: integer (nullable = true)
 |-- Var22: integer (nullable = true)
 |-- Var24: integer (nullable = true)
 |-- Var25: integer (nullable = true)
 |-- Var28: double (nullable = true)
 |-- Var35: integer (nullable = true)
 |-- Var38: integer (nullable = true)
 |-- Var44: integer (nullable = true)
 |-- Var57: double (nullable = true)
 |-- Var65: integer (nullable = true)
 |-- Var72: integer (nullable = true)
 |-- Var73: integer (nullable = true)
 |-- Var74: integer (nullable = true)
 |-- Var76: integer (nullable = true)
 |-- Var78: integer (nullable = true)
 |-- Var81: double (nullable = true)
 |-- Var83: integer (nullable = true)
 |-- Var85: integer (nullable = true)
 |-- Var94: integer (nullable = true)
 |-- Var109: integer (nullable = true)
 |-- Var112: integer (nullable = true)
 |-- Var113: double (nullable = true)
 |-- Var119: integer (nullable = true)
 |-- Var1

# dropping order and rowNum  columns

In [21]:
# Remove the join helpers. After the join each of these names occurs twice (once
# per side); drop-by-name is expected to remove every match -- verify with
# printSchema() if in doubt.
result = result.drop("order","rowNum")

# number of rows in result dataframe

In [22]:
# Sanity check: the join on rowNum should keep every row of both inputs.
result.count()

50000

# Keeping only the rows that don't contain any null value

In [23]:
# Keep only fully populated rows (dropna defaults to how='any'). Rebinding `result`
# means the unfiltered join is no longer reachable under this name.
result = result.dropna()

In [24]:
# Number of complete rows that remain for the export.
result.count()

3238

# Renaming the column _c0 to Label

In [25]:
# `_c0` is the auto-generated name of the header-less label column; give it a
# descriptive name before exporting.
result = result.withColumnRenamed("_c0","Label")

# Generating a csv file of result

In [27]:
# Collapse to one partition so the export is written as a single part file.
# mode='overwrite' replaces a previous export: with the default mode the write
# raises when the target directory already exists, which breaks re-running the
# notebook from a fresh kernel.
result.repartition(1).write.csv(
    '/home/abhi/project/cleaned_data1',
    sep=',',
    header=True,
    mode='overwrite',
)