## classification on the dataset where null values are removed

In [1]:
import findspark
findspark.init('/home/abhi/spark-2.2.1-bin-hadoop2.7')

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('classifier').getOrCreate()


In [3]:
dataframe = spark.read.csv('/home/abhi/project/cleaned_data.csv',inferSchema=True,header=True,sep=',')

## schema of the dataframe

In [4]:
dataframe.printSchema()

root
 |-- Var6: integer (nullable = true)
 |-- Var7: integer (nullable = true)
 |-- Var13: integer (nullable = true)
 |-- Var21: integer (nullable = true)
 |-- Var22: integer (nullable = true)
 |-- Var24: integer (nullable = true)
 |-- Var25: integer (nullable = true)
 |-- Var28: double (nullable = true)
 |-- Var35: integer (nullable = true)
 |-- Var38: integer (nullable = true)
 |-- Var44: integer (nullable = true)
 |-- Var57: double (nullable = true)
 |-- Var65: integer (nullable = true)
 |-- Var72: integer (nullable = true)
 |-- Var73: integer (nullable = true)
 |-- Var74: integer (nullable = true)
 |-- Var76: integer (nullable = true)
 |-- Var78: integer (nullable = true)
 |-- Var81: double (nullable = true)
 |-- Var83: integer (nullable = true)
 |-- Var85: integer (nullable = true)
 |-- Var94: integer (nullable = true)
 |-- Var109: integer (nullable = true)
 |-- Var112: integer (nullable = true)
 |-- Var113: double (nullable = true)
 |-- Var119: integer (nullable = true)
 |-- Var1

## number of attributes in the dataframe

In [5]:
len(dataframe.columns)

75

## number of rows in the dataframe

In [6]:
total_rows = dataframe.count()

In [7]:
total_rows 

3238

## classifying the columns based on the type of data present in the column

In [8]:
string_list = []
num_list = []
for name,dtype in dataframe.dtypes:
    if(dtype == 'string'):
        string_list.append(name)
    if(dtype != 'string'):
        num_list.append(name)

## appending the column name label to string list

In [9]:
string_list.append('Label')

## list of the columns that contain categorical data

In [10]:
string_list

['Var192',
 'Var193',
 'Var195',
 'Var196',
 'Var197',
 'Var198',
 'Var199',
 'Var200',
 'Var202',
 'Var203',
 'Var204',
 'Var205',
 'Var206',
 'Var207',
 'Var208',
 'Var210',
 'Var211',
 'Var212',
 'Var214',
 'Var216',
 'Var217',
 'Var218',
 'Var219',
 'Var220',
 'Var221',
 'Var222',
 'Var223',
 'Var225',
 'Var226',
 'Var227',
 'Var228',
 'Var229',
 'Label']

In [11]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler,OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

## using string indexer to convert the categorical data into numeric

In [12]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(dataframe) for column in string_list]


## using pipeline to execute the operation in stages

In [13]:
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(dataframe).transform(dataframe)


## getting the list of columns(categorical) that are converted into numerical

In [14]:
index_list = []
for name,dtype  in df_r.dtypes:
    if(name.endswith('_index')):
         index_list.append(name)

## using onehotencoder to convert the categorical data into vector

In [15]:
encoders = [OneHotEncoder(inputCol=column, outputCol=column+"vec") for column in index_list]


In [16]:
pipeline = Pipeline(stages=encoders)
df_r = pipeline.fit(df_r).transform(df_r)


## schema of the dataframe after applying onehotencoder

In [17]:
df_r.printSchema()

root
 |-- Var6: integer (nullable = true)
 |-- Var7: integer (nullable = true)
 |-- Var13: integer (nullable = true)
 |-- Var21: integer (nullable = true)
 |-- Var22: integer (nullable = true)
 |-- Var24: integer (nullable = true)
 |-- Var25: integer (nullable = true)
 |-- Var28: double (nullable = true)
 |-- Var35: integer (nullable = true)
 |-- Var38: integer (nullable = true)
 |-- Var44: integer (nullable = true)
 |-- Var57: double (nullable = true)
 |-- Var65: integer (nullable = true)
 |-- Var72: integer (nullable = true)
 |-- Var73: integer (nullable = true)
 |-- Var74: integer (nullable = true)
 |-- Var76: integer (nullable = true)
 |-- Var78: integer (nullable = true)
 |-- Var81: double (nullable = true)
 |-- Var83: integer (nullable = true)
 |-- Var85: integer (nullable = true)
 |-- Var94: integer (nullable = true)
 |-- Var109: integer (nullable = true)
 |-- Var112: integer (nullable = true)
 |-- Var113: double (nullable = true)
 |-- Var119: integer (nullable = true)
 |-- Var1

## selecting the required columns(features)

In [18]:
features_list = []
for name,dtype in df_r.dtypes:
    if(name != 'Label'and name != 'Label_indexvec' and dtype != 'string' and name != 'Label_index' and name.endswith("_index") != True):
        features_list.append(name)

## list of feature columns names

In [19]:
features_list

['Var6',
 'Var7',
 'Var13',
 'Var21',
 'Var22',
 'Var24',
 'Var25',
 'Var28',
 'Var35',
 'Var38',
 'Var44',
 'Var57',
 'Var65',
 'Var72',
 'Var73',
 'Var74',
 'Var76',
 'Var78',
 'Var81',
 'Var83',
 'Var85',
 'Var94',
 'Var109',
 'Var112',
 'Var113',
 'Var119',
 'Var123',
 'Var125',
 'Var126',
 'Var132',
 'Var133',
 'Var134',
 'Var140',
 'Var143',
 'Var144',
 'Var149',
 'Var153',
 'Var160',
 'Var163',
 'Var173',
 'Var181',
 'Var189',
 'Var192_indexvec',
 'Var193_indexvec',
 'Var195_indexvec',
 'Var196_indexvec',
 'Var197_indexvec',
 'Var198_indexvec',
 'Var199_indexvec',
 'Var200_indexvec',
 'Var202_indexvec',
 'Var203_indexvec',
 'Var204_indexvec',
 'Var205_indexvec',
 'Var206_indexvec',
 'Var207_indexvec',
 'Var208_indexvec',
 'Var210_indexvec',
 'Var211_indexvec',
 'Var212_indexvec',
 'Var214_indexvec',
 'Var216_indexvec',
 'Var217_indexvec',
 'Var218_indexvec',
 'Var219_indexvec',
 'Var220_indexvec',
 'Var221_indexvec',
 'Var222_indexvec',
 'Var223_indexvec',
 'Var225_indexvec',
 '

## using vector assembler to zip the features columns into one column

In [20]:
assembler = VectorAssembler(inputCols=features_list,outputCol="features")

In [21]:
output = assembler.transform(df_r)

## creating a new dataframe with  feature column and label column

In [22]:
final_data = output.select('Label_index','features')

## schema of the final dataframe

In [23]:
final_data.printSchema()

root
 |-- Label_index: double (nullable = true)
 |-- features: vector (nullable = true)



## applying decisontreeclassifier 

In [24]:
dt = DecisionTreeClassifier(labelCol='Label_index',featuresCol='features')

## splitting 70% of data for training and 30% into testing

In [25]:
training_data,test_data = final_data.randomSplit([0.7,0.3])

## schema of training_data 

In [26]:
training_data.printSchema()

root
 |-- Label_index: double (nullable = true)
 |-- features: vector (nullable = true)



## schema of test_data

In [27]:
test_data.printSchema()

root
 |-- Label_index: double (nullable = true)
 |-- features: vector (nullable = true)



## first row of training_data 

In [28]:
training_data.head()

Row(Label_index=0.0, features=SparseVector(15055, {0: 1267.0, 1: 7.0, 2: 108.0, 3: 504.0, 4: 630.0, 5: 8.0, 6: 136.0, 7: 100.8, 8: 5.0, 9: 5759904.0, 10: 9.0, 11: 2.0598, 12: 9.0, 13: 3.0, 14: 52.0, 15: 35.0, 16: 1772592.0, 17: 6.0, 18: 103393.5, 19: 35.0, 20: 10.0, 21: 141747.0, 22: 64.0, 23: 104.0, 24: 184596.4, 25: 1985.0, 26: 186.0, 27: 7812.0, 28: -20.0, 29: 32.0, 30: 2932195.0, 31: 415204.0, 32: 850.0, 34: 9.0, 35: 515270.0, 36: 10199160.0, 37: 106.0, 38: 689622.0, 41: 306.0, 95: 1.0, 263: 1.0, 291: 1.0, 302: 1.0, 307: 1.0, 607: 1.0, 1458: 1.0, 4354: 1.0, 5500: 1.0, 7057: 1.0, 7069: 1.0, 7155: 1.0, 7165: 1.0, 7177: 1.0, 7185: 1.0, 7186: 1.0, 7190: 1.0, 7823: 1.0, 10263: 1.0, 10962: 1.0, 12975: 1.0, 13129: 1.0, 13992: 1.0, 14133: 1.0, 14999: 1.0, 15003: 1.0, 15004: 1.0, 15026: 1.0, 15032: 1.0, 15052: 1.0}))

## first row of the test_data

In [29]:
test_data.head()

Row(Label_index=0.0, features=SparseVector(15055, {0: 2394.0, 1: 21.0, 2: 3836.0, 3: 208.0, 4: 260.0, 5: 4.0, 6: 72.0, 7: 114.64, 8: 5.0, 9: 2341650.0, 10: 9.0, 11: 5.4935, 12: 27.0, 13: 9.0, 14: 180.0, 15: 420.0, 16: 3136968.0, 17: 18.0, 18: 63711.3, 19: 5.0, 20: 18.0, 21: 125739.0, 22: 24.0, 23: 48.0, 24: -387325.2, 25: 1465.0, 26: 48.0, 27: 74808.0, 28: -30.0, 29: 72.0, 30: 2734495.0, 31: 241708.0, 32: 8300.0, 34: 36.0, 35: 1049167.0, 36: 6087000.0, 37: 38.0, 38: 345114.0, 40: 7.0, 41: 276.0, 151: 1.0, 263: 1.0, 291: 1.0, 302: 1.0, 315: 1.0, 455: 1.0, 1563: 1.0, 4438: 1.0, 5917: 1.0, 7057: 1.0, 7060: 1.0, 7155: 1.0, 7159: 1.0, 7178: 1.0, 7185: 1.0, 7186: 1.0, 7189: 1.0, 7191: 1.0, 9297: 1.0, 10267: 1.0, 12758: 1.0, 12974: 1.0, 12975: 1.0, 12991: 1.0, 13992: 1.0, 13998: 1.0, 14999: 1.0, 15002: 1.0, 15007: 1.0, 15027: 1.0, 15035: 1.0, 15052: 1.0}))

## top 20rows of features column

In [30]:
final_data.select(final_data.columns[1]).show()

+--------------------+
|            features|
+--------------------+
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
|(15055,[0,1,2,3,4...|
+--------------------+
only showing top 20 rows



## building model

In [31]:
model = dt.fit(training_data)

## predictions on the test_data

In [32]:
predictions = model.transform(test_data)

## count of each class label

In [33]:
output.groupBy('Label_index').count().show()

+-----------+-----+
|Label_index|count|
+-----------+-----+
|        0.0| 3135|
|        1.0|  103|
+-----------+-----+



## Evaluation metrics

In [34]:
from pyspark.mllib.evaluation import MulticlassMetrics


## schema of the predictions

In [35]:
predictions.printSchema()

root
 |-- Label_index: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [36]:
pl = predictions.select(['prediction','Label_index'])

## using MulticlassMetrics for evalution

In [37]:
metrics = MulticlassMetrics(pl.rdd)

## True postive rate

In [38]:
metrics.recall(1)

0.0

## Accuracy(positive rate)

In [39]:
metrics.accuracy

0.9623389494549058

## confusion matrix evaluation on test data

In [40]:
tp = pl[(pl.Label_index == 1.0) & (pl.prediction == 1.0)].count()
tn = pl[(pl.Label_index == 0.0) & (pl.prediction == 0.0)].count()
fp = pl[(pl.Label_index == 0.0) & (pl.prediction == 1.0)].count()
fn = pl[(pl.Label_index == 1.0) & (pl.prediction == 0.0)].count()

In [41]:
print("true positive",tp)
print("true negatives",tn)
print("false postives",fp)
print("false negatives",fn)

true positive 0
true negatives 971
false postives 8
false negatives 30
