## classification on the dataset where null values are filled with mean and mode

In [1]:
import os

import findspark

# Locate the Spark installation. SPARK_HOME wins when it is set, so the notebook can run on
# other machines; the original hard-coded location remains the fallback.
findspark.init(os.environ.get('SPARK_HOME') or '/home/abhi/spark-2.2.1-bin-hadoop2.7')

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('classifier').getOrCreate()


In [3]:
# Load the cleaned CSV: the first line is used as the header and column types are inferred.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
dataframe = spark.read.csv('/home/abhi/project/cleaned_data_d3/d3.csv',inferSchema=True,header=True)

## schema of the dataframe

In [4]:
dataframe.printSchema()

root
 |-- Var6: integer (nullable = true)
 |-- Var7: integer (nullable = true)
 |-- Var13: integer (nullable = true)
 |-- Var21: integer (nullable = true)
 |-- Var22: integer (nullable = true)
 |-- Var24: integer (nullable = true)
 |-- Var25: integer (nullable = true)
 |-- Var28: double (nullable = true)
 |-- Var35: integer (nullable = true)
 |-- Var38: integer (nullable = true)
 |-- Var44: integer (nullable = true)
 |-- Var57: double (nullable = true)
 |-- Var65: integer (nullable = true)
 |-- Var72: integer (nullable = true)
 |-- Var73: integer (nullable = true)
 |-- Var74: integer (nullable = true)
 |-- Var76: integer (nullable = true)
 |-- Var78: integer (nullable = true)
 |-- Var81: double (nullable = true)
 |-- Var83: integer (nullable = true)
 |-- Var85: integer (nullable = true)
 |-- Var94: integer (nullable = true)
 |-- Var109: integer (nullable = true)
 |-- Var112: integer (nullable = true)
 |-- Var113: double (nullable = true)
 |-- Var119: integer (nullable = true)
 |-- Var1

## number of attributes in the dataframe

In [5]:
len(dataframe.columns)

75

## number of rows in the dataframe

In [6]:
total_rows = dataframe.count()

In [7]:
total_rows 

50000

In [8]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler,OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

## classifying the columns based on the type of data present in the column

In [9]:
# Partition the column names by dtype: 'string' columns are treated as categorical,
# everything else as numeric. Each column lands in exactly one of the two lists.
string_list, num_list = [], []
for name, dtype in dataframe.dtypes:
    (string_list if dtype == 'string' else num_list).append(name)

## appending the column name label to string list

In [10]:
# 'Label' must be string-indexed as well, so it is added to the categorical list by hand.
# The membership guard keeps this cell idempotent: appending 'Label' a second time would
# give two indexers the same output column ('Label_index').
if 'Label' not in string_list:
    string_list.append('Label')

## list of the columns that contain categorical data

In [11]:
string_list

['Var192',
 'Var193',
 'Var195',
 'Var196',
 'Var197',
 'Var198',
 'Var199',
 'Var200',
 'Var202',
 'Var203',
 'Var204',
 'Var205',
 'Var206',
 'Var207',
 'Var208',
 'Var210',
 'Var211',
 'Var212',
 'Var214',
 'Var216',
 'Var217',
 'Var218',
 'Var219',
 'Var220',
 'Var221',
 'Var222',
 'Var223',
 'Var225',
 'Var226',
 'Var227',
 'Var228',
 'Var229',
 'Label']

## using string indexer to convert the categorical data into numeric

In [12]:
# One StringIndexer per entry of string_list (the categorical columns plus 'Label').
# Each indexer is fitted on the full dataframe right here, so `indexers` holds fitted
# models that write their result to "<column>_index".
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(dataframe) for column in string_list]


## using pipeline to execute the operation in stages

In [13]:
# Chain all indexers and add every "<column>_index" column to the dataframe in one transform.
# NOTE(review): the stages were already fitted when `indexers` was built, so presumably the
# pipeline's own fit() has nothing left to learn — confirm.
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(dataframe).transform(dataframe)


## getting the list of columns(categorical) that are converted into numerical

In [14]:
# Names of the columns produced by the string indexers, in dataframe order
# (every column whose name ends in "_index").
index_list = [column for column in df_r.columns if column.endswith('_index')]

## using onehotencoder to convert the categorical data into vector

In [15]:
# One OneHotEncoder per "<column>_index" column, writing to "<column>_indexvec".
# index_list also contains 'Label_index', so the label is one-hot encoded too
# (as 'Label_indexvec').
encoders = [OneHotEncoder(inputCol=column, outputCol=column+"vec") for column in index_list]


In [16]:
# `pipeline` and `df_r` are both rebound here; afterwards df_r also carries the
# "<column>_indexvec" columns.
# NOTE(review): df_r is overwritten with its own transform, so re-running this cell without
# first re-running the indexing cell likely fails on already-existing output columns.
pipeline = Pipeline(stages=encoders)
df_r = pipeline.fit(df_r).transform(df_r)


## schema of the dataframe after applying onehotencoder

In [17]:
df_r.printSchema()

root
 |-- Var6: integer (nullable = true)
 |-- Var7: integer (nullable = true)
 |-- Var13: integer (nullable = true)
 |-- Var21: integer (nullable = true)
 |-- Var22: integer (nullable = true)
 |-- Var24: integer (nullable = true)
 |-- Var25: integer (nullable = true)
 |-- Var28: double (nullable = true)
 |-- Var35: integer (nullable = true)
 |-- Var38: integer (nullable = true)
 |-- Var44: integer (nullable = true)
 |-- Var57: double (nullable = true)
 |-- Var65: integer (nullable = true)
 |-- Var72: integer (nullable = true)
 |-- Var73: integer (nullable = true)
 |-- Var74: integer (nullable = true)
 |-- Var76: integer (nullable = true)
 |-- Var78: integer (nullable = true)
 |-- Var81: double (nullable = true)
 |-- Var83: integer (nullable = true)
 |-- Var85: integer (nullable = true)
 |-- Var94: integer (nullable = true)
 |-- Var109: integer (nullable = true)
 |-- Var112: integer (nullable = true)
 |-- Var113: double (nullable = true)
 |-- Var119: integer (nullable = true)
 |-- Var1

## selecting the required columns(features)

In [18]:
# Feature columns = every non-string column except the label itself, its one-hot vector,
# and the intermediate "*_index" columns (which includes 'Label_index').
features_list = [
    name
    for name, dtype in df_r.dtypes
    if dtype != 'string'
    and name not in ('Label', 'Label_indexvec')
    and not name.endswith("_index")
]

## list of feature columns names

In [19]:
features_list

['Var6',
 'Var7',
 'Var13',
 'Var21',
 'Var22',
 'Var24',
 'Var25',
 'Var28',
 'Var35',
 'Var38',
 'Var44',
 'Var57',
 'Var65',
 'Var72',
 'Var73',
 'Var74',
 'Var76',
 'Var78',
 'Var81',
 'Var83',
 'Var85',
 'Var94',
 'Var109',
 'Var112',
 'Var113',
 'Var119',
 'Var123',
 'Var125',
 'Var126',
 'Var132',
 'Var133',
 'Var134',
 'Var140',
 'Var143',
 'Var144',
 'Var149',
 'Var153',
 'Var160',
 'Var163',
 'Var173',
 'Var181',
 'Var189',
 'Var192_indexvec',
 'Var193_indexvec',
 'Var195_indexvec',
 'Var196_indexvec',
 'Var197_indexvec',
 'Var198_indexvec',
 'Var199_indexvec',
 'Var200_indexvec',
 'Var202_indexvec',
 'Var203_indexvec',
 'Var204_indexvec',
 'Var205_indexvec',
 'Var206_indexvec',
 'Var207_indexvec',
 'Var208_indexvec',
 'Var210_indexvec',
 'Var211_indexvec',
 'Var212_indexvec',
 'Var214_indexvec',
 'Var216_indexvec',
 'Var217_indexvec',
 'Var218_indexvec',
 'Var219_indexvec',
 'Var220_indexvec',
 'Var221_indexvec',
 'Var222_indexvec',
 'Var223_indexvec',
 'Var225_indexvec',
 '

## using vector assembler to zip the features columns into one column

In [20]:
assembler = VectorAssembler(inputCols=features_list,outputCol="features")

In [21]:
output = assembler.transform(df_r)

## creating a new dataframe with feature column and label column

In [22]:
final_data = output.select('Label_index','features')

## schema of the final dataframe

In [23]:
final_data.printSchema()

root
 |-- Label_index: double (nullable = true)
 |-- features: vector (nullable = true)



## applying DecisionTreeClassifier

In [24]:
dt = DecisionTreeClassifier(labelCol='Label_index',featuresCol='features')

## splitting 70% of data for training and 30% into testing

In [25]:
# 70/30 train/test split. The fixed seed makes the split, and therefore the metrics
# computed from it, repeatable across runs.
training_data,test_data = final_data.randomSplit([0.7,0.3], seed=42)

## schema of training_data

In [26]:
training_data.printSchema()

root
 |-- Label_index: double (nullable = true)
 |-- features: vector (nullable = true)



## schema of test_data

In [27]:
test_data.printSchema()

root
 |-- Label_index: double (nullable = true)
 |-- features: vector (nullable = true)



## first row of training_data

In [28]:
training_data.head()

Row(Label_index=0.0, features=SparseVector(71505, {0: 1176.0, 1: 7.0, 2: 1744.0, 3: 304.0, 4: 380.0, 5: 10.0, 6: 176.0, 7: 275.12, 8: 5.0, 9: 5681844.0, 10: 9.0, 11: 6.3074, 12: 18.0, 13: 3.0, 14: 40.0, 15: 84.0, 16: 2158176.0, 17: 9.0, 18: 146413.5, 19: 30.0, 20: 6.0, 21: 113649.0, 22: 104.0, 23: 96.0, 24: -609916.0, 25: 1275.0, 26: 84.0, 27: 21978.0, 28: 14.0, 29: 40.0, 30: 1699070.0, 31: 247190.0, 32: 1805.0, 34: 27.0, 35: 308266.0, 36: 8058320.0, 37: 38.0, 38: 42054.0, 41: 270.0, 138: 1.0, 402: 1.0, 452: 1.0, 474: 1.0, 506: 1.0, 1530: 1.0, 5082: 1.0, 10063: 1.0, 27500: 1.0, 31189: 1.0, 31227: 1.0, 31292: 1.0, 31295: 1.0, 31314: 1.0, 31327: 1.0, 31328: 1.0, 31333: 1.0, 31334: 1.0, 31414: 1.0, 46828: 1.0, 59895: 1.0, 62832: 1.0, 62833: 1.0, 63704: 1.0, 67144: 1.0, 67992: 1.0, 71440: 1.0, 71443: 1.0, 71455: 1.0, 71467: 1.0, 71473: 1.0, 71502: 1.0}))

## first row of the test_data

In [29]:
test_data.head()

Row(Label_index=0.0, features=SparseVector(71505, {0: 1505.0, 1: 7.0, 2: 2148.0, 3: 156.0, 4: 195.0, 5: 2.0, 6: 80.0, 7: 253.52, 8: 5.0, 9: 2448.0, 10: 9.0, 11: 4.9959, 12: 18.0, 13: 3.0, 14: 130.0, 15: 224.0, 16: 1170240.0, 17: 3.0, 18: 13214.58, 19: 5.0, 20: 6.0, 21: 12159.0, 22: 48.0, 23: 48.0, 24: -93952.8, 25: 510.0, 26: 6.0, 27: 24273.0, 28: 4.0, 29: 24.0, 30: 2882780.0, 31: 322632.0, 32: 470.0, 34: 18.0, 35: 188272.0, 36: 3712616.0, 37: 36.0, 38: 246174.0, 41: 270.0, 71: 1.0, 403: 1.0, 453: 1.0, 474: 1.0, 602: 1.0, 775: 1.0, 4998: 1.0, 19499: 1.0, 26220: 1.0, 31189: 1.0, 31272: 1.0, 31292: 1.0, 31302: 1.0, 31314: 1.0, 31327: 1.0, 31328: 1.0, 31333: 1.0, 31335: 1.0, 43499: 1.0, 46831: 1.0, 55579: 1.0, 62833: 1.0, 62929: 1.0, 67144: 1.0, 67224: 1.0, 71441: 1.0, 71443: 1.0, 71465: 1.0, 71467: 1.0, 71474: 1.0, 71502: 1.0}))

## top 20 rows of the features column

In [30]:
final_data.select(final_data.columns[1]).show()

+--------------------+
|            features|
+--------------------+
|(71505,[0,1,2,3,4...|
|(71505,[0,3,4,5,6...|
|(71505,[0,1,2,3,4...|
|(71505,[0,3,5,7,1...|
|(71505,[0,1,2,3,4...|
|(71505,[0,1,2,3,4...|
|(71505,[0,1,2,3,4...|
|(71505,[0,3,4,5,6...|
|(71505,[0,1,2,3,4...|
|(71505,[0,1,2,3,4...|
|(71505,[0,1,2,3,4...|
|(71505,[0,1,2,3,4...|
|(71505,[0,2,5,7,1...|
|(71505,[0,3,4,5,6...|
|(71505,[0,1,2,3,4...|
|(71505,[0,2,3,4,7...|
|(71505,[0,1,2,3,4...|
|(71505,[0,1,2,3,4...|
|(71505,[0,1,2,3,4...|
|(71505,[0,1,3,4,5...|
+--------------------+
only showing top 20 rows



## building model

In [31]:
model = dt.fit(training_data)

## predictions on the test_data

In [32]:
predictions = model.transform(test_data)

## count of each class label

In [33]:
output.groupBy('Label_index').count().show()

+-----------+-----+
|Label_index|count|
+-----------+-----+
|        0.0|46328|
|        1.0| 3672|
+-----------+-----+



## Evaluation Metrics

In [34]:
from pyspark.mllib.evaluation import MulticlassMetrics


## schema of the predictions

In [35]:
predictions.printSchema()

root
 |-- Label_index: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [36]:
# Keep only the predicted and the true class, in (prediction, label) column order;
# this two-column frame is what the metrics and confusion-matrix cells read.
pl = predictions.select(['prediction','Label_index'])

## using MulticlassMetrics for evaluation

In [37]:
metrics = MulticlassMetrics(pl.rdd)

## True positive rate (recall for class 1)

In [38]:
metrics.recall(1)

0.00545950864422202

## Accuracy (fraction of correctly classified test rows)

In [39]:
metrics.accuracy

0.9265914100594482

## confusion matrix evaluation on test data

In [40]:
# Confusion-matrix cells for the baseline model; class 1.0 is the positive class.
# Each line runs its own count over the (prediction, label) frame.
tp = pl.where((pl['Label_index'] == 1.0) & (pl['prediction'] == 1.0)).count()
tn = pl.where((pl['Label_index'] == 0.0) & (pl['prediction'] == 0.0)).count()
fp = pl.where((pl['Label_index'] == 0.0) & (pl['prediction'] == 1.0)).count()
fn = pl.where((pl['Label_index'] == 1.0) & (pl['prediction'] == 0.0)).count()

In [41]:
# Report the four confusion-matrix counts, one per line.
for caption, value in (
    ("true positive", tp),
    ("true negatives", tn),
    ("false postives", fp),
    ("false negatives", fn),
):
    print(caption, value)

true positive 6
true negatives 13866
false postives 6
false negatives 1093


## undersampling to overcome class imbalance

## sampling rows from the majority class (roughly as many rows as the minority class has)

In [42]:
# Down-sample every class to roughly the size of the smallest one.
# The per-class fractions are derived from the data instead of hard-coded row counts, and
# the fixed seed makes the sample repeatable.
label_counts = dict(output.groupBy('Label_index').count().collect())
# sorted(...)[0] rather than min(): the wildcard import of pyspark.sql.functions elsewhere in
# this notebook rebinds `min` to a column function once that cell has run.
smallest_class = sorted(label_counts.values())[0]
under_sample = output.sampleBy(
    'Label_index',
    fractions={label: smallest_class / float(rows) for label, rows in label_counts.items()},
    seed=42,
)

## total number of rows after undersampling

In [43]:
under_sample.count()

7268

## number of rows for each class label

In [44]:
under_sample.groupBy('Label_index').count().show()

+-----------+-----+
|Label_index|count|
+-----------+-----+
|        0.0| 3596|
|        1.0| 3672|
+-----------+-----+



## vector assembler to zip required columns into one column

In [45]:
us_assembler = VectorAssembler(inputCols=features_list,outputCol="us_features")

In [46]:
# under_sample was drawn from `output`, which already carries the assembled 'features'
# column; the same input columns are assembled again here under the name 'us_features'.
us_output = us_assembler.transform(under_sample)

## creating a new dataframe with feature column and label column

In [47]:
us_final_data = us_output.select('Label_index','us_features')

## applying DecisionTreeClassifier

In [48]:
us_dt = DecisionTreeClassifier(labelCol='Label_index',featuresCol='us_features')

## splitting 70% of data for training and 30% into testing

In [49]:
# 70/30 train/test split of the under-sampled data; seeded so the split is repeatable.
us_training_data,us_test_data = us_final_data.randomSplit([0.7,0.3], seed=42)

## building model

In [50]:
us_model = us_dt.fit(us_training_data)

## predictions on the test_data

In [51]:
us_predictions = us_model.transform(us_test_data)

In [52]:
us_pl = us_predictions.select(['prediction','Label_index'])

## using MulticlassMetrics for evaluation

In [53]:
us_metrics = MulticlassMetrics(us_pl.rdd)

## True positive rate (recall for class 1) for undersampling

In [54]:
us_metrics.recall(1)

0.7027522935779816

## confusion matrix evaluation on test data

In [55]:
# Confusion-matrix cells for the under-sampled model; class 1.0 is the positive class.
# tp/tn/fp/fn are rebound here and replace the values from the earlier evaluation.
tp = us_pl.where((us_pl['Label_index'] == 1.0) & (us_pl['prediction'] == 1.0)).count()
tn = us_pl.where((us_pl['Label_index'] == 0.0) & (us_pl['prediction'] == 0.0)).count()
fp = us_pl.where((us_pl['Label_index'] == 0.0) & (us_pl['prediction'] == 1.0)).count()
fn = us_pl.where((us_pl['Label_index'] == 1.0) & (us_pl['prediction'] == 0.0)).count()

In [56]:
# Report the four confusion-matrix counts, one per line.
for caption, value in (
    ("true positive", tp),
    ("true negatives", tn),
    ("false postives", fp),
    ("false negatives", fn),
):
    print(caption, value)

true positive 766
true negatives 580
false postives 484
false negatives 324


## Accuracy (fraction of correctly classified test rows) for undersampling

In [57]:
us_metrics.accuracy

0.6248839368616528

## oversampling to overcome class imbalance

## selecting the minority class

In [58]:
# NOTE(review): nothing in this cell needs this wildcard import, and it rebinds Python
# builtins such as `sum`, `min`, `max` and `round` to Spark column functions for every cell
# executed afterwards — consider `from pyspark.sql import functions as F` instead.
from pyspark.sql.functions import *
# Rows of the minority class (Label_index == 1.0); select("*") keeps every column.
minoriy_class = df_r.filter(df_r.Label_index == 1.0).select("*")

In [59]:
majority_rows = df_r.filter(df_r.Label_index == 0.0).count()

In [60]:
minority_rows = df_r.filter(df_r.Label_index == 1.0).count()

In [61]:
# Number of extra copies of the minority class to append so that it reaches at least the
# majority-class size: ceil(majority / minority) - 1.
# Written as integer ceiling division (-(-a // b)) so the result is an int and does not
# depend on whether `/` truncates (Python 2) or not (Python 3).
replication = -(-majority_rows // minority_rows) - 1

## replicating the minority class so that its size roughly matches the majority class

In [62]:
# Start from the full dataset and append one more copy of the minority rows per iteration,
# so the minority class ends up (number of iterations + 1) times its original size.
over_sample = df_r
i = 0
while(i<replication):
    over_sample = over_sample.union(minoriy_class)
    i+=1

## number of rows in each class after oversampling

In [63]:
over_sample.groupBy('Label_index').count().show()

+-----------+-----+
|Label_index|count|
+-----------+-----+
|        0.0|46328|
|        1.0|47736|
+-----------+-----+



## using vector assembler to zip required columns into one column

In [64]:
os_assembler = VectorAssembler(inputCols=features_list,outputCol="os_features")

In [65]:
os_output = os_assembler.transform(over_sample)

## selecting feature column and label column

In [66]:
os_final_data = os_output.select('Label_index','os_features')

## applying decision tree classifier

In [67]:
os_dt = DecisionTreeClassifier(labelCol='Label_index',featuresCol='os_features')

## splitting 70% of data for training and 30% into testing

In [68]:
# 70/30 train/test split of the over-sampled data; seeded so the split is repeatable.
# NOTE(review): the minority rows were replicated before this split, so copies of the same
# row can land in both the training and the test set — the test metrics are likely
# optimistic. Splitting first and replicating only the training part would avoid that.
os_training_data,os_test_data = os_final_data.randomSplit([0.7,0.3], seed=42)

## building model

In [69]:
os_model = os_dt.fit(os_training_data)

## predictions on the test_data

In [70]:
os_predictions = os_model.transform(os_test_data)

In [71]:
os_pl = os_predictions.select(['prediction','Label_index'])

## using MulticlassMetrics for evaluation

In [72]:
os_metrics = MulticlassMetrics(os_pl.rdd)

## True positive rate for oversampling

In [73]:
os_metrics.recall(1)

0.7287992693037307

## confusion matrix evaluation on test data

In [74]:
# Confusion-matrix cells for the over-sampled model; class 1.0 is the positive class.
# tp/tn/fp/fn are rebound here and replace the values from the earlier evaluations.
tp = os_pl.where((os_pl['Label_index'] == 1.0) & (os_pl['prediction'] == 1.0)).count()
tn = os_pl.where((os_pl['Label_index'] == 0.0) & (os_pl['prediction'] == 0.0)).count()
fp = os_pl.where((os_pl['Label_index'] == 0.0) & (os_pl['prediction'] == 1.0)).count()
fn = os_pl.where((os_pl['Label_index'] == 1.0) & (os_pl['prediction'] == 0.0)).count()

In [75]:
# Report the four confusion-matrix counts, one per line.
for caption, value in (
    ("true positive", tp),
    ("true negatives", tn),
    ("false postives", fp),
    ("false negatives", fn),
):
    print(caption, value)

true positive 10373
true negatives 7974
false postives 5870
false negatives 3860


## Accuracy (fraction of correctly classified test rows) for oversampling

In [76]:
os_metrics.accuracy

0.6534530042383445