## Project 2: Exploratory Data Analysis (EDA) and Classification  
### Data Source: Wisconsin Breast Cancer Database  
### Total Possible Points: 10

### PART I: (EDA) Run the cells below and answer questions about the data  
Source dataset from UC Irvice Machine Learning Repo URL and save to project

In [102]:
import os
import os.path
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [103]:
url_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'

In [104]:
df = pd.read_csv(url_data)

In [101]:
df.head()

Unnamed: 0,842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [6]:
df.shape

(568, 32)

In [105]:
outfile = 'data/wisc_breast_cancer.csv'

Save dataset to csv file

In [106]:
df.to_csv(outfile, index=False)

I then added a header row to the file, saved, and now I will read in the complete file with header

### Load data into Pyspark dataframe and do analysis

In [107]:
from pyspark.sql.functions import col # for filtering on columns
sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

In [108]:
infile_w_fields = 'data/wisc_breast_cancer_w_fields.csv'

In [109]:
brca_w_fields = pd.read_csv(infile_w_fields)

In [11]:
brca_w_fields.head()

Unnamed: 0,id,diagnosis,f1,f2,f3,f4,f5,f6,f7,f8,...,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Convert to Spark DataFrame

In [110]:
brca_df = sqlCtx.createDataFrame(brca_w_fields)

Show some rows

In [10]:
brca_df.show(2)

+------+---------+-----+-----+-----+------+-------+-------+------+-------------------+------+--------------------+------+------+-----+-----+--------+--------------------+-------------------+-------+--------------------+--------+-----+-----+-----+------+------+------+------+-------------------+------+-------+
|    id|diagnosis|   f1|   f2|   f3|    f4|     f5|     f6|    f7|                 f8|    f9|                 f10|   f11|   f12|  f13|  f14|     f15|                 f16|                f17|    f18|                 f19|     f20|  f21|  f22|  f23|   f24|   f25|   f26|   f27|                f28|   f29|    f30|
+------+---------+-----+-----+-----+------+-------+-------+------+-------------------+------+--------------------+------+------+-----+-----+--------+--------------------+-------------------+-------+--------------------+--------+-----+-----+-----+------+------+------+------+-------------------+------+-------+
|842302|        M|17.99|10.38|122.8|1001.0| 0.1184| 0.2776|0.3001|    

There are several columns, and it's a litle hard to read them all together. Select a few important fields.

In [110]:
brca_df.select(['id','diagnosis','f1']).show(5)

+--------+---------+-----+
|      id|diagnosis|   f1|
+--------+---------+-----+
|  842302|        M|17.99|
|  842517|        M|20.57|
|84300903|        M|19.69|
|84348301|        M|11.42|
|84358402|        M|20.29|
+--------+---------+-----+
only showing top 5 rows



In [111]:
brca_df.count()

569

In [112]:
brca_df.columns

['id',
 'diagnosis',
 'f1',
 'f2',
 'f3',
 'f4',
 'f5',
 'f6',
 'f7',
 'f8',
 'f9',
 'f10',
 'f11',
 'f12',
 'f13',
 'f14',
 'f15',
 'f16',
 'f17',
 'f18',
 'f19',
 'f20',
 'f21',
 'f22',
 'f23',
 'f24',
 'f25',
 'f26',
 'f27',
 'f28',
 'f29',
 'f30']

In [17]:
# On diagnosis, compute frequency distribution
brca_df.groupBy("diagnosis").count().show()

+---------+-----+
|diagnosis|count|
+---------+-----+
|        B|  357|
|        M|  212|
+---------+-----+



In [113]:
brca_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- f1: double (nullable = true)
 |-- f2: double (nullable = true)
 |-- f3: double (nullable = true)
 |-- f4: double (nullable = true)
 |-- f5: double (nullable = true)
 |-- f6: double (nullable = true)
 |-- f7: double (nullable = true)
 |-- f8: double (nullable = true)
 |-- f9: double (nullable = true)
 |-- f10: double (nullable = true)
 |-- f11: double (nullable = true)
 |-- f12: double (nullable = true)
 |-- f13: double (nullable = true)
 |-- f14: double (nullable = true)
 |-- f15: double (nullable = true)
 |-- f16: double (nullable = true)
 |-- f17: double (nullable = true)
 |-- f18: double (nullable = true)
 |-- f19: double (nullable = true)
 |-- f20: double (nullable = true)
 |-- f21: double (nullable = true)
 |-- f22: double (nullable = true)
 |-- f23: double (nullable = true)
 |-- f24: double (nullable = true)
 |-- f25: double (nullable = true)
 |-- f26: double (nullable = true)
 |-- f27: double (nul

In [114]:
brca_df.select(['diagnosis','f1']).show(3)

+---------+-----+
|diagnosis|   f1|
+---------+-----+
|        M|17.99|
|        M|20.57|
|        M|19.69|
+---------+-----+
only showing top 3 rows



Minimum of field 'f1'

In [115]:
brca_df.agg({"f1": "min"}).collect()[0][0]

6.981

**QUESTIONS FOR PART 1 (EACH WORTH 1 POINT FOR 5 TOTAL POSSIBLE PTS)**

1a. Select and show first 10 rows of data from fields ‘diagnosis,’f1’

In [11]:
brca_df.select(['diagnosis','f1']).show(10)

+---------+-----+
|diagnosis|   f1|
+---------+-----+
|        M|17.99|
|        M|20.57|
|        M|19.69|
|        M|11.42|
|        M|20.29|
|        M|12.45|
|        M|18.25|
|        M|13.71|
|        M| 13.0|
|        M|12.46|
+---------+-----+
only showing top 10 rows



1b. Create dataframe df2 which contains only records where f1 > 20  
    Print the number of records in df2

In [12]:
df2=brca_df.where(brca_df.f1>20).count()
print(df2)


45


All remaining parts to Question 1 are based on the *brca_df* dataframe

1c. Compute max on column f1

In [13]:
brca_df.agg({"f1": "max"}).collect()[0][0]

28.11

1d. Show a summary on the fields: diagnosis, f1, f2  
hint: use describe()

In [14]:
brca_df.describe('diagnosis','f1','f2').show()

+-------+---------+------------------+-----------------+
|summary|diagnosis|                f1|               f2|
+-------+---------+------------------+-----------------+
|  count|      569|               569|              569|
|   mean|     null|14.127291739894552|19.28964850615114|
| stddev|     null| 3.524048826212077| 4.30103576816695|
|    min|        B|             6.981|             9.71|
|    max|        M|             28.11|            39.28|
+-------+---------+------------------+-----------------+



1e. Create a new field called f1sq which is the square of f1, and print the first 10 rows of the dataframe

In [50]:
df3=brca_df.withColumn('flsq',brca_df.f1**2).show(10)

+--------+---------+-----+-----+-----+------+-------------------+-------+-------------------+-------------------+------+--------------------+------+------------------+------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------+-----+-----+-----+------+------+------+------+-------------------+------+-------------------+------------------+
|      id|diagnosis|   f1|   f2|   f3|    f4|                 f5|     f6|                 f7|                 f8|    f9|                 f10|   f11|               f12|               f13|  f14|                 f15|                 f16|                 f17|                 f18|                 f19|     f20|  f21|  f22|  f23|   f24|   f25|   f26|   f27|                f28|   f29|                f30|              flsq|
+--------+---------+-----+-----+-----+------+-------------------+-------+-------------------+-------------------+------+--------------------+------+--------------

### PART II: (Classification Task) Run the cells below, fill in the missing steps, and answer questions about the data  

INSTRUCTIONS AND QUESTIONS COMING SOON

In [112]:
# load modules
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.feature import VectorAssembler 
from pyspark.mllib.linalg import Vectors

In [113]:
brca_df.select(['diagnosis','f1','f2','f3']).show(2)

+---------+-----+-----+-----+
|diagnosis|   f1|   f2|   f3|
+---------+-----+-----+-----+
|        M|17.99|10.38|122.8|
|        M|20.57|17.77|132.9|
+---------+-----+-----+-----+
only showing top 2 rows



In [114]:
# Let's package some fields together to be used as features
assembler = VectorAssembler( inputCols=["f1", "f2", "f3"], outputCol="features") 

Applying transform() will add the *features* column to the dataframe

In [115]:
transformed = assembler.transform(brca_df)

In [116]:
transformed.select(["diagnosis", "features"]).show(2)

+---------+-------------------+
|diagnosis|           features|
+---------+-------------------+
|        M|[17.99,10.38,122.8]|
|        M|[20.57,17.77,132.9]|
+---------+-------------------+
only showing top 2 rows



In [20]:
# convert to RDD
dataRdd = transformed.select(["diagnosis","features"]).rdd.map(tuple)

In [117]:
# Look at some data
dataRdd.take(2)

[(u'M', DenseVector([17.99, 10.38, 122.8])),
 (u'M', DenseVector([20.57, 17.77, 132.9]))]

#### Get data into proper format for modeling:  
Map label to binary values, then convert to LabeledPoint  

LabeledPoint uses value=0 for negative labels, value=1 for positive labels

In [118]:
lp = dataRdd.map(lambda row : (1 if row[0]=='M' else 0, Vectors.dense(row[1])))    \
            .map(lambda row : LabeledPoint(row[0], row[1]))

In [119]:
lp.take(2)

[LabeledPoint(1.0, [17.99,10.38,122.8]),
 LabeledPoint(1.0, [20.57,17.77,132.9])]

**Split data approximately into training (60%) and test (40%)**

In [120]:
#training=lp.randomSplit([0.6,0.4],seed=314)[0]
#test=lp.randomSplit([0.6,0.4],seed=314)[1]
#training.cache()
training,test=lp.randomSplit([0.6,0.4],seed=314)

In [121]:
(1.0 * training.count()/lp.count(), 1.0 * test.count()/lp.count(), 1.0 * lp.count()/lp.count())

(0.6344463971880492, 0.3655536028119508, 1.0)

**Build the Logistic Regression Model**

In [132]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
model = LogisticRegressionWithLBFGS.train(training)

**For the test set, use map() to compute predictions by calling predict() on the model**

In [133]:
labelsAndPreds=test.map(lambda p: (p.label, model.predict(p.features)))

**Compute the accuracy**

In [134]:
accuracy = 1.0 * labelsAndPreds.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
print('model accuracy {}'.format(accuracy))


model accuracy 0.668269230769


In [135]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))
metrics = MulticlassMetrics(predictionAndLabels)
metrics.confusionMatrix()

DenseMatrix(2, 2, [97.0, 28.0, 41.0, 42.0], 0)

**QUESTIONS FOR PART 2  
POINT VALUES: 1 point for script, 1 point for each part 1-4. TOTAL POSSIBLE PTS = 5**

You will make modifications to the code above in PART II and rerun the code, collecting results.

At first, make the changes in this notebook.
Next you will write script to run the whole workflow.
Test it to be sure it works, and submit as part of the assignment. You will want the script to take arguments for intercept (1=yes, 0=no, for example)

The following experiments should be conducted:

We used three features in our model. Build the model using features f1, f2, f3, f4, f5.
Then compute the accuracy and confusion matrix on the test set.
Repeat step (1), including an intercept
Repeat step (1), using 70%/30% train/test split with seed=314.
Repeat step (2), using 70%/30% train/test split with seed=314.

**Please clearly summarize all results at the end of the notebook.  
You can modify the code below to start organizing the results into a pandas dataframe**

1. First Method

In [125]:
#brca_df.select(['diagnosis','f1','f2','f3','f4','f5']).show(2)
assembler1 = VectorAssembler( inputCols=["f1", "f2", "f3",'f4','f5'], outputCol="features")
transformed1 = assembler1.transform(brca_df)
#transformed.select(["diagnosis", "features"]).show(2)
# convert to RDD
dataRdd1 = transformed1.select(["diagnosis","features"]).rdd.map(tuple)
# Look at some data
#dataRdd1.take(2)
lp1 = dataRdd1.map(lambda row : (1 if row[0]=='M' else 0, Vectors.dense(row[1])))    \
            .map(lambda row : LabeledPoint(row[0], row[1]))
#lp1.take(2)
training1,test1=lp1.randomSplit([0.6,0.4],seed=314)
(1.0 * training1.count()/lp1.count(), 1.0 * test1.count()/lp1.count(), 1.0 * lp1.count()/lp1.count())


(0.6344463971880492, 0.3655536028119508, 1.0)

In [126]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
model1 = LogisticRegressionWithLBFGS.train(training1)

In [127]:
labelsAndPreds1=test1.map(lambda p: (p.label, model1.predict(p.features)))
accuracy1 = 1.0 * labelsAndPreds1.filter(lambda pl: pl[0] == pl[1]).count() / test1.count()
print('model accuracy {}'.format(accuracy1))

model accuracy 0.903846153846


In [101]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels1 = test1.map(lambda lp: (float(model1.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics1 = MulticlassMetrics(predictionAndLabels1)
metrics1.confusionMatrix()

DenseMatrix(2, 2, [135.0, 17.0, 3.0, 53.0], 0)

2. Second Method(with intercept)

In [138]:
#brca_df.select(['diagnosis','f1','f2','f3','f4','f5']).show(2)
assembler2 = VectorAssembler( inputCols=["f1", "f2", "f3",'f4','f5'], outputCol="features")
transformed2 = assembler2.transform(brca_df)
#transformed2.select(["diagnosis", "features"]).show(2)
# convert to RDD
dataRdd2 = transformed2.select(["diagnosis","features"]).rdd.map(tuple)
# Look at some data
#dataRdd2.take(2)
lp2 = dataRdd2.map(lambda row : (1 if row[0]=='M' else 0, Vectors.dense(row[1])))    \
            .map(lambda row : LabeledPoint(row[0], row[1]))
#lp2.take(2)
training2,test2=lp2.randomSplit([0.6,0.4],seed=314)
(1.0 * training2.count()/lp2.count(), 1.0 * test2.count()/lp2.count(), 1.0 * lp2.count()/lp2.count())

(0.6344463971880492, 0.3655536028119508, 1.0)

In [140]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
model2 = LogisticRegressionWithLBFGS.train(training2,intercept=True)

In [141]:
labelsAndPreds2=test2.map(lambda p: (p.label, model2.predict(p.features)))
accuracy2 = 1.0 * labelsAndPreds2.filter(lambda pl: pl[0] == pl[1]).count() / test2.count()
print('model accuracy {}'.format(accuracy2))

model accuracy 0.927884615385


In [142]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels2 = test2.map(lambda lp: (float(model2.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics2 = MulticlassMetrics(predictionAndLabels2)
metrics2.confusionMatrix()

DenseMatrix(2, 2, [137.0, 14.0, 1.0, 56.0], 0)

3. Third Method, change training and test to 70% and 30% with all features

In [176]:
#brca_df.select(['diagnosis','f1','f2','f3','f4','f5']).show(2)
assembler3 = VectorAssembler( inputCols=['f1','f2','f3','f4','f5'], outputCol="features")
transformed3 = assembler3.transform(brca_df)
#transformed3.select(["diagnosis", "features"]).show(2)
# convert to RDD
dataRdd3 = transformed3.select(["diagnosis","features"]).rdd.map(tuple)
# Look at some data
#dataRdd3.take(2)
lp3 = dataRdd3.map(lambda row : (1 if row[0]=='M' else 0, Vectors.dense(row[1])))    \
            .map(lambda row : LabeledPoint(row[0], row[1]))
#lp3.take(2)
training3,test3=lp3.randomSplit([0.7,0.3],seed=314)
(1.0 * training3.count()/lp3.count(), 1.0 * test3.count()/lp3.count(), 1.0 * lp3.count()/lp3.count())

(0.7117750439367311, 0.28822495606326887, 1.0)

In [177]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
model3 = LogisticRegressionWithLBFGS.train(training3, intercept=False)

In [178]:
labelsAndPreds3=test3.map(lambda p: (p.label, model3.predict(p.features)))
accuracy3 = 1.0 * labelsAndPreds3.filter(lambda pl: pl[0] == pl[1]).count() / test3.count()
print('model accuracy {}'.format(accuracy3))

model accuracy 0.896341463415


In [179]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels3 = test3.map(lambda lp: (float(model3.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics3 = MulticlassMetrics(predictionAndLabels3)
metrics3.confusionMatrix()

DenseMatrix(2, 2, [106.0, 15.0, 2.0, 41.0], 0)

4. Fourth Method, update Third Methed with intercept

In [180]:
#brca_df.select(['diagnosis','f1','f2','f3','f4','f5']).show(2)
assembler4 = VectorAssembler( inputCols=['f1','f2','f3','f4','f5'], outputCol="features")
transformed4 = assembler4.transform(brca_df)
#transformed4.select(["diagnosis", "features"]).show(2)
# convert to RDD
dataRdd4 = transformed4.select(["diagnosis","features"]).rdd.map(tuple)
# Look at some data
#dataRdd4.take(2)
lp4 = dataRdd4.map(lambda row : (1 if row[0]=='M' else 0, Vectors.dense(row[1])))    \
            .map(lambda row : LabeledPoint(row[0], row[1]))
#lp4.take(2)
training4,test4=lp4.randomSplit([0.7,0.3],seed=314)
(1.0 * training4.count()/lp4.count(), 1.0 * test4.count()/lp4.count(), 1.0 * lp4.count()/lp4.count())

(0.7117750439367311, 0.28822495606326887, 1.0)

In [181]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
model4 = LogisticRegressionWithLBFGS.train(training4, intercept=True)

In [182]:
labelsAndPreds4=test4.map(lambda p: (p.label, model4.predict(p.features)))
accuracy4 = 1.0 * labelsAndPreds4.filter(lambda pl: pl[0] == pl[1]).count() / test4.count()
print('model accuracy {}'.format(accuracy4))

model accuracy 0.926829268293


In [183]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels4 = test4.map(lambda lp: (float(model4.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics4 = MulticlassMetrics(predictionAndLabels4)
metrics4.confusionMatrix()

DenseMatrix(2, 2, [107.0, 11.0, 1.0, 45.0], 0)

In [62]:
import pandas as pd
out = pd.DataFrame(columns=['method','accuracy','truepos','trueneg','falsepos','falseneg'],index=[1,2,3,4,5])
out

Unnamed: 0,method,accuracy,truepos,trueneg,falsepos,falseneg
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
5,,,,,,


In [185]:
# Here is where you update the values with real data
out.iloc[0]['method']='original'; out.iloc[0]['accuracy']=0.658008658009; out.iloc[0]['truepos']=97;out.iloc[0]['trueneg']=42; out.iloc[0]['falsepos']=41;out.iloc[0]['falseneg']=28
out.iloc[1]['method']='first method'; out.iloc[1]['accuracy']=0.903846153846; out.iloc[1]['truepos']=135;out.iloc[1]['trueneg']=53; out.iloc[1]['falsepos']=3;out.iloc[1]['falseneg']=17
out.iloc[2]['method']='Second method'; out.iloc[2]['accuracy']=0.927884615385; out.iloc[2]['truepos']=137;out.iloc[2]['trueneg']=56; out.iloc[2]['falsepos']=1;out.iloc[2]['falseneg']=14
out.iloc[3]['method']='Third method'; out.iloc[3]['accuracy']=0.896341463415; out.iloc[3]['truepos']=106;out.iloc[3]['trueneg']=41; out.iloc[3]['falsepos']=2;out.iloc[3]['falseneg']=15
out.iloc[4]['method']='Fourth method'; out.iloc[4]['accuracy']=0.926829268293; out.iloc[4]['truepos']=107;out.iloc[4]['trueneg']=45; out.iloc[4]['falsepos']=1;out.iloc[4]['falseneg']=11
out

Unnamed: 0,method,accuracy,truepos,trueneg,falsepos,falseneg
1,original,0.658009,97,42,41,28
2,first method,0.903846,135,53,3,17
3,Second method,0.927885,137,56,1,14
4,Third method,0.896341,106,41,2,15
5,Fourth method,0.926829,107,45,1,11
