In [1]:
import seaborn as sns

In [2]:
import tensorflow as tf

In [3]:
import pyspark

In [4]:
import random
NUM_SAMPLES = 100000
# `sc` is not created anywhere in this notebook; it only exists when the
# kernel pre-injects a SparkContext. getOrCreate() returns the already
# active context if there is one and otherwise starts one, so this cell
# no longer fails with NameError on a plain Python kernel.
sc = pyspark.SparkContext.getOrCreate()
def inside(p):
    """Run one Monte Carlo trial and report whether it was a hit.

    Draws a point (x, y) with both coordinates from random.random() and
    returns True when it falls strictly inside the unit circle.

    `p` is the RDD element supplied by `filter`; its value is ignored and
    it only serves to trigger one trial per element.
    """
    x, y = random.random(), random.random()
    return x*x + y*y < 1
# One trial per element; the hit fraction approximates pi / 4 (area of the
# quarter circle inside the unit square), hence the factor 4 below.
count = sc.parallelize(range(0, NUM_SAMPLES)).filter(inside).count()
pi = 4 * count / NUM_SAMPLES
print('Pi is roughly', pi)

Pi is roughly 3.1438


In [9]:
!spark-submit --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/
                        
Using Scala version 2.11.12, Java HotSpot(TM) 64-Bit Server VM, 1.8.0_221
Branch 
Compiled by user  on 2019-05-01T05:08:38Z
Revision 
Url 
Type --help for more information.


In [None]:
# `import urllib` alone does not load the `request` submodule, so
# `urllib.request` may raise AttributeError on a fresh kernel. Importing the
# submodule explicitly also binds the top-level name `urllib`.
import urllib.request
# Download the archive into the current working directory.
# urlretrieve returns a (local_filename, headers) tuple, kept in `f`.
f = urllib.request.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")

In [6]:
ls

kddcup.data_10_percent.gz       nb1-rdd-creation20190826.ipynb


### 1. Creating an RDD from a file
The most common way of creating an RDD is to load it from a file. Notice that Spark's textFile can handle compressed files directly.

In [7]:
# Relative path of the gzip-compressed data file in the working directory.
data_file = "./kddcup.data_10_percent.gz"
# Build an RDD whose elements are the text lines of that file.
raw_data = sc.textFile(name=data_file)

**Now we have our data file loaded into the raw_data RDD.**

Without getting into Spark transformations and actions, the most basic thing we can do to check that we got our RDD contents right is to count() the number of lines loaded from the file into the RDD.

In [8]:
# Display the RDD object's repr (source path and RDD class), not its contents.
raw_data

./kddcup.data_10_percent.gz MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:0

In [10]:
# Count the lines that were loaded from the file into the RDD.
raw_data.count()

494021

In [11]:
# Peek at the first five elements; each element is one comma-separated text line.
raw_data.take(5)

['0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,39,39,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,49,49,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.']

#### 1.1 Check the data type

In [13]:
# Inspect the Python class of the object returned by sc.textFile.
type(raw_data)

pyspark.rdd.RDD

In [37]:
import pandas as pd
# Wrap the first five raw lines in a DataFrame for display; every line stays
# an unsplit string, so the frame has a single column.
df = pd.DataFrame(data=raw_data.take(5))
df

Unnamed: 0,0
0,"0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0..."
1,"0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,..."
2,"0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0..."
3,"0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0..."
4,"0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0..."


### 2. Creating an RDD using parallelize
Another way of creating an RDD is to parallelize an already existing list.
#### 2.1 Parallelizing a lazy `range` object yields a `PipelinedRDD`

In [17]:
# Hand parallelize a lazy range object rather than a materialised list.
a = range(0, 1000)
data = sc.parallelize(a)

In [19]:
# Check which RDD class parallelize produced for the range input.
type(data)

pyspark.rdd.PipelinedRDD

#### 2.2 Parallelizing a `list` yields a plain `RDD` instead of a `PipelinedRDD`
* check data type

In [20]:
# Materialise the range into a concrete list before parallelizing it.
al = [*range(1000)]
data = sc.parallelize(al)

In [21]:
# Check which RDD class parallelize produced for the list input.
type(data)

pyspark.rdd.RDD

In [23]:
# Number of elements in the RDD built from the list.
data.count()

1000

In [24]:
# Fetch the first five elements as a Python list.
data.take(5)

[0, 1, 2, 3, 4]

In [32]:
# Fetch only the first element.
data.first()

0

In [34]:
# Count the elements again; this call is identical to the earlier data.count() cell.
data.count()

1000