### 1. 데이터 RDD 생성

In [1]:
from pyspark import SparkConf, SparkContext
import os

# Build the configuration for a local-master Spark application named 'mnms'.
conf = SparkConf().setMaster('local').setAppName('mnms')
# getOrCreate is a classmethod: call it on the class so that an already
# running context is reused. Instantiating SparkContext(conf=conf) first would
# raise if this cell is re-run while a context is still active.
spark = SparkContext.getOrCreate(conf=conf)

24/12/03 17:13:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


#### 1-1 경로 생성

In [2]:
# Locate the dataset at <current working directory>/data/mnm_dataset.csv.
file_name = 'mnm_dataset.csv'
directory = os.path.join(os.getcwd(), 'data')
# Swap any backslashes for forward slashes so the path can be appended to a
# 'file:///' prefix later on.
file_path = os.path.join(directory, file_name).replace('\\', '/')
file_path  # notebook display

'/home/lab17/git/src/data/mnm_dataset.csv'

#### 1-2 파일 불러오기

In [3]:
# Read the CSV as an RDD of raw text lines from the local filesystem.
# NOTE: file_path is already absolute on POSIX, so the resulting URI has four
# slashes (see the output below); Spark accepted it in the recorded run.
lines = spark.textFile(f'file:///{file_path}')
lines

file:////home/lab17/git/src/data/mnm_dataset.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [4]:
# Peek at the first five raw lines (the header row comes first).
lines.take(5)

                                                                                

['State,Color,Count', 'TX,Red,20', 'NV,Blue,66', 'CO,Blue,79', 'OR,Blue,71']

#### 1-3 헤더와 행 구분하기

In [7]:
# The first line of the file is the CSV header.
headers = lines.first()
headers

'State,Color,Count'

In [8]:
# Keep every line that is not identical to the header line.
rows = lines.filter(lambda line: line != headers)

# Sanity check
rows.take(5)

['TX,Red,20', 'NV,Blue,66', 'CO,Blue,79', 'OR,Blue,71', 'WA,Yellow,93']

### 2. collect()

#### 2-1 parse 함수 정의

In [9]:
def parse(row):
    """Split one 'State,Color,Count' CSV line into a (state, color, count) tuple.

    The count field is converted to int; state and color stay as strings.
    """
    fields = row.split(',')
    # Return a tuple rather than a list: the later countByValue() call does
    # not work on list records.
    return (fields[0], fields[1], int(fields[2]))

#### 2-2 함수에 RDD 객체 하나씩 map해서 넣기

In [10]:
# Apply parse to every data row, giving an RDD of (state, color, count) tuples.
table = rows.map(parse)
table.take(5)

[('TX', 'Red', 20),
 ('NV', 'Blue', 66),
 ('CO', 'Blue', 79),
 ('OR', 'Blue', 71),
 ('WA', 'Yellow', 93)]

#### 2-3 parse 돌린 table collect()

In [12]:
# Bring every parsed record back to the driver as a Python list.
table.collect()

[('TX', 'Red', 20),
 ('NV', 'Blue', 66),
 ('CO', 'Blue', 79),
 ('OR', 'Blue', 71),
 ('WA', 'Yellow', 93),
 ('WY', 'Blue', 16),
 ('CA', 'Yellow', 53),
 ('WA', 'Green', 60),
 ('OR', 'Green', 71),
 ('TX', 'Green', 68),
 ('NV', 'Green', 59),
 ('AZ', 'Brown', 95),
 ('WA', 'Yellow', 20),
 ('AZ', 'Blue', 75),
 ('OR', 'Brown', 72),
 ('NV', 'Red', 98),
 ('WY', 'Orange', 45),
 ('CO', 'Blue', 52),
 ('TX', 'Brown', 94),
 ('CO', 'Red', 82),
 ('CO', 'Red', 12),
 ('CO', 'Red', 17),
 ('OR', 'Green', 16),
 ('AZ', 'Green', 46),
 ('NV', 'Red', 43),
 ('NM', 'Yellow', 15),
 ('WA', 'Red', 12),
 ('OR', 'Green', 13),
 ('CO', 'Blue', 95),
 ('WY', 'Red', 63),
 ('TX', 'Orange', 63),
 ('WY', 'Yellow', 48),
 ('OR', 'Green', 95),
 ('WA', 'Red', 75),
 ('CO', 'Orange', 93),
 ('NV', 'Orange', 10),
 ('WY', 'Green', 15),
 ('WA', 'Green', 99),
 ('CO', 'Blue', 98),
 ('CA', 'Green', 86),
 ('UT', 'Red', 92),
 ('AZ', 'Brown', 16),
 ('CA', 'Red', 100),
 ('UT', 'Red', 77),
 ('TX', 'Yellow', 29),
 ('WA', 'Orange', 73),
 ('WY', 

### 3. blue mnms만 꺼내기

#### 3-1 color 관련 컬럼 추출

In [13]:
# Split each data row on commas; all three fields remain strings here.
colors = rows.map(lambda line: line.split(','))
colors.take(1)

[['TX', 'Red', '20']]

#### 3-2 Blue 값만 뽑기

In [14]:
# Count the records whose colour field (index 1) is 'Blue'.
colors.filter(lambda fields: fields[1] == 'Blue').count()

16449

In [15]:
# Fetch every record whose colour field (index 1) is 'Blue' to the driver.
colors.filter(lambda fields: fields[1] == 'Blue').collect()

[['NV', 'Blue', '66'],
 ['CO', 'Blue', '79'],
 ['OR', 'Blue', '71'],
 ['WY', 'Blue', '16'],
 ['AZ', 'Blue', '75'],
 ['CO', 'Blue', '52'],
 ['CO', 'Blue', '95'],
 ['CO', 'Blue', '98'],
 ['CA', 'Blue', '13'],
 ['NV', 'Blue', '50'],
 ['TX', 'Blue', '49'],
 ['CA', 'Blue', '34'],
 ['UT', 'Blue', '97'],
 ['AZ', 'Blue', '59'],
 ['OR', 'Blue', '29'],
 ['OR', 'Blue', '54'],
 ['CO', 'Blue', '52'],
 ['WA', 'Blue', '59'],
 ['WY', 'Blue', '59'],
 ['WY', 'Blue', '53'],
 ['WA', 'Blue', '42'],
 ['WA', 'Blue', '37'],
 ['NV', 'Blue', '49'],
 ['NM', 'Blue', '15'],
 ['CA', 'Blue', '39'],
 ['UT', 'Blue', '48'],
 ['AZ', 'Blue', '82'],
 ['CO', 'Blue', '14'],
 ['WA', 'Blue', '18'],
 ['CO', 'Blue', '89'],
 ['TX', 'Blue', '80'],
 ['WA', 'Blue', '84'],
 ['NM', 'Blue', '46'],
 ['AZ', 'Blue', '16'],
 ['TX', 'Blue', '68'],
 ['CA', 'Blue', '99'],
 ['WY', 'Blue', '80'],
 ['CO', 'Blue', '44'],
 ['CO', 'Blue', '90'],
 ['TX', 'Blue', '19'],
 ['CA', 'Blue', '65'],
 ['TX', 'Blue', '98'],
 ['NM', 'Blue', '98'],
 ['AZ', 'Bl

### 4. 전체 합계 구하기

#### 4-1 데이터 확인

In [16]:
# Count how many times each distinct (state, color, count) record occurs.
table.countByValue()

defaultdict(int,
            {('TX', 'Red', 20): 17,
             ('NV', 'Blue', 66): 18,
             ('CO', 'Blue', 79): 16,
             ('OR', 'Blue', 71): 17,
             ('WA', 'Yellow', 93): 19,
             ('WY', 'Blue', 16): 16,
             ('CA', 'Yellow', 53): 28,
             ('WA', 'Green', 60): 24,
             ('OR', 'Green', 71): 20,
             ('TX', 'Green', 68): 23,
             ('NV', 'Green', 59): 29,
             ('AZ', 'Brown', 95): 17,
             ('WA', 'Yellow', 20): 14,
             ('AZ', 'Blue', 75): 14,
             ('OR', 'Brown', 72): 20,
             ('NV', 'Red', 98): 21,
             ('WY', 'Orange', 45): 25,
             ('CO', 'Blue', 52): 20,
             ('TX', 'Brown', 94): 14,
             ('CO', 'Red', 82): 23,
             ('CO', 'Red', 12): 13,
             ('CO', 'Red', 17): 16,
             ('OR', 'Green', 16): 20,
             ('AZ', 'Green', 46): 17,
             ('NV', 'Red', 43): 19,
             ('NM', 'Yellow', 15): 19,
        

In [17]:
# Total number of rows: add up the occurrence count of every distinct record.
table_countby = table.countByValue()
# Only the counts are needed, so sum the dict values directly instead of
# building an intermediate list from (key, value) pairs.
sum(table_countby.values())

99999

#### 4-2 합계 구하기

In [18]:
# Re-check the record layout: the count is the third element (index 2).
table.take(2)

[('TX', 'Red', 20), ('NV', 'Blue', 66)]

In [19]:
# Sum the count column. parse() already stores it as an int, so no further
# conversion is needed before summing.
table.map(lambda record: record[2]).sum()

5500035

### 5. 주(State)별 합계 구하기

In [20]:
# Pair each record's state code with its count (already an int from parse()),
# ready for a per-key aggregation.
city = table.map(lambda record: (record[0], record[2]))
city.take(3)

[('TX', 20), ('NV', 66), ('CO', 79)]

In [21]:
# Add up the counts for each state key and fetch the totals to the driver.
city.reduceByKey(lambda left, right: left + right).collect()

                                                                                

[('TX', 556493),
 ('NV', 548477),
 ('CO', 556302),
 ('OR', 542169),
 ('WA', 557227),
 ('WY', 538975),
 ('CA', 561184),
 ('AZ', 546812),
 ('NM', 553454),
 ('UT', 538942)]

In [22]:
# Shut down the SparkContext now that the analysis is finished.
spark.stop()