In [7]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('241204_MoviesLens')
spark = SparkContext(conf = conf).getOrCreate()

In [8]:
spark

### u.data 데이터 설명
user_id : 사용자의 고유 ID  
movie_id : 영화 고유 ID  
rating : 사용자가 영화에 준 평점  
timestamp : 평점이 기록된 시간

In [9]:
import os
directory = os.path.join(os.getcwd(), 'data')
filename = 'u.data'
filepath = os.path.join(directory, filename)
filepath

'/home/lab17/git/src/data/u.data'

In [21]:
datas = spark.textFile('file:///' + filepath.replace('\\','/'))
datas.take(3)

['196\t242\t3\t881250949', '186\t302\t3\t891717742', '22\t377\t1\t878887116']

In [30]:
rows = datas.filter(lambda x: x)

In [95]:
def parse(row):
    field = row.split('\t')
    user_id = field[0]
    movie_id = field[1]
    rating = field[2]
    timestamp = field[3]
    
    return (user_id, movie_id, rating, timestamp)

def parse_only(row):
    field = row.split('\t')
    rating = field[2]
    
    return rating


rows_parse = rows.map(parse)
rows_parse_only = rows.map(parse_only)

In [35]:
rows_parse.take(3)

[('196', '242', '3', '881250949'),
 ('186', '302', '3', '891717742'),
 ('22', '377', '1', '878887116')]

In [96]:
rows_parse_only.take(3)

['3', '3', '1']

### rating별 평점 개수

#### return이 하나인 함수로 단순하게 구하는 법

In [242]:
import pprint

pprint.pprint(rows_parse_only.sortBy(lambda x : x).countByValue())

defaultdict(<class 'int'>,
            {'1': 6110,
             '2': 11370,
             '3': 27145,
             '4': 34174,
             '5': 21201})


#### return이 4개인 함수에서 groupBy로 구하는 법

In [83]:
rating_res = rows_parse.map(lambda x : int(x[2])).groupBy(lambda x : x).sortByKey(lambda x : [1,2,3,4,5]).collect()
rating_res

[(1, <pyspark.resultiterable.ResultIterable at 0x7fabb574ac10>),
 (2, <pyspark.resultiterable.ResultIterable at 0x7fabb5749040>),
 (3, <pyspark.resultiterable.ResultIterable at 0x7fabb5749460>),
 (4, <pyspark.resultiterable.ResultIterable at 0x7fabb57498e0>),
 (5, <pyspark.resultiterable.ResultIterable at 0x7fabb57493a0>)]

In [92]:
for i,j in rating_res:
    print(i, len(list(j)))

1 6110
2 11370
3 27145
4 34174
5 21201


### 다른 컬럼

#### 영화ID별 평가 개수

In [278]:
movieID_res = rows_parse.map(lambda x: x[1]).groupBy(lambda x:x).sortBy(lambda x : x).take(100)

for i, j in movieID_res:
    print(i, len(j))

1 452
10 89
100 508
1000 10
1001 17
1002 8
1003 8
1004 9
1005 22
1006 23
1007 47
1008 37
1009 64
101 73
1010 44
1011 93
1012 100
1013 38
1014 98
1015 12
1016 137
1017 50
1018 32
1019 31
102 54
1020 35
1021 38
1022 32
1023 31
1024 15
1025 44
1026 4
1027 3
1028 148
1029 14
103 15
1030 20
1031 7
1032 16
1033 32
1034 27
1035 68
1036 24
1037 24
1038 17
1039 90
104 5
1040 25
1041 62
1042 28
1043 8
1044 40
1045 25
1046 46
1047 134
1048 73
1049 25
105 74
1050 43
1051 41
1052 25
1053 24
1054 23
1055 10
1056 10
1057 22
1058 15
1059 35
106 71
1060 39
1061 29
1062 12
1063 41
1064 4
1065 53
1066 16
1067 44
1068 12
1069 18
107 42
1070 27
1071 16
1072 7
1073 66
1074 77
1075 7
1076 12
1077 8
1078 22
1079 45
108 65
1080 2
1081 8
1082 8
1083 6
1084 21
1085 11
1086 21
1087 10
1088 13


#### UserID별 평가 개수

In [279]:
userID_res = rows_parse.map(lambda x : x[0]).groupBy(lambda x: x).sortBy(lambda x : x).take(100)

for i in userID_res:
    print(f' USER_ID {i[0]}가 남긴 댓글 수 : {len(list(i[1]))}')

 USER_ID 1가 남긴 댓글 수 : 272
 USER_ID 10가 남긴 댓글 수 : 184
 USER_ID 100가 남긴 댓글 수 : 59
 USER_ID 101가 남긴 댓글 수 : 67
 USER_ID 102가 남긴 댓글 수 : 216
 USER_ID 103가 남긴 댓글 수 : 29
 USER_ID 104가 남긴 댓글 수 : 111
 USER_ID 105가 남긴 댓글 수 : 23
 USER_ID 106가 남긴 댓글 수 : 64
 USER_ID 107가 남긴 댓글 수 : 22
 USER_ID 108가 남긴 댓글 수 : 33
 USER_ID 109가 남긴 댓글 수 : 234
 USER_ID 11가 남긴 댓글 수 : 181
 USER_ID 110가 남긴 댓글 수 : 133
 USER_ID 111가 남긴 댓글 수 : 24
 USER_ID 112가 남긴 댓글 수 : 46
 USER_ID 113가 남긴 댓글 수 : 51
 USER_ID 114가 남긴 댓글 수 : 48
 USER_ID 115가 남긴 댓글 수 : 92
 USER_ID 116가 남긴 댓글 수 : 143
 USER_ID 117가 남긴 댓글 수 : 86
 USER_ID 118가 남긴 댓글 수 : 71
 USER_ID 119가 남긴 댓글 수 : 181
 USER_ID 12가 남긴 댓글 수 : 51
 USER_ID 120가 남긴 댓글 수 : 26
 USER_ID 121가 남긴 댓글 수 : 74
 USER_ID 122가 남긴 댓글 수 : 61
 USER_ID 123가 남긴 댓글 수 : 54
 USER_ID 124가 남긴 댓글 수 : 24
 USER_ID 125가 남긴 댓글 수 : 182
 USER_ID 126가 남긴 댓글 수 : 45
 USER_ID 127가 남긴 댓글 수 : 23
 USER_ID 128가 남긴 댓글 수 : 184
 USER_ID 129가 남긴 댓글 수 : 30
 USER_ID 13가 남긴 댓글 수 : 636
 USER_ID 130가 남긴 댓글 수 : 353
 USER_ID 131가 남긴 댓글 수

#### 평점이 3점 이상인 것만 뽑아서 평균 평점 계산하기

In [173]:
filtered_over3 = rows_parse.map(lambda x : int(x[2])).filter(lambda x : x >= 3)
print(f'3점 이상인 영화의 평균 평점 {filtered_over3.sum() / filtered_over3.count():.2f}점')

3점 이상인 영화의 평균 평점 3.93점


#### 영화ID가 1000 이상인 데이터

In [280]:
rows_parse.filter(lambda x : int(x[1]) > 1000).take(100)

[('286', '1014', '5', '879781125'),
 ('291', '1042', '4', '874834944'),
 ('234', '1184', '2', '892079237'),
 ('181', '1081', '1', '878962623'),
 ('242', '1137', '5', '879741196'),
 ('92', '1049', '1', '890251826'),
 ('254', '1444', '3', '886475558'),
 ('87', '1016', '4', '879876194'),
 ('119', '1153', '5', '874781198'),
 ('262', '1147', '4', '879791710'),
 ('194', '1211', '2', '879551380'),
 ('82', '1134', '2', '884714402'),
 ('279', '1336', '1', '875298353'),
 ('279', '1240', '1', '892174404'),
 ('268', '1035', '2', '875542174'),
 ('94', '1217', '3', '891723086'),
 ('293', '1267', '3', '888906966'),
 ('92', '1079', '3', '886443455'),
 ('58', '1098', '4', '884304936'),
 ('276', '1091', '3', '874793035'),
 ('194', '1028', '2', '879541148'),
 ('299', '1018', '3', '889502324'),
 ('130', '1014', '3', '876250718'),
 ('181', '1295', '1', '878961781'),
 ('308', '1197', '4', '887739521'),
 ('63', '1067', '3', '875747514'),
 ('90', '1086', '4', '891384424'),
 ('272', '1393', '2', '879454663'),


#### 가장 많이 평가된 상위 10개 영화 찾기


In [330]:
rows_parse.map(lambda x : (x[1], 1)).reduceByKey(lambda a, b : a+b).sortBy(lambda x : x[1], ascending = False).take(10)

[('50', 583),
 ('258', 509),
 ('100', 508),
 ('181', 507),
 ('294', 485),
 ('286', 481),
 ('288', 478),
 ('1', 452),
 ('300', 431),
 ('121', 429)]

#### 집합 연산

In [269]:
a_table = rows_parse.filter(lambda x : int(x[1]) > 1000).sample(False, 0.1)
b_table = rows_parse.filter(lambda x : int(x[1]) > 1000).sample(False, 0.1)

In [270]:
# 개수 확인
a_table.count(), b_table.count(), a_table.count() + b_table.count()

(793, 795, 1588)

##### 교집합

In [271]:
a_table.intersection(b_table).sortBy(lambda x : x).collect()

[('110', '1229', '3', '886988374'),
 ('137', '1028', '5', '881433409'),
 ('152', '1053', '5', '882475618'),
 ('160', '1019', '5', '876857977'),
 ('181', '1215', '1', '878963304'),
 ('181', '1338', '1', '878962240'),
 ('183', '1090', '2', '891467546'),
 ('194', '1044', '2', '879524579'),
 ('194', '1211', '2', '879551380'),
 ('234', '1200', '3', '892333865'),
 ('234', '1449', '4', '892333573'),
 ('256', '1150', '5', '882152570'),
 ('269', '1071', '2', '891449801'),
 ('269', '1091', '2', '891451705'),
 ('276', '1253', '1', '874795729'),
 ('279', '1162', '3', '875314334'),
 ('280', '1063', '3', '891700607'),
 ('286', '1060', '5', '889652989'),
 ('295', '1170', '5', '879966498'),
 ('303', '1303', '3', '879543831'),
 ('307', '1140', '2', '879114143'),
 ('314', '1520', '3', '877892052'),
 ('374', '1033', '4', '883628021'),
 ('399', '1178', '3', '882350341'),
 ('405', '1311', '1', '885546859'),
 ('417', '1288', '1', '879646741'),
 ('457', '1039', '5', '882397934'),
 ('474', '1014', '3', '88791

##### 합집합

In [272]:
print(a_table.union(b_table).count())
print(a_table.union(b_table).count() == (a_table.count() + b_table.count()))

1588
True


##### 차집합

In [273]:
a_table.subtract(b_table).collect()

[('243', '1039', '4', '879988184'),
 ('95', '1217', '3', '880572658'),
 ('239', '1070', '5', '889179032'),
 ('42', '1028', '4', '881106072'),
 ('314', '1150', '4', '877887002'),
 ('181', '1094', '1', '878963086'),
 ('87', '1079', '2', '879877240'),
 ('279', '1182', '3', '875314370'),
 ('271', '1266', '2', '885848943'),
 ('94', '1119', '4', '891723261'),
 ('221', '1067', '3', '875244387'),
 ('56', '1090', '3', '892683641'),
 ('181', '1198', '1', '878962585'),
 ('128', '1141', '4', '879968827'),
 ('303', '1508', '1', '879544130'),
 ('181', '1362', '1', '878962200'),
 ('303', '1267', '3', '879484327'),
 ('242', '1152', '5', '879741196'),
 ('92', '1041', '3', '875907675'),
 ('189', '1098', '4', '893265506'),
 ('145', '1212', '2', '875272196'),
 ('321', '1194', '5', '879438607'),
 ('181', '1120', '1', '878962279'),
 ('179', '1316', '3', '892151489'),
 ('303', '1224', '2', '879485475'),
 ('130', '1228', '3', '878537681'),
 ('67', '1093', '5', '875379419'),
 ('276', '1239', '1', '874977512'),

In [274]:
b_table.subtract(a_table).take(b_table.subtract(a_table).count())

[('242', '1137', '5', '879741196'),
 ('192', '1061', '4', '881368891'),
 ('230', '1444', '2', '880485726'),
 ('181', '1334', '1', '878962240'),
 ('299', '1047', '2', '877880041'),
 ('294', '1199', '2', '889242142'),
 ('303', '1095', '2', '879543988'),
 ('89', '1048', '3', '879460027'),
 ('87', '1183', '3', '879875995'),
 ('174', '1014', '3', '890664424'),
 ('194', '1183', '2', '879554453'),
 ('184', '1137', '5', '889907812'),
 ('279', '1017', '3', '875296891'),
 ('207', '1242', '5', '884386260'),
 ('119', '1170', '3', '890627339'),
 ('181', '1067', '1', '878962550'),
 ('224', '1119', '3', '888082634'),
 ('99', '1052', '1', '885679533'),
 ('343', '1107', '3', '876406977'),
 ('189', '1372', '4', '893264429'),
 ('328', '1126', '3', '885046580'),
 ('323', '1017', '3', '878739394'),
 ('234', '1463', '5', '892333573'),
 ('110', '1206', '3', '886988321'),
 ('222', '1089', '1', '877563659'),
 ('68', '1047', '1', '876974379'),
 ('94', '1032', '2', '891723807'),
 ('167', '1126', '5', '892738418'

### 여러가지 메서드

In [275]:
type(a_table)

pyspark.rdd.PipelinedRDD

In [276]:
dir(a_table)

['__add__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_bypass_serializer',
 '_computeFractionForSampleSize',
 '_defaultReducePartitions',
 '_id',
 '_is_barrier',
 '_is_pipelinable',
 '_jrdd',
 '_jrdd_deserializer',
 '_jrdd_val',
 '_memory_limit',
 '_pickled',
 '_prev_jrdd',
 '_prev_jrdd_deserializer',
 '_reserialize',
 '_to_java_object_rdd',
 'aggregate',
 'aggregateByKey',
 'barrier',
 'cache',
 'cartesian',
 'checkpoint',
 'coalesce',
 'cogroup',
 'collect',
 'collectAsMap',
 'collectWithJobGroup',
 'combineByKey',
 'context',
 'count',
 'countApprox',
 'countApproxDistinct',
 'countByKey',
 'countByValue',
 'ctx',
 'distinct',
 'filter',
 '

In [277]:
a_table.distinct(numPartitions = 1).take(100)

[('194', '1211', '2', '879551380'),
 ('26', '1015', '3', '891352136'),
 ('181', '1015', '1', '878963121'),
 ('243', '1039', '4', '879988184'),
 ('62', '1016', '4', '879373008'),
 ('95', '1217', '3', '880572658'),
 ('239', '1070', '5', '889179032'),
 ('42', '1028', '4', '881106072'),
 ('130', '1217', '4', '875801778'),
 ('279', '1162', '3', '875314334'),
 ('314', '1150', '4', '877887002'),
 ('94', '1048', '4', '891722678'),
 ('181', '1094', '1', '878963086'),
 ('110', '1246', '2', '886989613'),
 ('87', '1079', '2', '879877240'),
 ('279', '1182', '3', '875314370'),
 ('67', '1095', '4', '875379287'),
 ('92', '1142', '4', '886442422'),
 ('221', '1250', '2', '875247855'),
 ('271', '1266', '2', '885848943'),
 ('94', '1119', '4', '891723261'),
 ('280', '1028', '5', '891702276'),
 ('221', '1067', '3', '875244387'),
 ('56', '1090', '3', '892683641'),
 ('181', '1198', '1', '878962585'),
 ('296', '1009', '3', '884196921'),
 ('262', '1278', '4', '879961819'),
 ('64', '1133', '4', '889739975'),
 ('

numPartitions의 역할  
파티션 수를 늘리면 작업이 더 많은 노드에서 병렬로 수행될 수 있어 성능이 향상됩니다.  
너무 많은 파티션을 설정하면 오히려 오버헤드가 증가할 수 있으므로 데이터 크기와 클러스터 구성에 따라 적절한 값을 선택해야 합니다.  