In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[4]") \
    .appName("Learning_Spark") \
    .config("spark.executor.instances", 2) \
    .config("spark.executor.memory", f'{int(2000/4.4)}mb') \
    .config("spark.executor.cores", 2) \
    .getOrCreate()

sc = spark.sparkContext

In [60]:
def good_print(rdd, num):
    for row in rdd.take(num):
        print(row)

In [61]:
test_data = sc.textFile('test.csv')
train_data = sc.textFile('train.csv')

In [62]:
train_data.take(3)

['battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range',
 '842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1',
 '1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2']

In [63]:
test_data.take(3)

['id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi',
 '1,1043,1,1.8,1,14,0,5,0.1,193,3,16,226,1412,3476,12,7,2,0,1,0',
 '2,841,1,0.5,1,4,1,61,0.8,191,5,12,746,857,3895,6,0,7,1,0,0']

In [64]:
test_header = test_data.first()
train_header = train_data.first()

In [65]:
A = set(test_header.split(',')) 
B = set(train_header.split(',')) 
difference = ', '.join(A ^ B)
print(f'Файлы test.csv и train.csv отличаются колонками: {difference}')

Файлы test.csv и train.csv отличаются колонками: id, price_range


In [66]:
test_data = test_data.filter(lambda row: row != test_header)\
           .map(lambda row: [float(col) for col in row.split(',')])
good_print(test_data, 3)

[1.0, 1043.0, 1.0, 1.8, 1.0, 14.0, 0.0, 5.0, 0.1, 193.0, 3.0, 16.0, 226.0, 1412.0, 3476.0, 12.0, 7.0, 2.0, 0.0, 1.0, 0.0]
[2.0, 841.0, 1.0, 0.5, 1.0, 4.0, 1.0, 61.0, 0.8, 191.0, 5.0, 12.0, 746.0, 857.0, 3895.0, 6.0, 0.0, 7.0, 1.0, 0.0, 0.0]
[3.0, 1807.0, 1.0, 2.8, 0.0, 1.0, 0.0, 27.0, 0.9, 186.0, 3.0, 4.0, 1270.0, 1366.0, 2396.0, 17.0, 10.0, 10.0, 0.0, 1.0, 1.0]


In [67]:
train_data = train_data.filter(lambda row: row != train_header)\
           .map(lambda row: [float(col) for col in row.split(',')])
good_print(train_data, 3)

[842.0, 0.0, 2.2, 0.0, 1.0, 0.0, 7.0, 0.6, 188.0, 2.0, 2.0, 20.0, 756.0, 2549.0, 9.0, 7.0, 19.0, 0.0, 0.0, 1.0, 1.0]
[1021.0, 1.0, 0.5, 1.0, 0.0, 1.0, 53.0, 0.7, 136.0, 3.0, 6.0, 905.0, 1988.0, 2631.0, 17.0, 3.0, 7.0, 1.0, 1.0, 0.0, 2.0]
[563.0, 1.0, 0.5, 1.0, 2.0, 1.0, 41.0, 0.9, 145.0, 5.0, 6.0, 1263.0, 1716.0, 2603.0, 11.0, 2.0, 9.0, 1.0, 1.0, 0.0, 2.0]


In [68]:
n_cores = header.split(',').index('n_cores')
wifi = header.split(',').index('wifi')

In [69]:
train_data.map(lambda row: (row[wifi],1)).countByKey()

defaultdict(int, {1.0: 500, 2.0: 500, 3.0: 500, 0.0: 500})

In [70]:
test_data.map(lambda row: (row[wifi],1)).countByKey()

defaultdict(int, {0.0: 493, 1.0: 507})

In [71]:
d = dict(test_data.map(lambda row: (row[n_cores],1)).countByKey())
d = dict(sorted(d.items(),key= lambda x:x[0]))
print(f'Counts n_cores: {d}')

Counts n_cores: {1.0: 138, 2.0: 134, 3.0: 127, 4.0: 142, 5.0: 130, 6.0: 101, 7.0: 107, 8.0: 121}


In [72]:
d = dict(train_data.map(lambda row: (row[n_cores],1)).countByKey())
d = dict(sorted(d.items(),key= lambda x:x[0]))
print(f'Counts n_cores: {d}')

Counts n_cores: {0.0: 101, 1.0: 104, 2.0: 99, 3.0: 93, 4.0: 95, 5.0: 59, 6.0: 95, 7.0: 119, 8.0: 89, 9.0: 112, 10.0: 122, 11.0: 79, 12.0: 90, 13.0: 85, 14.0: 104, 15.0: 92, 16.0: 88, 17.0: 99, 18.0: 82, 19.0: 83, 20.0: 110}


В файле train.csv значений в каждой совпадающей колонке больше и они разнообразнее чем в файле test.csv.