In [31]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import json

In [46]:
spark = SparkSession.builder.appName("fsum").config("local", 4).getOrCreate()
sc = SparkContext.getOrCreate()

#select file name
fileName = "dummy.csv"
data = sc.textFile(fileName)
data.collect()

['6,10,50', '5,20,?', '4,10,60', '3,10,50', '2,?,70', '1,30,60']

In [47]:
data = data.filter(lambda x:not '?' in x)
data.collect()

['6,10,50', '4,10,60', '3,10,50', '1,30,60']

In [48]:
#read data
#sc = SparkContext("local", "fsum")
data = data.map(lambda x:x.split(','))
data.collect()

[['6', '10', '50'], ['4', '10', '60'], ['3', '10', '50'], ['1', '30', '60']]

In [49]:
data=data.map(lambda x: (x[0],x[1:])).sortByKey()
data.collect()

[('1', ['30', '60']),
 ('3', ['10', '50']),
 ('4', ['10', '60']),
 ('6', ['10', '50'])]

In [50]:
# Find frequency of each element in each column

records = data

#number of attributes
record_length = 2

#Dictionary of dictionaries of frequencies for each attribute
frequencies = {}

for i in range(0,record_length):  
    # we will use mapValues() instead of map() to take only values from the "records" RDD (key,value) pair 
    # we will store these value in arrays/lists 
    attribute = records.mapValues(lambda x: x[i])
    #count values and produce frequencies
    attribute_frequencies = attribute.values().countByValue()
    frequencies[i] = attribute_frequencies

print("Frequencies of the whole dataset:")

--------------------------------------------
Frequencies of the whole dataset:
{
  "0": {
    "30": 1,
    "10": 3
  },
  "1": {
    "60": 2,
    "50": 2
  }
}


In [54]:
# Replace each value in a column by its frequency, and find sum of each row frequency


#select K
K = 3

for i in range(0,record_length):
  #broadcast the attribute_frequencies calculated above
  bi = sc.broadcast(attribute_frequencies)
  #assign the appropriate frequency to each record's attribute from the broadcast
  attribute_freq = attribute.mapValues(lambda x:bi.value[x])
  #add attribute_freq to t (sum)
  if i:
    t = t.join(attribute_freq).mapValues(sum)
  else:
    t = attribute_freq

t = t.sortBy(lambda x:x[1])


print("K records with highest sum of frequencies:")
t.take(K)

K records with highest sum of frequencies:


[('1', 4), ('4', 4), ('3', 4)]

In [55]:
file_path = "forest.csv"
spark_df = spark.read.csv(file_path)
spark_df.count()

581012

In [56]:
spark_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)
 |-- _c20: string (nullable = true)
 |-- _c21: string (nullable = true)
 |-- _c22: string (nullable = true)
 |-- _c23: string (nullable = true)
 |-- _c24: string (nullable = true)
 |-- _c25: string (nullable = true)
 |-- _c26: string (nullable = true)
 |-- _c27: string (nullable = tru

In [57]:
spark_df.show(5)   # by default shows 20 # collect() shows all rows

+---+---+---+---+----+---+---+---+----+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|_c0|_c1|_c2|_c3| _c4|_c5|_c6|_c7| _c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|_c41|_c42|_c43|_c44|_c45|_c46|_c47|_c48|_c49|_c50|_c51|_c52|_c53|
+---+---+---+---+----+---+---+---+----+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
| 51|  3|258|  0| 510|221|232|148|6279|  1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   1|   0|

In [58]:
import pandas as pd
pandas_df = spark_df.toPandas()
pandas_df.head(5)

                                                                                

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,...,_c44,_c45,_c46,_c47,_c48,_c49,_c50,_c51,_c52,_c53
0,51,3,258,0,510,221,232,148,6279,1,...,0,0,0,0,0,0,0,0,0,5
1,56,2,212,-6,390,220,235,151,6225,1,...,0,0,0,0,0,0,0,0,0,5
2,139,9,268,65,3180,234,238,135,6121,1,...,0,0,0,0,0,0,0,0,0,2
3,155,18,242,118,3090,238,238,122,6211,1,...,0,0,0,0,0,0,0,0,0,2
4,45,2,153,-1,391,220,234,150,6172,1,...,0,0,0,0,0,0,0,0,0,5


In [14]:
spark_df = spark_df.dropna()
spark_df.count()

                                                                                

581012