In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Python Spark Unit") \
    .master("local[2]") \
    .config("spark.driver.memory","2g") \
    .config("spark.executor.memory","2g") \
    .getOrCreate()

In [2]:
path = 'C:\\Users\\Yeojun\\Documents\\GitHub\\Programmers_DevCourse\\학습내용\\빅데이터 처리 시스템, 하둡과 Spark\\data'
df = spark.read.option("header",True).csv(path + '\\name_gender.csv')
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)



In [4]:
df.count()

100

In [3]:
df.createOrReplaceTempView("namegender")
spark.sql("SELECT gender, COUNT(1) count FROM namegender GROUP BY 1").show()

+------+-----+
|gender|count|
+------+-----+
|     F|   65|
|     M|   28|
|Unisex|    7|
+------+-----+



### upper_udf_f UDF를 테스트

In [5]:
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import *
import pandas as pd

@pandas_udf(StringType())
def upper_udf_f(s: pd.Series) -> pd.Series:
    return s.str.upper()

upperUDF = spark.udf.register("upper_udf", upper_udf_f)

### load_gender와 get_gender_count 함수를 테스트

In [6]:
def load_gender(spark, file_path):
    return spark.read.option("header", True).csv(file_path)

def get_gender_count(spark, df, field_to_count):
    df.createOrReplaceTempView("namegender_test")
    return spark.sql(f"SELECT {field_to_count}, count(1) count FROM namegender_test GROUP BY 1")

In [7]:
df = load_gender(spark, path+"/name_gender.csv")
get_gender_count(spark, df, "gender").show()
df.select(upperUDF("name").alias("NAME")).show()

+------+-----+
|gender|count|
+------+-----+
|     F|   65|
|     M|   28|
|Unisex|    7|
+------+-----+

+----------+
|      NAME|
+----------+
|  ADALEIGH|
|     AMRYN|
|    APURVA|
|    ARYION|
|    ALIXIA|
|ALYSSAROSE|
|    ARVELL|
|     AIBEL|
|   ATIYYAH|
|     ADLIE|
|    ANYELY|
|    AAMONI|
|     AHMAN|
|    ARLANE|
|   ARMONEY|
|   ATZHIRY|
| ANTONETTE|
|   AKEELAH|
| ABDIKADIR|
|    ARINZE|
+----------+
only showing top 20 rows



In [None]:
df.select(upperUDF("name").alias("NAME")).collect()

### 유닛 테스트 코드 붙여보기

In [12]:
path + "\\name.gender.csv"

'C:\\Users\\Yeojun\\Documents\\GitHub\\Programmers_DevCourse\\학습내용\\빅데이터 처리 시스템, 하둡과 Spark\\data\\name.gender.csv'

In [None]:
from unittest import TestCase

# 일반적으로는 아래 함수가 정의된 모듈을 임포트하고 그걸 테스트
# - upper_udf_f
# - load_gender
# - get_gender_count
# Local Standalone Moder Spark으로 기능 테스트

# 이외에도 2가지 방법이 더 존재
# - from pyspark.sql.tests import SparkTestingBase
# - pytest-spark (pytest testing framework plugin)

class UtilsTestCase(TestCase):
    spark = None
    path = "C:\\Users\\Yeojun\\Documents\\GitHub\\Programmers_DevCourse\\학습내용\\빅데이터 처리 시스템, 하둡과 Spark\\data"

    @classmethod
    def setUpClass(cls) -> None:
        cls.spark = SparkSession.builder \
            .appName("Spark Unit Test") \
            .getOrCreate()
        
    def test_datafile_loading(self):
        sample_df = load_gender(self.spark, path + "\\name_gender.csv")
        result_count = sample_df.count()
        self.assertEqual(result_count, 100, "Record count should be 100")

    def test_gender_count(self):
        sample_df = load_gender(self.spark, path + "\\name_gender.csv")
        count_list = get_gender_count(self.spark, sample_df, "gender").collect()
        count_dict = dict()
        for row in count_list:
            count_dict[row["gender"]] = row['count']
        self.assertEqual(count_dict["F"], 65, "Count for F should be 65")
        self.assertEqual(count_dict["M"], 28, "Count for M should be 28")
        self.assertEqual(count_dict["Unisex"], 7, "Count for Unisex should be 7")

    def test_upper_udf(self):
        test_data = [
            { "name": "John Kim" },
            { "name": "Johnny Kim"},
            { "name": "1234" }
        ]
        expected_results = [ "JOHN KIM", "JOHNNY KIM", "1234" ]

        upperUDF = self.spark.udf.register("upper_udf", upper_udf_f)
        test_df = self.spark.createDataFrame(test_data)
        names = test_df.select("name", upperUDF("name").alias("NAME")).collect()
        results = []
        for name in names:
            results.append(name["NAME"])
        self.assertCountEqual(results, expected_results)

    
    @classmethod
    def tearDownClass(cls) -> None:
        cls.spark.stop()

In [27]:
import unittest
unittest.main(argv=[''], verbosity=2, exit=False)

test_datafile_loading (__main__.UtilsTestCase) ... ok
  self._sock = None
ok
  self._sock = None
ok

----------------------------------------------------------------------
Ran 3 tests in 4.849s

OK


<unittest.main.TestProgram at 0x239029828f0>