In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession for this notebook via getOrCreate(), configured with
# the "local[2]" master and the application name below.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Pyspark DataFrame #4")
    .getOrCreate()
)

In [9]:
# Read the survey CSV with its first line as the header and keep only the
# response id plus the two language columns used in this notebook.
# NOTE(review): the literal "NA" appears as a value in these columns and is later
# counted as if it were a language; consider nullValue="NA" on read — TODO confirm.
df = (
    spark.read
    .option("header", True)
    .csv("./data/survey_results_public.csv")
    .select('ResponseId', 'LanguageHaveWorkedWith', 'LanguageWantToWorkWith')
)
df.printSchema()

root
 |-- ResponseId: string (nullable = true)
 |-- LanguageHaveWorkedWith: string (nullable = true)
 |-- LanguageWantToWorkWith: string (nullable = true)



In [10]:
import pyspark.sql.functions as F

# Trim the LanguageHaveWorkedWith value, split it on ";" and store the resulting
# list in a new language_have column.
have_trimmed = F.trim(F.col("LanguageHaveWorkedWith"))
df2 = df.withColumn("language_have", F.split(have_trimmed, ";"))
df2.show(n=5)


+----------+----------------------+----------------------+--------------------+
|ResponseId|LanguageHaveWorkedWith|LanguageWantToWorkWith|       language_have|
+----------+----------------------+----------------------+--------------------+
|         1|  C++;HTML/CSS;Java...|                 Swift|[C++, HTML/CSS, J...|
|         2|     JavaScript;Python|                    NA|[JavaScript, Python]|
|         3|  Assembly;C;Python...|     Julia;Python;Rust|[Assembly, C, Pyt...|
|         4|  JavaScript;TypeSc...|  JavaScript;TypeSc...|[JavaScript, Type...|
|         5|  Bash/Shell;HTML/C...|  Bash/Shell;HTML/C...|[Bash/Shell, HTML...|
+----------+----------------------+----------------------+--------------------+
only showing top 5 rows



In [11]:
# Trim the LanguageWantToWorkWith value, split it on ";" and store the resulting
# list in a new language_want column.
want_trimmed = F.trim(F.col("LanguageWantToWorkWith"))
df3 = df2.withColumn("language_want", F.split(want_trimmed, ";"))
df3.printSchema()

root
 |-- ResponseId: string (nullable = true)
 |-- LanguageHaveWorkedWith: string (nullable = true)
 |-- LanguageWantToWorkWith: string (nullable = true)
 |-- language_have: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- language_want: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [12]:
# Preview the first five rows, now including both list columns.
df3.show(n=5)

+----------+----------------------+----------------------+--------------------+--------------------+
|ResponseId|LanguageHaveWorkedWith|LanguageWantToWorkWith|       language_have|       language_want|
+----------+----------------------+----------------------+--------------------+--------------------+
|         1|  C++;HTML/CSS;Java...|                 Swift|[C++, HTML/CSS, J...|             [Swift]|
|         2|     JavaScript;Python|                    NA|[JavaScript, Python]|                [NA]|
|         3|  Assembly;C;Python...|     Julia;Python;Rust|[Assembly, C, Pyt...|[Julia, Python, R...|
|         4|  JavaScript;TypeSc...|  JavaScript;TypeSc...|[JavaScript, Type...|[JavaScript, Type...|
|         5|  Bash/Shell;HTML/C...|  Bash/Shell;HTML/C...|[Bash/Shell, HTML...|[Bash/Shell, HTML...|
+----------+----------------------+----------------------+--------------------+--------------------+
only showing top 5 rows



## 현재 많이 사용되는 언어들 찾기

In [14]:
# explode() turns each element of the language_have list into its own row,
# kept alongside the ResponseId of the row it came from.
df_language_have = df3.select(
    F.col("ResponseId"),
    F.explode(F.col("language_have")).alias("language_have"),
)
df_language_have.show(n=10)

+----------+-------------+
|ResponseId|language_have|
+----------+-------------+
|         1|          C++|
|         1|     HTML/CSS|
|         1|   JavaScript|
|         1|  Objective-C|
|         1|          PHP|
|         1|        Swift|
|         2|   JavaScript|
|         2|       Python|
|         3|     Assembly|
|         3|            C|
+----------+-------------+
only showing top 10 rows



In [15]:
# Number of rows per language_have value; no ordering is applied here.
(
    df_language_have
    .groupBy("language_have")
    .count()
    .show(n=10)
)

+-------------+-----+
|language_have|count|
+-------------+-----+
|           C#|22984|
|          VBA| 3847|
|         Rust| 5799|
|   Bash/Shell|22385|
|   JavaScript|53587|
|           NA| 1082|
|         Perl| 2028|
|       Erlang|  651|
|       Matlab| 3846|
|      Crystal|  466|
+-------------+-----+
only showing top 10 rows



정렬(Sorting)하는 두 가지 방법
- 메서드: sort 또는 orderBy
- 방향: ascending(오름차순) 또는 descending(내림차순)

In [16]:
# Variant 1: sort() with a descending column expression, then collect() the rows.
(
    df_language_have
    .groupby("language_have")
    .count()
    .sort(F.col("count").desc())
    .collect()
)

[Row(language_have='JavaScript', count=53587),
 Row(language_have='HTML/CSS', count=46259),
 Row(language_have='Python', count=39792),
 Row(language_have='SQL', count=38835),
 Row(language_have='Java', count=29162),
 Row(language_have='Node.js', count=27975),
 Row(language_have='TypeScript', count=24909),
 Row(language_have='C#', count=22984),
 Row(language_have='Bash/Shell', count=22385),
 Row(language_have='C++', count=20057),
 Row(language_have='PHP', count=18130),
 Row(language_have='C', count=17329),
 Row(language_have='PowerShell', count=8871),
 Row(language_have='Go', count=7879),
 Row(language_have='Kotlin', count=6866),
 Row(language_have='Rust', count=5799),
 Row(language_have='Ruby', count=5569),
 Row(language_have='Dart', count=4965),
 Row(language_have='Assembly', count=4632),
 Row(language_have='Swift', count=4204),
 Row(language_have='R', count=4185),
 Row(language_have='VBA', count=3847),
 Row(language_have='Matlab', count=3846),
 Row(language_have='Groovy', count=2479)

In [17]:
# Variant 2: orderBy() with the ascending=False keyword, then collect() the rows.
(
    df_language_have
    .groupby("language_have")
    .count()
    .orderBy('count', ascending=False)
    .collect()
)

[Row(language_have='JavaScript', count=53587),
 Row(language_have='HTML/CSS', count=46259),
 Row(language_have='Python', count=39792),
 Row(language_have='SQL', count=38835),
 Row(language_have='Java', count=29162),
 Row(language_have='Node.js', count=27975),
 Row(language_have='TypeScript', count=24909),
 Row(language_have='C#', count=22984),
 Row(language_have='Bash/Shell', count=22385),
 Row(language_have='C++', count=20057),
 Row(language_have='PHP', count=18130),
 Row(language_have='C', count=17329),
 Row(language_have='PowerShell', count=8871),
 Row(language_have='Go', count=7879),
 Row(language_have='Kotlin', count=6866),
 Row(language_have='Rust', count=5799),
 Row(language_have='Ruby', count=5569),
 Row(language_have='Dart', count=4965),
 Row(language_have='Assembly', count=4632),
 Row(language_have='Swift', count=4204),
 Row(language_have='R', count=4185),
 Row(language_have='VBA', count=3847),
 Row(language_have='Matlab', count=3846),
 Row(language_have='Groovy', count=2479)

In [19]:
# Count rows per language_have value, order by count descending and keep at
# most 50 rows.
df_language50_have = (
    df_language_have
    .groupby("language_have")
    .count()
    .orderBy('count', ascending=False)
    .limit(50)
)
# Write the result as CSV under "language50_have", overwriting existing output.
df_language50_have.write.csv("language50_have", mode='overwrite')

In [22]:
import glob
import os

import pandas as pd

# The CSV output is a directory of part-*.csv files whose names include a
# run-specific identifier, so a hardcoded file name stops matching once the
# write cell is re-run with mode('overwrite'). Look the part files up by
# pattern instead, skipping zero-byte parts that pandas cannot parse.
part_files = [
    part_file
    for part_file in sorted(glob.glob("./language50_have/part-*.csv"))
    if os.path.getsize(part_file) > 0
]
if not part_files:
    raise FileNotFoundError("no non-empty part-*.csv files found in ./language50_have")

# The files are read without a header row, so pandas names the columns 0 and 1.
temp = pd.concat(
    (pd.read_csv(part_file, header=None) for part_file in part_files),
    ignore_index=True,
)
temp.head(20)

Unnamed: 0,0,1
0,JavaScript,53587
1,HTML/CSS,46259
2,Python,39792
3,SQL,38835
4,Java,29162
5,Node.js,27975
6,TypeScript,24909
7,C#,22984
8,Bash/Shell,22385
9,C++,20057


## 가장 배우고 싶은 언어 찾기

In [26]:
# explode() turns each element of the language_want list into its own row,
# kept alongside the ResponseId of the row it came from.
df_language_want = df3.select(
    F.col("ResponseId"),
    F.explode(F.col("language_want")).alias("language_want"),
)
df_language_want.show(n=5)

+----------+-------------+
|ResponseId|language_want|
+----------+-------------+
|         1|        Swift|
|         2|           NA|
|         3|        Julia|
|         3|       Python|
|         3|         Rust|
+----------+-------------+
only showing top 5 rows



In [25]:
# Number of rows per language_want value; no ordering is applied here.
(
    df_language_want
    .groupBy("language_want")
    .count()
    .show(n=10)
)

+-------------+-----+
|language_want|count|
+-------------+-----+
|           C#|17999|
|          VBA| 1069|
|         Rust|15865|
|   Bash/Shell|14043|
|   JavaScript|37008|
|           NA| 6618|
|         Perl| 1175|
|       Erlang| 1379|
|       Matlab| 1562|
|      Crystal|  790|
+-------------+-----+
only showing top 10 rows



In [28]:
# Count rows per language_want value, order by count descending and keep at
# most 50 rows. The limit(50) was missing, which contradicted the variable name
# and the equivalent df_language50_have pipeline.
df_language50_want = (
    df_language_want
    .groupby("language_want")
    .count()
    .orderBy('count', ascending=False)
    .limit(50)
)
df_language50_want.show(10)

+-------------+-----+
|language_want|count|
+-------------+-----+
|   JavaScript|37008|
|       Python|34929|
|     HTML/CSS|29353|
|   TypeScript|26905|
|          SQL|26631|
|      Node.js|24100|
|           C#|17999|
|         Java|17222|
|         Rust|15865|
|           Go|15788|
+-------------+-----+
only showing top 10 rows



In [29]:
# Write df_language50_want as CSV under "language50_want", overwriting existing output.
df_language50_want.write.csv('language50_want', mode='overwrite')