In [15]:
import pyspark 
from pyspark.sql import SparkSession

### Create Session

In [16]:
# Obtain a SparkSession named 'Practise'; getOrCreate() reuses an existing session if there is one.
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [17]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [18]:
# Load the sample CSV: the first row supplies the column names and the
# column types are inferred from the data.
df_pyspark = spark.read.csv(
    'dataset/sample.csv',
    header=True,
    inferSchema=True,
)

In [19]:
# The bare DataFrame displays only its column names and types, not its rows.
df_pyspark

DataFrame[name: string, age: int]

In [20]:
# Print the DataFrame's rows as a formatted text table.
df_pyspark.show()

+-----+---+
| name|age|
+-----+---+
|  Tom| 27|
|Tommy| 28|
| Jeff| 30|
| OKim| 30|
+-----+---+



In [21]:
# Confirm the loaded object is a Spark DataFrame (pyspark.sql.dataframe.DataFrame).
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [14]:
# Print each column's name, type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



- PySpark DataFrame
- Reading the Dataset
- Checking the Data Types of the Columns (Schema)
- Selecting Columns and Indexing
- Using describe(), Similar to pandas
- Adding Columns
- Dropping Columns
- Renaming Columns

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain a SparkSession named 'Dataframe'; getOrCreate() reuses an existing session if there is one.
spark = SparkSession.builder.appName('Dataframe').getOrCreate()

In [3]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [9]:
## Reading the dataset
# Two spellings of the same load (header row + schema inference):
#   1) every setting passed through chained .option(...) calls;
#   2) inferSchema passed as a keyword argument to .csv(...).
# The second assignment replaces the DataFrame produced by the first.
df_pyspark = spark.read.option('header', 'true').option('inferSchema', 'true').csv('dataset/summary.csv')
df_pyspark = spark.read.option('header', 'true').csv('dataset/summary.csv', inferSchema=True)

In [10]:
# Load the summary CSV with both settings given as keyword arguments to csv():
# the first row is the header and the column types are inferred.
df_pyspark = spark.read.csv(
    'dataset/summary.csv',
    header=True,
    inferSchema=True,
)

In [11]:
# The bare DataFrame displays only its column names and types, not its rows.
df_pyspark

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]

In [12]:
# Print only the first 5 rows as a text table.
df_pyspark.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [8]:
### Checking the schema
# Print each column's name, type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [13]:
# Selecting columns and indexing
# .columns is a plain Python list of the column names.
df_pyspark.columns 

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [15]:
# head(n) returns the first n rows as a list of Row objects rather than printing a table.
df_pyspark.head(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=264),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=69)]

In [16]:
# Print the same first 3 rows as a text table, for comparison with head(3).
df_pyspark.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
+-----------------+-------------------+-----+
only showing top 3 rows



In [19]:
# select() with one column name yields a one-column DataFrame, so show() is available.
df_pyspark.select('DEST_COUNTRY_NAME').show(3)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
|    United States|
+-----------------+
only showing top 3 rows



In [21]:
# The result of select() is itself a DataFrame.
type(df_pyspark.select('DEST_COUNTRY_NAME'))

pyspark.sql.dataframe.DataFrame

In [23]:
# Project two columns by name and print the first 3 rows of the result.
df_pyspark.select('DEST_COUNTRY_NAME', 'count').show(3)

+-----------------+-----+
|DEST_COUNTRY_NAME|count|
+-----------------+-----+
|    United States|    1|
|    United States|  264|
|    United States|   69|
+-----------------+-----+
only showing top 3 rows



In [28]:
df_pyspark['count']  # Indexing returns only a Column object; show() cannot be called on it.

Column<'count'>

In [29]:
# .dtypes is a list of (column name, type name) tuples.
df_pyspark.dtypes

[('DEST_COUNTRY_NAME', 'string'),
 ('ORIGIN_COUNTRY_NAME', 'string'),
 ('count', 'int')]

In [31]:
df_pyspark.describe().show()  # Shows basic summary statistics (count, mean, stddev, min, max).

+-------+-----------------+-------------------+------------------+
|summary|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|             count|
+-------+-----------------+-------------------+------------------+
|  count|              255|                255|               255|
|   mean|             null|               null| 1655.956862745098|
| stddev|             null|               null|21801.481975969557|
|    min|      Afghanistan|        Afghanistan|                 1|
|    max|          Vietnam|            Vietnam|            348113|
+-------+-----------------+-------------------+------------------+



In [34]:
# Adding columns to the DataFrame
# The original DataFrame is not modified.
# To add a new column, use withColumn(new column name, column expression).
df_pyspark.withColumn('count + 2', df_pyspark['count'] + 2).show(5)  # (new column name, expression built from the existing column)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|count + 2|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|    1|        3|
|    United States|            Ireland|  264|      266|
|    United States|              India|   69|       71|
|            Egypt|      United States|   24|       26|
|Equatorial Guinea|      United States|    1|        3|
+-----------------+-------------------+-----+---------+
only showing top 5 rows



In [36]:
# df_pyspark still has only its original columns: withColumn returned a new DataFrame that was not kept.
df_pyspark.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
+-----------------+-------------------+-----+
only showing top 3 rows



In [39]:
### Drop the columns
# drop() removes a column; to keep the result, assign it to a variable.
df_pyspark.drop('count').show(3)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Ireland|
|    United States|              India|
+-----------------+-------------------+
only showing top 3 rows



In [41]:
# df_pyspark still contains 'count': the DataFrame returned by drop() was not kept.
df_pyspark.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
+-----------------+-------------------+-----+
only showing top 3 rows



In [44]:
### Rename the columns
# Likewise, the result must be assigned to a variable for the rename to persist.
df_pyspark.withColumnRenamed('count', 'COUNT').show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|COUNT|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
+-----------------+-------------------+-----+
only showing top 3 rows



In [45]:
# The column is still named 'count': the DataFrame returned by withColumnRenamed() was not kept.
df_pyspark.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
+-----------------+-------------------+-----+
only showing top 3 rows

