In [4]:
# Imports first, then the session that the Spark cells below rely on.
import numpy as np
import pandas as pd
import pyspark

# Reuse the active SparkSession if there is one; otherwise start a new one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Seed NumPy's legacy global generator so the sample data is reproducible.
np.random.seed(13)

# Keyword arguments are evaluated left to right, so the random draws are
# consumed in the order n, group, abool.
pandas_dataframe = pd.DataFrame(
    dict(
        n=np.random.randn(20),
        group=np.random.choice(["x", "y", "z"], 20),
        abool=np.random.choice([True, False], 20),
    )
)

In [5]:
# Turn the pandas frame into a Spark DataFrame; column types are inferred.
df = spark.createDataFrame(data=pandas_dataframe)

In [8]:
# Preview the first three rows.
df.show(n=3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [9]:
# Preview the first seven rows.
df.show(n=7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [13]:
# describe is a method: without the call parentheses the cell only displays the
# bound-method repr. Call it and show the resulting summary statistics.
df.describe().show()

In [24]:
# Keep only the numeric column and the boolean column.
df2 = df.select(df.n, df.abool)

In [25]:
# Preview the first five rows of the two-column projection.
df2.show(n=5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



In [26]:
# Keep only the group label and the boolean column.
df3 = df.select(df.group, df.abool)

In [27]:
# Preview the first five rows of the group/abool projection.
df3.show(n=5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



In [30]:
# Same two columns as df3, with abool exposed under a more descriptive name.
df4 = (
    df3
    .select(df3.group, df3.abool)
    .withColumnRenamed(existing="abool", new="a_boolean_value")
)

In [31]:
# Preview the first five rows with the renamed boolean column.
df4.show(n=5)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
|    y|          false|
|    z|          false|
+-----+---------------+
only showing top 5 rows



In [33]:
# Group label plus the numeric column, renamed to a more descriptive name.
df5 = (
    df
    .select(df.group, df.n)
    .withColumnRenamed(existing="n", new="a_numeric_value")
)

In [34]:
# Preview the first five rows with the renamed numeric column.
df5.show(n=5)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
+-----+--------------------+
only showing top 5 rows



In [39]:
# Column arithmetic: add 4 to every value of n.
df.select(df["n"] + 4).show(n=5)

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
|  4.45181233874579|
|5.3451017084510095|
+------------------+
only showing top 5 rows



In [40]:
# Column arithmetic: subtract 5 from every value of n.
df.select(df["n"] - 5).show(n=5)

+-------------------+
|            (n - 5)|
+-------------------+
| -5.712390662050588|
| -4.246233621340297|
| -5.044503078338053|
|  -4.54818766125421|
|-3.6548982915489905|
+-------------------+
only showing top 5 rows



In [41]:
# Column arithmetic: double every value of n.
df.select(df["n"] * 2).show(n=5)

+--------------------+
|             (n * 2)|
+--------------------+
|  -1.424781324101176|
|   1.507532757319406|
|-0.08900615667610691|
|  0.9036246774915795|
|  2.6902034169020195|
+--------------------+
only showing top 5 rows



In [45]:
# A column expression can be stored in a variable and reused; this one negates n.
col = df["n"] * -1

In [46]:
# Show n next to its negation, giving the derived column the name n2.
df.select(df.n, col.alias("n2")).show(n=4)

+--------------------+--------------------+
|                   n|                  n2|
+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|
|   0.753766378659703|  -0.753766378659703|
|-0.04450307833805...|0.044503078338053455|
| 0.45181233874578974|-0.45181233874578974|
+--------------------+--------------------+
only showing top 4 rows



In [48]:
# Square of n as a second reusable column expression.
col2 = df["n"] ** 2

# Show n, its negation (n2) and its square (n3) side by side.
df.select(df.n, col.alias("n2"), col2.alias("n3")).show(n=4)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
+--------------------+--------------------+--------------------+
only showing top 4 rows



In [49]:
# Building the expression alone succeeds: nothing is type-checked until the
# column is used in a query.
df["group"] + df["abool"]

Column<'(group + abool)'>

In [102]:
# Adding a string column to a boolean column is rejected when Spark analyzes the
# query. The error is the point of this cell, so catch and print it rather than
# letting it abort a top-to-bottom run of the notebook.
from pyspark.sql.utils import AnalysisException

try:
    df.select(df.group + df.abool)
except AnalysisException as error:
    print(f"{type(error).__name__}: {error}")

AnalysisException: cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;
'Project [(cast(group#619 as double) + abool#620) AS (group + abool)#643]
+- LogicalRDD [n#618, group#619, abool#620], false


In [51]:
# Print the schema as a tree: each column's name, type and nullability.
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



In [53]:
# Column names paired with their Spark type names, as a list of tuples.
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

In [54]:
# cast only builds a new column expression; no data is processed yet.
df["abool"].cast("int")

Column<'CAST(abool AS INT)'>

In [55]:
# Booleans cast to integers: false becomes 0 and true becomes 1.
df.select(df["abool"].cast("int")).show()


+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



In [56]:
# Same integer cast, limited to the first five rows.
df.select(df["abool"].cast("int")).show(n=5)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 5 rows



In [95]:
# Casting the doubles to int drops the fractional part.
df.select(df["n"].cast("int")).show()

+---+
|  n|
+---+
|  0|
|  0|
|  0|
|  0|
|  1|
|  0|
|  1|
|  0|
|  1|
| -1|
|  0|
| -1|
|  0|
|  0|
|  0|
|  0|
|  0|
|  2|
|  0|
|  0|
+---+



In [58]:
# Booleans cast to strings render as the text "true" / "false".
df.select(df["abool"].cast("string")).show()

+-----+
|abool|
+-----+
|false|
|false|
|false|
|false|
|false|
|false|
|false|
|false|
| true|
| true|
|false|
|false|
| true|
| true|
|false|
|false|
|false|
| true|
|false|
| true|
+-----+



In [79]:
from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when

In [73]:
# Sort ascending by n and preview the five smallest values. Only columns of df
# are used as sort keys, so the cell does not depend on a variable that no
# visible cell defines and runs on a fresh kernel.
df.orderBy(df.n).show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -0.712390662050588|    z|false|
|-0.24332625188556253|    y| true|
+--------------------+-----+-----+
only showing top 5 rows



In [75]:
# Largest value of n, using the {column: aggregate-name} form of agg.
df.agg(dict(n="max")).show()

+------------------+
|            max(n)|
+------------------+
|2.1503829673811126|
+------------------+



In [76]:
# Smallest value of n, using the {column: aggregate-name} form of agg.
df.agg(dict(n="min")).show()

+------------------+
|            min(n)|
+------------------+
|-1.261605945319069|
+------------------+



In [77]:
# Average of n; Spark labels the result column avg(n).
df.agg(dict(n="mean")).show()

+------------------+
|            avg(n)|
+------------------+
|0.3664026449885217|
+------------------+



In [97]:
# Prefix each group value with a literal label; lit wraps the plain string so
# it is treated as a constant rather than as a column name.
df.select(concat(lit("group: "), df.group)).show()

+----------------------+
|concat(group: , group)|
+----------------------+
|              group: z|
|              group: x|
|              group: z|
|              group: y|
|              group: z|
|              group: y|
|              group: z|
|              group: x|
|              group: z|
|              group: y|
|              group: x|
|              group: y|
|              group: y|
|              group: y|
|              group: y|
|              group: x|
|              group: z|
|              group: y|
|              group: x|
|              group: x|
+----------------------+



In [98]:
# Join group and n with a literal ": " separator into one string column.
df.select(concat(df.group, lit(": "), df.n)).show()

+--------------------+
|concat(group, : , n)|
+--------------------+
|z: -0.71239066205...|
|x: 0.753766378659703|
|z: -0.04450307833...|
|y: 0.451812338745...|
|z: 1.345101708451...|
|y: 0.532337888294...|
|z: 1.350187899722...|
|x: 0.861211374169...|
|z: 1.478685737435...|
|y: -1.04537713053...|
|x: -0.78898902495...|
|y: -1.26160594531...|
|y: 0.562846785281...|
|y: -0.24332625188...|
|y: 0.913740704859...|
|x: 0.317350922736...|
|z: 0.127303280206...|
|y: 2.150382967381...|
|x: 0.606288656896...|
|x: -0.02677164998...|
+--------------------+



In [99]:
# Rebuild df from the unchanged pandas frame so the cells below start from the
# original three columns.
df = spark.createDataFrame(data=pandas_dataframe)

In [100]:
# abool is already a boolean column, so it is used as the condition directly
# instead of being compared with True. Rows where it is not true fall through
# to otherwise.
df.select(
    "abool",
    when(df.abool, "It is true").otherwise("It is false").alias("True or False"),
).show(25)

+-----+-------------+
|abool|True or False|
+-----+-------------+
|false|  It is false|
|false|  It is false|
|false|  It is false|
|false|  It is false|
|false|  It is false|
|false|  It is false|
|false|  It is false|
|false|  It is false|
| true|   It is true|
| true|   It is true|
|false|  It is false|
|false|  It is false|
| true|   It is true|
| true|   It is true|
|false|  It is false|
|false|  It is false|
|false|  It is false|
| true|   It is true|
|false|  It is false|
| true|   It is true|
+-----+-------------+



In [101]:
# Clamp negative values of n to zero and keep every other value unchanged.
df.select(
    when(df["n"] < 0, 0).otherwise(df["n"]).alias("No Negatives")
).show()

+-------------------+
|       No Negatives|
+-------------------+
|                0.0|
|  0.753766378659703|
|                0.0|
|0.45181233874578974|
| 1.3451017084510097|
| 0.5323378882945463|
| 1.3501878997225267|
| 0.8612113741693206|
| 1.4786857374358966|
|                0.0|
|                0.0|
|                0.0|
| 0.5628467852810314|
|                0.0|
| 0.9137407048596775|
|0.31735092273633597|
|0.12730328020698067|
| 2.1503829673811126|
| 0.6062886568962988|
|                0.0|
+-------------------+

