In [1]:
import pyspark
import pandas as pd
import numpy as np
spark = pyspark.sql.SparkSession.builder.getOrCreate()
from pyspark.sql.functions import *
from pyspark.sql.functions import lit
from pyspark.sql.functions import regexp_extract, regexp_replace
import re

#### Remember: you have to 'register' a table to query it with spark.sql
- mpg.createOrReplaceTempView("mpg")

In [2]:
pandas_df = pd.read_csv('activities.csv')

In [3]:
pandas_df.head()

Unnamed: 0,Activity ID,Activity Date,Activity Name,Activity Type,Activity Description,Elapsed Time,Distance,Max Heart Rate,Relative Effort,Commute,...,UV Index,Weather Ozone,"<span class=""translation_missing"" title=""translation missing: en-US.lib.export.portability_exporter.activities.horton_values.jump_count"">Jump Count</span>","<span class=""translation_missing"" title=""translation missing: en-US.lib.export.portability_exporter.activities.horton_values.total_grit"">Total Grit</span>","<span class=""translation_missing"" title=""translation missing: en-US.lib.export.portability_exporter.activities.horton_values.avg_flow"">Avg Flow</span>","<span class=""translation_missing"" title=""translation missing: en-US.lib.export.portability_exporter.activities.horton_values.flagged"">Flagged</span>","<span class=""translation_missing"" title=""translation missing: en-US.lib.export.portability_exporter.activities.horton_values.avg_elapsed_speed"">Avg Elapsed Speed</span>","<span class=""translation_missing"" title=""translation missing: en-US.lib.export.portability_exporter.activities.horton_values.dirt_distance"">Dirt Distance</span>","<span class=""translation_missing"" title=""translation missing: en-US.lib.export.portability_exporter.activities.horton_values.newly_explored_distance"">Newly Explored Distance</span>","<span class=""translation_missing"" title=""translation missing: en-US.lib.export.portability_exporter.activities.horton_values.newly_explored_dirt_distance"">Newly Explored Dirt Distance</span>"
0,350128633,"Jul 20, 2015, 1:06:57 PM",Renecca creek rd,Ride,,2352,13.48,,,False,...,,,,,,,,,,
1,623600149,"Oct 11, 2015, 10:03:14 PM",Afternoon Ride,Ride,,5495,23.61,,,False,...,,,,,,,,,,
2,797001969,"Dec 8, 2016, 4:28:12 PM",Morning Ride,Ride,,8,0.03,,,False,...,,,,,,,,,,
3,800888948,"Dec 12, 2016, 10:22:44 PM",Afternoon Ride,Ride,,7942,10.22,149.0,8.0,False,...,,,,,,,,,,
4,811340614,"Dec 25, 2016, 10:37:32 PM",Afternoon Ride,Ride,,4983,26.16,162.0,32.0,False,...,,,,,,,,,,


In [4]:
pandas_df['Activity Type'].value_counts()

Ride    346
Run     165
Hike     10
Name: Activity Type, dtype: int64

In [5]:
pandas_df.columns.to_list()

['Activity ID',
 'Activity Date',
 'Activity Name',
 'Activity Type',
 'Activity Description',
 'Elapsed Time',
 'Distance',
 'Max Heart Rate',
 'Relative Effort',
 'Commute',
 'Activity Gear',
 'Filename',
 'Athlete Weight',
 'Bike Weight',
 'Elapsed Time.1',
 'Moving Time',
 'Distance.1',
 'Max Speed',
 'Average Speed',
 'Elevation Gain',
 'Elevation Loss',
 'Elevation Low',
 'Elevation High',
 'Max Grade',
 'Average Grade',
 'Average Positive Grade',
 'Average Negative Grade',
 'Max Cadence',
 'Average Cadence',
 'Max Heart Rate.1',
 'Average Heart Rate',
 'Max Watts',
 'Average Watts',
 'Calories',
 'Max Temperature',
 'Average Temperature',
 'Relative Effort.1',
 'Total Work',
 'Number of Runs',
 'Uphill Time',
 'Downhill Time',
 'Other Time',
 'Perceived Exertion',
 '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.type">Type</span>',
 '<span class="translation_missing" title="translation missing: en-US.l

In [6]:
cols = ['Activity ID',
 'Activity Date',
 'Activity Name',
 'Activity Type',
 'Elapsed Time',
 'Distance',
 'Elapsed Time.1',
 'Moving Time',
 'Distance.1',
 'Max Speed',
 'Average Speed',
 'Elevation Gain',
 'Elevation Loss',
 'Elevation Low',
 'Elevation High',
 'Max Grade',
 'Average Grade',
 'Average Watts',
 'Calories']

In [7]:
pandas_df[cols].isna().sum()

Activity ID         0
Activity Date       0
Activity Name       0
Activity Type       0
Elapsed Time        0
Distance            0
Elapsed Time.1      5
Moving Time         0
Distance.1          0
Max Speed           0
Average Speed     148
Elevation Gain      4
Elevation Loss    155
Elevation Low       0
Elevation High      0
Max Grade           0
Average Grade       0
Average Watts     176
Calories           38
dtype: int64

In [8]:
activities = pandas_df[cols]

In [9]:
activities = activities.rename(columns={'Activity ID':'Activity_ID',
 'Activity Date': 'Activity_date',
 'Activity Name': 'Activity_name',
 'Activity Type': 'Activity_Type',
 'Elapsed Time': 'Elapsed_Time_Secs',
 'Elapsed Time.1':'Elapsed_Time_1',
 'Distance': 'Distance_KM',
 'Moving Time': 'Moving_Time_Secs',
 'Distance.1': 'Distance_1',
 'Max Speed':'Max_Speed',
 'Average Speed':'Average_Speed',
 'Elevation Gain': 'Elevation_Gain',
 'Elevation Loss':'Elevation_Loss',
 'Elevation Low':'Elevation_Low',
 'Elevation High':'Elevation_High',
 'Max Grade':'Max_Grade',
 'Average Grade':'Average_Grade',
 'Average Watts':'Average_Watts'})

In [10]:
activities.head().T

Unnamed: 0,0,1,2,3,4
Activity_ID,350128633,623600149,797001969,800888948,811340614
Activity_date,"Jul 20, 2015, 1:06:57 PM","Oct 11, 2015, 10:03:14 PM","Dec 8, 2016, 4:28:12 PM","Dec 12, 2016, 10:22:44 PM","Dec 25, 2016, 10:37:32 PM"
Activity_name,Renecca creek rd,Afternoon Ride,Morning Ride,Afternoon Ride,Afternoon Ride
Activity_Type,Ride,Ride,Ride,Ride,Ride
Elapsed_Time_Secs,2352,5495,8,7942,4983
Distance_KM,13.48,23.61,0.03,10.22,26.16
Elapsed_Time_1,2352.0,5495.0,8.0,7942.0,4983.0
Moving_Time_Secs,2272.0,3965.0,8.0,1851.0,4424.0
Distance_1,13482.599609,23613.199219,30.6,10228.599609,26168.800781
Max_Speed,16.0,15.1,0.2,14.5,15.1


In [11]:
activities.to_csv('activities2.csv')

In [12]:
# df = spark.createDataFrame(pandas_df)


# the usual method is throwing errors related to data types
# some columns have multiple dtypes (NaN, values, etc); giving pyspark issues

In [13]:
df = (spark.read.format("csv").options(header="true")
    .load("activities2.csv"))

In [14]:
df.show(5, vertical=True)

-RECORD 0---------------------------------
 _c0               | 0                    
 Activity_ID       | 350128633            
 Activity_date     | Jul 20, 2015, 1:0... 
 Activity_name     | Renecca creek rd     
 Activity_Type     | Ride                 
 Elapsed_Time_Secs | 2352                 
 Distance_KM       | 13.48                
 Elapsed_Time_1    | 2352.0               
 Moving_Time_Secs  | 2272.0               
 Distance_1        | 13482.599609375      
 Max_Speed         | 16.0                 
 Average_Speed     | null                 
 Elevation_Gain    | 187.07899475097656   
 Elevation_Loss    | null                 
 Elevation_Low     | 283.79998779296875   
 Elevation_High    | 406.8999938964844    
 Max_Grade         | 16.799999237060547   
 Average_Grade     | 0.7268630266189575   
 Average_Watts     | null                 
 Calories          | null                 
-RECORD 1---------------------------------
 _c0               | 1                    
 Activity_I

In [15]:
print('DataFrame shape', df.count(), ' x ', len(df.columns))

DataFrame shape 521  x  20


In [16]:
# df.describe().show(vertical = True)

## Note the registration of the df for spark.sql use

In [17]:
df.createOrReplaceTempView("df")

In [18]:
spark.sql(
    """
SELECT Distance_KM, Elapsed_Time_Secs, Distance_KM/(Moving_Time_Secs / 3600) AS Avg_Speed_km_hr
FROM df
"""
).show(5)

+-----------+-----------------+------------------+
|Distance_KM|Elapsed_Time_Secs|   Avg_Speed_km_hr|
+-----------+-----------------+------------------+
|      13.48|             2352|21.359154929577468|
|      23.61|             5495|21.436569987389657|
|       0.03|                8|              13.5|
|      10.22|             7942| 19.87682333873582|
|      26.16|             4983|  21.2875226039783|
+-----------+-----------------+------------------+
only showing top 5 rows



- Strava has some null values in the avrage speepd attribute, which it shouldn't, so I will go back and add as a feature using pyspark

In [19]:
df.columns

['_c0',
 'Activity_ID',
 'Activity_date',
 'Activity_name',
 'Activity_Type',
 'Elapsed_Time_Secs',
 'Distance_KM',
 'Elapsed_Time_1',
 'Moving_Time_Secs',
 'Distance_1',
 'Max_Speed',
 'Average_Speed',
 'Elevation_Gain',
 'Elevation_Loss',
 'Elevation_Low',
 'Elevation_High',
 'Max_Grade',
 'Average_Grade',
 'Average_Watts',
 'Calories']

In [20]:
df.select(concat(df.Activity_Type,lit(' '), df.Activity_date)).show(5)

+---------------------------------------+
|concat(Activity_Type,  , Activity_date)|
+---------------------------------------+
|                   Ride Jul 20, 2015...|
|                   Ride Oct 11, 2015...|
|                   Ride Dec 8, 2016,...|
|                   Ride Dec 12, 2016...|
|                   Ride Dec 25, 2016...|
+---------------------------------------+
only showing top 5 rows



In [21]:
df.select(df.Activity_date).show(5, truncate = False)

+-------------------------+
|Activity_date            |
+-------------------------+
|Jul 20, 2015, 1:06:57 PM |
|Oct 11, 2015, 10:03:14 PM|
|Dec 8, 2016, 4:28:12 PM  |
|Dec 12, 2016, 10:22:44 PM|
|Dec 25, 2016, 10:37:32 PM|
+-------------------------+
only showing top 5 rows



In [23]:
regexp_extract(col('Notes'), '(.)(by)(\s+)(\w+)', 4))
'.by\s+(\w+)', 1
r"^(\w{3})(\s)(,)(\d{4})",1

SyntaxError: unmatched ')' (<ipython-input-23-c3b54f6466ff>, line 1)

In [None]:
df.select(
    "Activity_Type",
    regexp_extract("Activity_Date", r"^(\w{3})", 1).alias("Month"),
    regexp_extract("Activity_Date", r"^\w{3}\s\d{1,2}(,)\s(.{4})",2).alias("Year"),
).show(5,truncate=False)

- ok, you got the year.  why is it group 2 though? regex.

In [24]:
# the answer to my question on stackoverflow : )


from pyspark.sql import functions as F

df1 = (df.withColumn("Activity_date", F.to_timestamp("Activity_date", "MMM d, yyyy, h:mm:ss a"))
       .withColumn("Month", F.date_format("Activity_date", "MMM"))
       .withColumn("Year", F.year("Activity_date"))
       )

df1.show(vertical=True)

-RECORD 0--------------------------------
 _c0               | 0                   
 Activity_ID       | 350128633           
 Activity_date     | 2015-07-20 13:06:57 
 Activity_name     | Renecca creek rd    
 Activity_Type     | Ride                
 Elapsed_Time_Secs | 2352                
 Distance_KM       | 13.48               
 Elapsed_Time_1    | 2352.0              
 Moving_Time_Secs  | 2272.0              
 Distance_1        | 13482.599609375     
 Max_Speed         | 16.0                
 Average_Speed     | null                
 Elevation_Gain    | 187.07899475097656  
 Elevation_Loss    | null                
 Elevation_Low     | 283.79998779296875  
 Elevation_High    | 406.8999938964844   
 Max_Grade         | 16.799999237060547  
 Average_Grade     | 0.7268630266189575  
 Average_Watts     | null                
 Calories          | null                
 Month             | Jul                 
 Year              | 2015                
-RECORD 1-------------------------

In [25]:
example = "Jul 20, 2015, 1:06:57 PM"

In [26]:
re.search(r"^\w{3}", example)

<re.Match object; span=(0, 3), match='Jul'>

In [27]:
re.search(r"^\w{3}\s\d{1,2}", example)

<re.Match object; span=(0, 6), match='Jul 20'>

In [28]:
re.search(r"^\w{3}\s\d{1,2}(,)", example)

<re.Match object; span=(0, 7), match='Jul 20,'>

In [29]:
re.search(r"^\w{3}\s\d{1,2}(,)\s(.{4})", example)

<re.Match object; span=(0, 12), match='Jul 20, 2015'>

In [30]:
df.filter(df.Distance_KM > 80).show(5, vertical=True)

-RECORD 0---------------------------------
 _c0               | 176                  
 Activity_ID       | 3753411577           
 Activity_date     | Jul 12, 2020, 3:0... 
 Activity_name     | Morning Ride         
 Activity_Type     | Ride                 
 Elapsed_Time_Secs | 16873                
 Distance_KM       | 82.52                
 Elapsed_Time_1    | 16873.0              
 Moving_Time_Secs  | 13586.0              
 Distance_1        | 82523.8984375        
 Max_Speed         | 13.600000381469728   
 Average_Speed     | 6.0741868019104      
 Elevation_Gain    | 505.89007568359375   
 Elevation_Loss    | 505.8900146484375    
 Elevation_Low     | 200.0                
 Elevation_High    | 319.20001220703125   
 Max_Grade         | 22.600000381469727   
 Average_Grade     | 0.0                  
 Average_Watts     | 146.6240692138672    
 Calories          | 2221.117431640625    
-RECORD 1---------------------------------
 _c0               | 194                  
 Activity_I

In [31]:
df.filter(df.Distance_KM < 10).where(df.Activity_Type == 'Ride').show(5, vertical=True)

-RECORD 0---------------------------------
 _c0               | 2                    
 Activity_ID       | 797001969            
 Activity_date     | Dec 8, 2016, 4:28... 
 Activity_name     | Morning Ride         
 Activity_Type     | Ride                 
 Elapsed_Time_Secs | 8                    
 Distance_KM       | 0.03                 
 Elapsed_Time_1    | 8.0                  
 Moving_Time_Secs  | 8.0                  
 Distance_1        | 30.600000381469727   
 Max_Speed         | 0.2000000029802322   
 Average_Speed     | null                 
 Elevation_Gain    | 0.0                  
 Elevation_Loss    | null                 
 Elevation_Low     | 207.8999938964844    
 Elevation_High    | 207.8999938964844    
 Max_Grade         | 0.0                  
 Average_Grade     | 0.0                  
 Average_Watts     | 0.0                  
 Calories          | null                 
-RECORD 1---------------------------------
 _c0               | 6                    
 Activity_I

In [32]:
df.select(df.Distance_KM,when(df.Distance_KM > 80, 'Long Ride')
          .otherwise("Less Long")
          .alias('Ride_Length')
         ).where(df.Distance_KM > 50).show(20)
#I added a where statement to narrow the results a little and actually see 
# some long rides.
# There are fewer than 20 total in the filter

+-----------+-----------+
|Distance_KM|Ride_Length|
+-----------+-----------+
|      56.76|  Less Long|
|      76.13|  Less Long|
|      52.48|  Less Long|
|      82.52|  Long Ride|
|      97.52|  Long Ride|
|      84.81|  Long Ride|
|      119.4|  Long Ride|
|      57.08|  Less Long|
|     161.35|  Long Ride|
|      79.74|  Less Long|
|      51.85|  Less Long|
|      57.85|  Less Long|
|      54.51|  Less Long|
|      58.85|  Less Long|
+-----------+-----------+



In [33]:
df.select(df.Distance_KM, 
         (when(df.Distance_KM<10,'very short')
         .when(df.Distance_KM<25,'short')
         .when(df.Distance_KM<50,'medium')
         .when(df.Distance_KM<80,'long')
         .otherwise('very long')
         .alias('Ride_Length'))
         ).where(df.Activity_Type=='Ride').show(50)

+-----------+-----------+
|Distance_KM|Ride_Length|
+-----------+-----------+
|      13.48|      short|
|      23.61|      short|
|       0.03| very short|
|      10.22|      short|
|      26.16|     medium|
|      12.25|      short|
|       9.82| very short|
|      37.42|     medium|
|      47.55|     medium|
|       5.49| very short|
|       3.69| very short|
|      28.42|     medium|
|      18.62|      short|
|      33.63|     medium|
|      16.09|      short|
|      48.86|     medium|
|      16.74|      short|
|      46.25|     medium|
|        6.4| very short|
|      18.31|      short|
|      50.69|       long|
|      56.76|       long|
|      35.21|     medium|
|      32.32|     medium|
|      11.97|      short|
|      37.74|     medium|
|      76.13|       long|
|       36.8|     medium|
|       5.07| very short|
|      30.49|     medium|
|      46.69|     medium|
|      24.74|      short|
|       5.41| very short|
|      35.51|     medium|
|      11.34|      short|
|      17.63

In [34]:
df.columns

['_c0',
 'Activity_ID',
 'Activity_date',
 'Activity_name',
 'Activity_Type',
 'Elapsed_Time_Secs',
 'Distance_KM',
 'Elapsed_Time_1',
 'Moving_Time_Secs',
 'Distance_1',
 'Max_Speed',
 'Average_Speed',
 'Elevation_Gain',
 'Elevation_Loss',
 'Elevation_Low',
 'Elevation_High',
 'Max_Grade',
 'Average_Grade',
 'Average_Watts',
 'Calories']

In [35]:
df.select(df.Distance_KM, df.Elapsed_Time_Secs).withColumn("Avg_Speed_KM_per_Hr", \
                    col('Distance_KM') / (col('Elapsed_Time_Secs')/3600)).show()

+-----------+-----------------+-------------------+
|Distance_KM|Elapsed_Time_Secs|Avg_Speed_KM_per_Hr|
+-----------+-----------------+-------------------+
|      13.48|             2352|  20.63265306122449|
|      23.61|             5495| 15.467879890809828|
|       0.03|                8|               13.5|
|      10.22|             7942|  4.632586250314783|
|      26.16|             4983|   18.8994581577363|
|      12.25|            11270| 3.9130434782608696|
|       9.82|             8235|  4.292896174863388|
|      37.42|             9322| 14.450976185367946|
|      47.55|            14725| 11.625127334465194|
|       5.49|             2365|  8.356871035940804|
|       3.69|             1272| 10.443396226415095|
|      28.42|             6665| 15.350637659414854|
|      18.62|           267771|0.25033330719159286|
|      33.63|            17863|  6.777584952135699|
|      16.09|            15660|  3.698850574712644|
|      48.86|            15832| 11.110156644770086|
|      16.74

In [36]:
df.select(df.Distance_KM, df.Elapsed_Time_Secs).withColumn("Avg_Speed_KM_per_Hr", \
col('Distance_KM') / (col('Elapsed_Time_Secs')/3600)).sort(desc('Avg_Speed_KM_per_Hr'))\
.show()

+-----------+-----------------+-------------------+
|Distance_KM|Elapsed_Time_Secs|Avg_Speed_KM_per_Hr|
+-----------+-----------------+-------------------+
|      46.26|             5975| 27.872133891213387|
|      40.51|             5333|  27.34595912244515|
|      39.02|             5182| 27.107680432265536|
|      44.31|             6179| 25.815827803851757|
|      25.47|             3630| 25.259504132231406|
|      38.49|             5555| 24.944014401440143|
|      23.69|             3464|  24.62009237875289|
|      26.28|             3873| 24.427575522850503|
|      28.78|             4252|  24.36688617121355|
|       6.27|              934| 24.167023554603855|
|      26.02|             3892| 24.067831449126412|
|      28.62|             4285|  24.04480746791132|
|      32.41|             4878|  23.91881918819188|
|       7.32|             1104| 23.869565217391308|
|       38.9|             5914| 23.679404802164356|
|      39.35|             5985| 23.669172932330827|
|      25.88

- Great use of sort()

In [37]:
df.select(
    df.Distance_KM, 
         (
             when(df.Distance_KM<10,'very short')
         .when(df.Distance_KM<25,'short')
         .when(df.Distance_KM<50,'medium')
         .when(df.Distance_KM<80,'long')
         .otherwise('very long')
         .alias('Ride_Length')
         )
    ,df.Elapsed_Time_Secs)\
    .withColumn("Avg_Speed_KM_per_Hr",
                col('Distance_KM') / (col('Elapsed_Time_Secs')/3600))\
    .sort(asc('Ride_Length'))\
    .sort(desc('Avg_Speed_KM_per_Hr'))\
    .where(df.Activity_Type=='Ride').show(20)

+-----------+-----------+-----------------+-------------------+
|Distance_KM|Ride_Length|Elapsed_Time_Secs|Avg_Speed_KM_per_Hr|
+-----------+-----------+-----------------+-------------------+
|      46.26|     medium|             5975| 27.872133891213387|
|      40.51|     medium|             5333|  27.34595912244515|
|      39.02|     medium|             5182| 27.107680432265536|
|      44.31|     medium|             6179| 25.815827803851757|
|      25.47|     medium|             3630| 25.259504132231406|
|      38.49|     medium|             5555| 24.944014401440143|
|      23.69|      short|             3464|  24.62009237875289|
|      26.28|     medium|             3873| 24.427575522850503|
|      28.78|     medium|             4252|  24.36688617121355|
|       6.27| very short|              934| 24.167023554603855|
|      26.02|     medium|             3892| 24.067831449126412|
|      28.62|     medium|             4285|  24.04480746791132|
|      32.41|     medium|             48

- not sure it's possible to groupBy then sort
- maybe by using .agg()?

In [81]:
df.select(
    df.Distance_KM, 
         (
             when(df.Distance_KM<10,'very short')
         .when(df.Distance_KM<25,'short')
         .when(df.Distance_KM<50,'medium')
         .when(df.Distance_KM<80,'long')
         .otherwise('very long')
         .alias('Ride_Length')
         )
    ,df.Elapsed_Time_Secs)\
    .withColumn("Avg_Speed_KM_per_Hr",
                col('Distance_KM') / (col('Elapsed_Time_Secs')/3600))\
    .orderBy('Ride_Length',desc('Avg_Speed_KM_per_Hr'))\
    .where(col('Ride_Length')=='very long').show()

+-----------+-----------+-----------------+-------------------+
|Distance_KM|Ride_Length|Elapsed_Time_Secs|Avg_Speed_KM_per_Hr|
+-----------+-----------+-----------------+-------------------+
|     161.35|  very long|            28995| 20.033109156751163|
|      119.4|  very long|            23464| 18.319127173542448|
|      82.52|  very long|            16873|   17.6063533455817|
|      97.52|  very long|            20592|  17.04895104895105|
|      84.81|  very long|            18190| 16.784826827927432|
+-----------+-----------+-----------------+-------------------+



## Finally got it--remember to define the two arguments in the orderBy()

In [84]:
df.select(
    df.Distance_KM, 
         (
             when(df.Distance_KM<10,'very short')
         .when(df.Distance_KM<25,'short')
         .when(df.Distance_KM<50,'medium')
         .when(df.Distance_KM<80,'long')
         .otherwise('very long')
         .alias('Ride_Length')
         )
    ,df.Elapsed_Time_Secs)\
    .withColumn("Avg_Speed_KM_per_Hr",
                col('Distance_KM') / (col('Elapsed_Time_Secs')/3600))\
    .rollup('Ride_Length').count().sort('Ride_Length').show()

+-----------+-----+
|Ride_Length|count|
+-----------+-----+
|       null|  521|
|       long|   12|
|     medium|  122|
|      short|   89|
|  very long|    5|
| very short|  293|
+-----------+-----+



## `rollup` showing the count of ride lengths...where are the nulls coming from?
## oh, that's an aspect of .rollup() duh

In [89]:
df.select(
    df.Distance_KM, 
         (
             when(df.Distance_KM<10,'very short')
         .when(df.Distance_KM<25,'short')
         .when(df.Distance_KM<50,'medium')
         .when(df.Distance_KM<80,'long')
         .otherwise('very long')
         .alias('Ride_Length')
         )
    ,df.Elapsed_Time_Secs)\
    .withColumn("Avg_Speed_KM_per_Hr",
                col('Distance_KM') / (col('Elapsed_Time_Secs')/3600))\
             ,(
             when(col('Avg_Speed_KM_per_Hr')<10,'slow')
         .when(col('Avg_Speed_KM_per_Hr')<15,'medium')
         .otherwise('fast')
         .alias('Ride_Speed')
         ).where(df.Activity_Type=='Ride').show(20)

TypeError: 'Column' object is not callable