### Installing Necessary Packages

In [1]:
!pip install pyspark
!pip install graphframes

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=5b7edb855ee00e98152a83b5590ddb4b5162b21f5b01baa3e26c81a9218176b3
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m4.3 MB/s[0m eta [

### Importing Necessary Packages

In [2]:
import os
import pyspark.sql.functions as F
from graphframes import GraphFrame
from pyspark.sql import SparkSession,DataFrame
from tqdm.notebook import tqdm

### Mount the drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Global

In [4]:
DATA_DIR = "/content/drive/MyDrive/bdt/TP-6/bike-data"
STATION_DATA_PATH = os.path.join(DATA_DIR, "station_data.csv")
TRIP_DATA_PATH = os.path.join(DATA_DIR, "trip_data.csv")

### Question 01 : dataframes creation

In [5]:
spark  = SparkSession.builder.master("local[*]").config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12").getOrCreate()

In [6]:
stations = spark.read.csv(STATION_DATA_PATH, header=True, inferSchema=True)
trips = spark.read.csv(TRIP_DATA_PATH, header=True, inferSchema=True)

In [7]:
stations.show(5)

+----------+--------------------+---------+-----------+---------+--------+------------+
|station_id|                name|      lat|       long|dockcount|landmark|installation|
+----------+--------------------+---------+-----------+---------+--------+------------+
|         2|San Jose Diridon ...|37.329732|-121.901782|       27|San Jose|    8/6/2013|
|         3|San Jose Civic Ce...|37.330698|-121.888979|       15|San Jose|    8/5/2013|
|         4|Santa Clara at Al...|37.333988|-121.894902|       11|San Jose|    8/6/2013|
|         5|    Adobe on Almaden|37.331415|  -121.8932|       19|San Jose|    8/5/2013|
|         6|    San Pedro Square|37.336721|-121.894074|       15|San Jose|    8/7/2013|
+----------+--------------------+---------+-----------+---------+--------+------------+
only showing top 5 rows



In [8]:
trips.show(5)

+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+------+---------------+--------+
|Trip ID|Duration|     Start Date|       Start Station|Start Terminal|       End Date|         End Station|End Terminal|Bike #|Subscriber Type|Zip Code|
+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+------+---------------+--------+
| 913460|     765|8/31/2015 23:26|Harry Bridges Pla...|            50|8/31/2015 23:39|San Francisco Cal...|          70|   288|     Subscriber|    2139|
| 913459|    1036|8/31/2015 23:11|San Antonio Shopp...|            31|8/31/2015 23:28|Mountain View Cit...|          27|    35|     Subscriber|   95032|
| 913455|     307|8/31/2015 23:13|      Post at Kearny|            47|8/31/2015 23:18|   2nd at South Park|          64|   468|     Subscriber|   94107|
| 913454|     409|8/31/2015 23:10|  San Jose City Hall|            10|8/31/2015 23

### Question 02 : Rename the column `name` to `id`

In [9]:
stations = stations.withColumnRenamed("name", "id")

In [10]:
print(stations.columns)

['station_id', 'id', 'lat', 'long', 'dockcount', 'landmark', 'installation']


### Question 03 : Change the names of the columns `Start Station` and `End Station` to `src` and `dst` respectively.

In [11]:
trips = trips \
  .withColumnRenamed("Start Station", "src") \
  .withColumnRenamed("End Station", "dst")

In [12]:
print(trips.columns)

['Trip ID', 'Duration', 'Start Date', 'src', 'Start Terminal', 'End Date', 'dst', 'End Terminal', 'Bike #', 'Subscriber Type', 'Zip Code']


### Question 04 : Graph creation

In [13]:
graph = GraphFrame(stations, trips)



### Question 05 : The number of trips for each source and destination in descending order

In [14]:
result = graph.edges.groupBy("src", "dst").count().orderBy(F.desc("count"))
result.show(5)

+--------------------+--------------------+-----+
|                 src|                 dst|count|
+--------------------+--------------------+-----+
|San Francisco Cal...|     Townsend at 7th|    4|
|       5th at Howard|San Francisco Cal...|    3|
|     2nd at Townsend|   Market at Sansome|    2|
|   Steuart at Market|San Francisco Cal...|    2|
|     Spear at Folsom|     2nd at Townsend|    2|
+--------------------+--------------------+-----+
only showing top 5 rows



### Question 06 : The number trip where the source or destination is `Townsend at 7th` in descending order

In [15]:
graph.edges \
  .filter('src = "Townsend at 7th" or dst == "Townsend at 7th"') \
  .groupBy("src", "dst") \
  .count() \
  .orderBy(F.desc("count")).show(n=5)

+--------------------+--------------------+-----+
|                 src|                 dst|count|
+--------------------+--------------------+-----+
|San Francisco Cal...|     Townsend at 7th|    4|
|       5th at Howard|     Townsend at 7th|    1|
|     Townsend at 7th|     Spear at Folsom|    1|
|     Townsend at 7th|Harry Bridges Pla...|    1|
|     Spear at Folsom|     Townsend at 7th|    1|
+--------------------+--------------------+-----+
only showing top 5 rows



### Question 07 : Return the stations that were never a destination to a trip that start at `Spear at Folsom`.

In [40]:
graph.edges.select("src").union(
    graph.edges.select("dst")
).distinct().subtract(graph.edges.filter("src = 'Spear at Folsom'").select('dst').distinct()).show(5)

+--------------------+
|                 src|
+--------------------+
| Golden Gate at Polk|
|Yerba Buena Cente...|
|   Market at Sansome|
|     Spear at Folsom|
|Commercial at Mon...|
+--------------------+
only showing top 5 rows



### Question 08 : the station with the most inDegrees

In [20]:
graph.inDegrees.orderBy(F.desc("inDegree")).limit(1).show()

+--------------------+--------+
|                  id|inDegree|
+--------------------+--------+
|San Francisco Cal...|       9|
+--------------------+--------+



### Question 09 : Return the trip with the longest duration

In [21]:
graph.edges.orderBy('Duration').limit(1).show()

+-------+--------+---------------+---------------+--------------+---------------+--------------------+------------+------+---------------+--------+
|Trip ID|Duration|     Start Date|            src|Start Terminal|       End Date|                 dst|End Terminal|Bike #|Subscriber Type|Zip Code|
+-------+--------+---------------+---------------+--------------+---------------+--------------------+------------+------+---------------+--------+
| 913449|     126|8/31/2015 22:12|Beale at Market|            56|8/31/2015 22:15|Temporary Transba...|          55|   439|     Subscriber|   94130|
+-------+--------+---------------+---------------+--------------+---------------+--------------------+------------+------+---------------+--------+



### Question 10 : Create a subgraph with the trips that starts or end at `Townsend at 7th`

In [22]:
edges = graph.edges.where('src = "Townsend at 7th" or dst = "Townsend at 7th"')
vertices = graph.vertices

In [23]:
sub_graph = GraphFrame(vertices, edges)



### Question 11 :

In [46]:
graph.find("(a)-[]->(b); (b)-[]->(c); (c)-[]->(a)").show(5)

+--------------------+--------------------+--------------------+
|                   a|                   b|                   c|
+--------------------+--------------------+--------------------+
|{49, Spear at Fol...|{69, San Francisc...|{61, 2nd at Towns...|
|{49, Spear at Fol...|{69, San Francisc...|{65, Townsend at ...|
|{49, Spear at Fol...|{69, San Francisc...|{64, 2nd at South...|
|{49, Spear at Fol...|{69, San Francisc...|{64, 2nd at South...|
|{49, Spear at Fol...|{69, San Francisc...|{65, Townsend at ...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



### Question 12 : get the paths that pass throught tow stations and starts at `Townsend at 7th`.

In [56]:
graph.find("(a)-[]->(b); (b)-[]->(c); (c)-[]->(b); (c)-[]->(d)").filter("a.id = 'Townsend at 7th'").show(5)



+--------------------+--------------------+--------------------+--------------------+
|                   a|                   b|                   c|                   d|
+--------------------+--------------------+--------------------+--------------------+
|{65, Townsend at ...|{49, Spear at Fol...|{61, 2nd at Towns...|{49, Spear at Fol...|
|{65, Townsend at ...|{49, Spear at Fol...|{61, 2nd at Towns...|{77, Market at Sa...|
|{65, Townsend at ...|{49, Spear at Fol...|{61, 2nd at Towns...|{63, Howard at 2n...|
|{65, Townsend at ...|{49, Spear at Fol...|{61, 2nd at Towns...|{77, Market at Sa...|
|{65, Townsend at ...|{49, Spear at Fol...|{61, 2nd at Towns...|{74, Steuart at M...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows

