# GraphX and GraphFrames

In [2]:
#importing the libraries
from pyspark import SparkContext, SQLContext
from graphframes import *
# Reuse the already-running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()
# Raw string literal: the Windows path contains "\s", "\j" and "\g", which are
# invalid escape sequences in a normal string (DeprecationWarning / SyntaxWarning).
# The resulting path value is unchanged.
sc.addPyFile(r'C:\spark-2.4.4-bin-hadoop2.7\jars\graphframes-0.7.0-spark2.4-s_2.11.jar')
# SQLContext bound to the SparkContext; used below to read the CSV files.
sqlcontext = SQLContext(sc)

## 1.	Import the dataset as a CSV file, create data frames directly on import, and then create a graph out of the data frames created.

In [3]:
# Vertices: the distinct station names from the station file, exposed as column "id".
StationDataset = (
    sqlcontext.read
    .format("csv")
    .option("header", "true")
    .csv('D:/Datasets/201508_station_data.csv')
    .withColumnRenamed("name", "id")
    .select("id")
    .distinct()
)

# Edges: one row per (start station, end station) pair, with a "count" column
# holding the number of trip rows for that pair.
TripDataset = (
    sqlcontext.read
    .format("csv")
    .option("header", "true")
    .csv('D:/Datasets/201508_trip_data.csv')
    .withColumnRenamed("Start Station", "src")
    .withColumnRenamed("End Station", "dst")
    .select("src", "dst")
    .groupBy("src", "dst")
    .count()
)

## Creating the graph

In [5]:
# Build the graph: stations are the vertices ("id"), aggregated trips are the
# edges ("src", "dst", "count").
Data = GraphFrame(v=StationDataset, e=TripDataset)

In [11]:
# Print the graph's summary string, which lists the vertex and edge column schemas.
print(Data)

GraphFrame(v:[id: string], e:[src: string, dst: string ... 1 more field])


## 2. Triangle Count

In [7]:
# Run the triangle-count algorithm; the result has one row per station ("id")
# with its "count" value.
triangle_counts = Data.triangleCount()
triangle_counts.show()

+-----+--------------------+
|count|                  id|
+-----+--------------------+
|  496|       2nd at Folsom|
|   23|California Ave Ca...|
|    0|Washington at Kea...|
|  496|Powell at Post (U...|
|  496| Golden Gate at Polk|
|  496|Yerba Buena Cente...|
|  496|   Market at Sansome|
|   90|         MLK Library|
|  496|     Spear at Folsom|
|   77|           Japantown|
|  496|Commercial at Mon...|
|   81|Paseo de San Antonio|
|   23|Rengstorff Avenue...|
|   61| San Salvador at 1st|
|  496|     Townsend at 7th|
|  496|Civic Center BART...|
|   41|         Ryland Park|
|   90|San Jose Diridon ...|
|   63|San Jose Civic Ce...|
|    0|     Post at Kearney|
+-----+--------------------+
only showing top 20 rows



## 3.	Find Shortest Paths w.r.t. Landmarks 

In [10]:
# Landmark stations to measure distances to; the result adds a "distances"
# column mapping each reachable landmark to its distance.
landmark_stations = ["Japantown", "MLK Library"]
Data.shortestPaths(landmarks=landmark_stations).show()

+--------------------+--------------------+
|                  id|           distances|
+--------------------+--------------------+
|       2nd at Folsom|                  []|
|      Market at 10th|                  []|
|California Ave Ca...|                  []|
|Washington at Kea...|                  []|
|Redwood City Publ...|                  []|
|Powell at Post (U...|                  []|
| Golden Gate at Polk|                  []|
|    Adobe on Almaden|[MLK Library -> 1...|
|Broadway St at Ba...|                  []|
|Yerba Buena Cente...|                  []|
|     Beale at Market|                  []|
|   Market at Sansome|                  []|
|         MLK Library|[MLK Library -> 0...|
|     Spear at Folsom|                  []|
|       5th at Howard|                  []|
|           Japantown|[Japantown -> 0, ...|
|Commercial at Mon...|                  []|
|    San Pedro Square|[Japantown -> 1, ...|
|Paseo de San Antonio|[Japantown -> 1, ...|
|Redwood City Medi...|          

## 4.	Apply Page Rank algorithm on the dataset.

In [7]:
# Run PageRank with a tolerance-based stop instead of a fixed iteration count.
# `results` is a graph whose vertices carry "pagerank" and whose edges carry "weight".
results = Data.pageRank(
    resetProbability=0.15,
    tol=0.01,
)

In [8]:
# PageRank score per station.
vertex_ranks = results.vertices.select("id", "pagerank")
vertex_ranks.show()

+--------------------+-------------------+
|                  id|           pagerank|
+--------------------+-------------------+
|       2nd at Folsom| 0.9824888917828896|
|      Market at 10th| 1.0381304111380782|
|California Ave Ca...|  1.117214156049386|
|Washington at Kea...|0.16391984022625664|
|Redwood City Publ...| 0.6728086774388868|
|Powell at Post (U...| 1.0381304111380778|
| Golden Gate at Polk|  1.038130411138078|
|    Adobe on Almaden| 0.9037422230312736|
|Broadway St at Ba...| 1.0115490961700715|
|Yerba Buena Cente...| 1.0381304111380782|
|     Beale at Market| 1.0381304111380782|
|   Market at Sansome| 1.0381304111380782|
|         MLK Library| 1.0850875307932697|
|     Spear at Folsom| 1.0381304111380782|
|       5th at Howard| 1.0381304111380782|
|           Japantown| 1.1277529076126747|
|Commercial at Mon...| 1.0381304111380782|
|    San Pedro Square| 1.2625573035092683|
|Paseo de San Antonio| 1.1971448900978934|
|Redwood City Medi...| 0.4022808574359379|
+----------

In [9]:
# Edge weight assigned by PageRank for every (src, dst) pair.
edge_weights = results.edges.select("src", "dst", "weight")
edge_weights.show()

+--------------------+--------------------+--------------------+
|                 src|                 dst|              weight|
+--------------------+--------------------+--------------------+
|       2nd at Folsom|San Francisco Cal...|0.030303030303030304|
|   Market at Sansome|       Market at 4th|0.030303030303030304|
|     Spear at Folsom|    Davis at Jackson|0.030303030303030304|
|Commercial at Mon...|Embarcadero at Br...|0.030303030303030304|
|     Townsend at 7th|San Francisco Cal...|0.030303030303030304|
|          Mezes Park|          Mezes Park|                0.25|
|       5th at Howard|Grant Avenue at C...|0.030303030303030304|
|San Francisco Cal...|Embarcadero at Sa...|0.030303030303030304|
|     Townsend at 7th|       Howard at 2nd|0.030303030303030304|
|SJSU - San Salvad...|SJSU - San Salvad...| 0.07692307692307693|
|Embarcadero at Sa...|Temporary Transba...|0.030303030303030304|
|San Francisco Cal...|  Powell Street BART|0.030303030303030304|
|   2nd at South Park|   

## 5.	Save graphs generated to a file.

In [1]:
# Persist the graph as two Parquet datasets (vertices and edges).
# This cell needs `Data` from the graph-creation cell, so it must run after it.
# mode("overwrite") lets the cell be re-run: the default save mode raises an
# error when the output path already exists.
Data.vertices.write.mode("overwrite").parquet('vertices')
Data.edges.write.mode("overwrite").parquet('edges')

NameError: name 'Data' is not defined

# Bonus:
## 1.	Apply Label Propagation Algorithm

In [5]:
# Label propagation, limited to 5 iterations; each station receives a "label".
result = Data.labelPropagation(maxIter=5)
community_labels = result.select("id", "label")
community_labels.show()

+--------------------+-------------+
|                  id|        label|
+--------------------+-------------+
|       2nd at Folsom|            0|
|      Market at 10th|            0|
|California Ave Ca...|1649267441664|
|Washington at Kea...|  17179869184|
|Redwood City Publ...| 730144440320|
|Powell at Post (U...|            0|
| Golden Gate at Polk|            0|
|    Adobe on Almaden| 257698037761|
|Broadway St at Ba...|            0|
|Yerba Buena Cente...|            0|
|     Beale at Market|            0|
|   Market at Sansome|            0|
|         MLK Library| 257698037761|
|     Spear at Folsom|            0|
|       5th at Howard|            0|
|           Japantown| 257698037761|
|Commercial at Mon...|            0|
|    San Pedro Square| 257698037761|
|Paseo de San Antonio| 257698037761|
|Redwood City Medi...| 730144440320|
+--------------------+-------------+
only showing top 20 rows



## 2.	Apply BFS algorithm

In [6]:
# Breadth-first search from the start station to the target station; each
# result row is one path (from-vertex, edges, to-vertex).
paths = Data.bfs(
    fromExpr="id = 'Spear at Folsom'",
    toExpr="id = 'Golden Gate at Polk'",
)
paths.show()

+-----------------+--------------------+--------------------+
|             from|                  e0|                  to|
+-----------------+--------------------+--------------------+
|[Spear at Folsom]|[Spear at Folsom,...|[Golden Gate at P...|
+-----------------+--------------------+--------------------+

