## Import needed libraries and contexts

In [232]:
import findspark
# Raw string: the backslash in the Windows path must not be read as an escape
# sequence. Either call findspark.init() with no argument or pass the exact
# location of the Spark installation, as done here.
findspark.init(r"C:\Spark")

import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.functions import col, expr, when

# Initialize the Spark entry points used by the rest of the notebook.
# getOrCreate() reuses a SparkContext that is already running and creates one
# otherwise, so this cell does not depend on a pre-existing `sc` variable.
sc = SparkContext.getOrCreate(SparkConf().setAppName("myapp"))  # sparkContext
sqlContext = SQLContext(sc)  # sqlContext


# CSV file that is loaded below with both Pandas & Spark (SparkSQL)
in_file = 'titanic_data.csv'



## Counting in Pandas and Spark

In [233]:
# Load the same CSV twice: once as a Pandas DataFrame, once as a Spark DataFrame.
pandasDF = pd.read_csv(in_file)
sparkDF = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true')  # treat the first line of the file as column names
    .load(in_file)
)

# .count() differs between the two libraries: Pandas reports a count per column,
# while Spark reports a single number of rows.
display("PandasDF Count", pandasDF.count())
display("SparkDF", sparkDF.count())


'PandasDF Count'

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

'SparkDF'

891

The difference between the two .count() methods is:
    1. Spark .count(): returns the total number of rows
    2. Pandas .count(): returns, for each column, the number of rows whose value is not None/NaN

## Displaying data in Pandas and Spark

Pandas renders a DataFrame as a clean table, whereas Spark's output can be harder to read. In Spark, use .show() to display rows instead of the .head() / .tail() calls you would use in Pandas.

In [234]:
# Preview the first five rows of the Pandas DataFrame.
pandasDF.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [235]:
# Print the first five rows of the Spark DataFrame as a text table.
sparkDF.show(n=5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+---------------

## DataWrangling

In Spark, DataFrames are immutable and cannot be altered in place; you can, however, use .withColumn() to build a new DataFrame with a changed column. Pandas, on the other hand, allows in-place changes via [].

In Pandas, there are many ways to alter values:
    1. ".loc[]": df.loc[df['column_name'] == value, 'column_name'] = 'your_value'
    2. ".apply() + lambda": df['column_name'].apply(lambda x: 'Value1' if condition else 'value 2')

### Pandas Wrangling

In [236]:
# Work on a copy so that pandasDF itself is left unchanged.
pandasDF2 = pandasDF.copy()

# Relabel the Survived column with .apply() + lambda:
# a value equal to 1 becomes 'True', every other value becomes 'False'.
# The .loc[] alternative would be:
#   pandasDF2.loc[pandasDF2['Survived'] == 1, 'Survived'] = 'True'
#   pandasDF2.loc[pandasDF2['Survived'] == 0, 'Survived'] = 'False'
pandasDF2["Survived"] = pandasDF2["Survived"].apply(
    lambda flag: "True" if flag == 1 else "False"
)
pandasDF2.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,False,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,True,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,True,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [237]:
# Remove the 'Embarked' column; drop() returns a new frame, which is bound back
# to pandasDF2. NOTE: re-running this cell once the column is gone raises KeyError.
pandasDF2 = pandasDF2.drop(['Embarked'], axis='columns')
pandasDF2.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
0,1,False,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,
1,2,True,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85


### Spark Wrangling

In [238]:
# Spark counterpart of the Pandas relabeling (left commented out, not executed):
# withColumn() returns a new DataFrame in which null or "0" maps to "False"
# and every other value maps to "True".
#sparkDF2 = sparkDF.withColumn("Survived", when(col("Survived").isNull() | (col("Survived") == "0"), "False").otherwise("True"))

In [239]:
# Load the Boston housing dataset
#data = pd.read_csv('housing.csv')
#prices = data['MEDV']
#features = data.drop('MEDV', axis = 1)
    

# sc.parallelize() examples

In [240]:

# Stop the SparkContext (sc) as the last step of the notebook.
sc.stop()