<a href="https://colab.research.google.com/github/arieljumba/Colab-Projects/blob/main/pyspark_pypy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#!pip install pyspark
#!pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [12]:
import pyspark
import pandas as pd
import findspark

findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession\
        .builder\
        .appName('first_spark_app')\
        .config("spark.some.config.option","2")\
        .getOrCreate()

In [14]:
df = spark.sparkContext\
    .parallelize([(1,2,3, 'a b c'),
                  (4,5,6, 'd e f'),
                  (7,8,9, 'g h i')])\
                  .toDF(['col1', 'col2', 'col3', 'col4'])
df.show()

+----+----+----+-----+
|col1|col2|col3| col4|
+----+----+----+-----+
|   1|   2|   3|a b c|
|   4|   5|   6|d e f|
|   7|   8|   9|g h i|
+----+----+----+-----+



# PANDAS VS PYSPARK

## Create DataFrame

In [28]:
mylist = [['Allan','Male',None],['Kate','Female',32],['Alice','Female',29]]
col_names = ['name','gender','age']

In [30]:
#pandas
df_p1 = pd.DataFrame(mylist,columns = col_names)
df_p1

Unnamed: 0,name,gender,age
0,Allan,Male,
1,Kate,Female,32.0
2,Alice,Female,29.0


In [32]:
#spark
df_s1 = spark.createDataFrame(mylist,col_names)
df_s1.show()

+-----+------+----+
| name|gender| age|
+-----+------+----+
|Allan|  Male|null|
| Kate|Female|  32|
|Alice|Female|  29|
+-----+------+----+



## Load CSV Files

In [21]:
# pandas
df_p = pd.read_csv('advertising.csv')
df_p.head(3)

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3


In [22]:
#pyspark
df_s = spark.read.format('csv').load('advertising.csv',header = True)
df_s.show(3)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
+-----+-----+---------+-----+
only showing top 3 rows



## Column names

In [24]:
#pandas
df_p.columns

Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')

In [23]:
#pyspark
df_s.columns

['TV', 'Radio', 'Newspaper', 'Sales']

## Data Types

In [37]:
#pandas
df_p.dtypes

name       object
gender     object
age       float64
dtype: object

In [26]:
#pyspark
df_s.dtypes

[('TV', 'string'),
 ('Radio', 'string'),
 ('Newspaper', 'string'),
 ('Sales', 'string')]

## Fill Nulls

In [33]:
#pandas
df_p1.fillna(45)

Unnamed: 0,name,gender,age
0,Allan,Male,45.0
1,Kate,Female,32.0
2,Alice,Female,29.0


In [35]:
#spark
df_s1.fillna(45).show()

+-----+------+---+
| name|gender|age|
+-----+------+---+
|Allan|  Male| 45|
| Kate|Female| 32|
|Alice|Female| 29|
+-----+------+---+



## Replace Values

In [42]:
#pandas - mixed type replacements suppported
df_p1.name.replace(['Kate'],['Angie'],inplace=True)
df_p1

Unnamed: 0,name,gender,age
0,Allan,Male,
1,Angie,Female,32.0
2,Alice,Female,29.0


In [45]:
#spark - mixed type replacements not suppported
df_s1.replace(['Kate'],['Angie']).show()

+-----+------+----+
| name|gender| age|
+-----+------+----+
|Allan|  Male|null|
|Angie|Female|  32|
|Alice|Female|  29|
+-----+------+----+



## Renaming Multiple Columns

In [47]:
#pandas
new_cols = {'name':'A','gender':'B','age':'C'}
df_p1.rename(columns = new_cols).head()

Unnamed: 0,A,B,C
0,Allan,Male,
1,Angie,Female,32.0
2,Alice,Female,29.0


In [48]:
#spark
new_cols = {'name':'A','gender':'B','age':'C'}
new_cols_s = [new_cols.get(col,col) for col in df_s1.columns]
df_s1.toDF(*new_cols_s).show()

+-----+------+----+
|    A|     B|   C|
+-----+------+----+
|Allan|  Male|null|
| Kate|Female|  32|
|Alice|Female|  29|
+-----+------+----+



## Drop Columns

In [50]:
#pandas
df_p1.drop(['name','age'],axis =1).head()

Unnamed: 0,gender
0,Male
1,Female
2,Female


In [53]:
spark
df_s1.drop(*['name','age']).show()

+------+
|gender|
+------+
|  Male|
|Female|
|Female|
+------+



## Filter

In [62]:
#pandas
df_p1[df_p1.age<30].head()

Unnamed: 0,name,gender,age
2,Alice,Female,29.0


In [61]:
#spark
df_s1[df_s1.age<30].show()

+-----+------+---+
| name|gender|age|
+-----+------+---+
|Alice|Female| 29|
+-----+------+---+

