<a href="https://colab.research.google.com/github/armahdavi/BigData_pyspark_practice/blob/main/pySpark_initial_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, count, broadcast

In [2]:
# Start (or reuse) the Spark session used throughout this notebook
session_builder = SparkSession.builder.appName("SparkSQLExampleApp")
spark = session_builder.getOrCreate()

# Path to the CSV file that backs the DataFrame
csv_file = "/content/sample_data/onlinefoods.csv"

# Read the CSV: the first line supplies the column names and Spark infers the column types
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
df = csv_reader.load(csv_file)

# Register the DataFrame as a *global* temporary view named 'u'.
# Global temporary views live in the `global_temp` database, so SQL has to
# reference this one as `global_temp.u`.
df.createOrReplaceGlobalTempView('u')

In [3]:
# Print the leading rows of the Spark DataFrame as a text table (show() prints 20 rows by default)
df.show()

+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+
|Age|Gender|Marital Status|    Occupation| Monthly Income|Educational Qualifications|Family size|latitude|longitude|Pin code|Output| Feedback|_c12|
+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+
| 20|Female|        Single|       Student|      No Income|             Post Graduate|          4| 12.9766|  77.5993|  560001|   Yes| Positive| Yes|
| 24|Female|        Single|       Student| Below Rs.10000|                  Graduate|          3|  12.977|  77.5773|  560009|   Yes| Positive| Yes|
| 22|  Male|        Single|       Student| Below Rs.10000|             Post Graduate|          3| 12.9551|  77.6593|  560017|   Yes|Negative | Yes|
| 22|Female|        Single|       Student|      No Income|                  Graduate|          6| 12.9473|  77.5

In [4]:
# Load the same CSV with pandas for comparison: the notebook renders a pandas
# DataFrame differently from the text table printed by Spark's show().
import pandas as pd

df2 = pd.read_csv(filepath_or_buffer=csv_file)
df2.head(5)

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,Unnamed: 12
0,20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive,Yes
1,24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive,Yes
2,22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative,Yes
3,22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive,Yes
4,22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.985,77.5533,560010,Yes,Positive,Yes


In [5]:
# Show the class of the Spark DataFrame next to the class of the pandas DataFrame
frame_types = (type(df), type(df2))
print(*frame_types)

<class 'pyspark.sql.dataframe.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [6]:
# Column names first, then the total number of rows (count() triggers a Spark job)
column_names = df.columns
print(column_names)

row_count = df.count()
print(row_count)

['Age', 'Gender', 'Marital Status', 'Occupation', 'Monthly Income', 'Educational Qualifications', 'Family size', 'latitude', 'longitude', 'Pin code', 'Output', 'Feedback', '_c12']
388


In [7]:
# head(10) returns the first 10 rows as a Python list of Row objects
df.head(10)

[Row(Age=20, Gender='Female', Marital Status='Single', Occupation='Student', Monthly Income='No Income', Educational Qualifications='Post Graduate', Family size=4, latitude=12.9766, longitude=77.5993, Pin code=560001, Output='Yes', Feedback='Positive', _c12='Yes'),
 Row(Age=24, Gender='Female', Marital Status='Single', Occupation='Student', Monthly Income='Below Rs.10000', Educational Qualifications='Graduate', Family size=3, latitude=12.977, longitude=77.5773, Pin code=560009, Output='Yes', Feedback='Positive', _c12='Yes'),
 Row(Age=22, Gender='Male', Marital Status='Single', Occupation='Student', Monthly Income='Below Rs.10000', Educational Qualifications='Post Graduate', Family size=3, latitude=12.9551, longitude=77.6593, Pin code=560017, Output='Yes', Feedback='Negative ', _c12='Yes'),
 Row(Age=22, Gender='Female', Marital Status='Single', Occupation='Student', Monthly Income='No Income', Educational Qualifications='Graduate', Family size=6, latitude=12.9473, longitude=77.5616, Pin

In [8]:
# Build the summary-statistics DataFrame (count, mean, ...) and print it
summary_df = df.describe()
summary_df.show()

+-------+------------------+------+--------------+----------+--------------+--------------------------+------------------+--------------------+------------------+------------------+------+---------+----+
|summary|               Age|Gender|Marital Status|Occupation|Monthly Income|Educational Qualifications|       Family size|            latitude|         longitude|          Pin code|Output| Feedback|_c12|
+-------+------------------+------+--------------+----------+--------------+--------------------------+------------------+--------------------+------------------+------------------+------+---------+----+
|  count|               388|   388|           388|       388|           388|                       388|               388|                 388|               388|               388|   388|      388| 388|
|   mean|24.628865979381445|  NULL|          NULL|      NULL|          NULL|                      NULL|3.2809278350515463|  12.972057989690706| 77.60015953608251| 560040.1134020619|  N

In [9]:
# Reports whether the DataFrame has no rows at all
df.isEmpty()


False

In [10]:
# Print each column's name, its inferred type and whether it is nullable
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Monthly Income: string (nullable = true)
 |-- Educational Qualifications: string (nullable = true)
 |-- Family size: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Pin code: integer (nullable = true)
 |-- Output: string (nullable = true)
 |-- Feedback: string (nullable = true)
 |-- _c12: string (nullable = true)



In [11]:
# exceptAll() has to be *called* with another DataFrame; a bare `df.exceptAll`
# only references the method and computes nothing.
# It returns the rows of `df` that are not in the other DataFrame while
# preserving duplicates. Here: everything except the first 10 rows.
rows_after_first_ten = df.exceptAll(df.limit(10))
rows_after_first_ten.count()

In [12]:
# Project the DataFrame down to just the two columns of interest
selected_columns = ['Age', 'Monthly Income']
new_df = df.select(selected_columns)
new_df.show()

+---+---------------+
|Age| Monthly Income|
+---+---------------+
| 20|      No Income|
| 24| Below Rs.10000|
| 22| Below Rs.10000|
| 22|      No Income|
| 22| Below Rs.10000|
| 27|More than 50000|
| 22|      No Income|
| 24|      No Income|
| 23|      No Income|
| 23|      No Income|
| 22|      No Income|
| 23| Below Rs.10000|
| 23|      No Income|
| 21|      No Income|
| 23| 10001 to 25000|
| 24|      No Income|
| 28| 25001 to 50000|
| 23|      No Income|
| 25|      No Income|
| 21| Below Rs.10000|
+---+---------------+
only showing top 20 rows



In [13]:
# take(2) brings the first two rows back to the driver as a list of Row objects
first_two_rows = df.take(2)

# Print one Row per line
for record in first_two_rows:
    print(record)

Row(Age=20, Gender='Female', Marital Status='Single', Occupation='Student', Monthly Income='No Income', Educational Qualifications='Post Graduate', Family size=4, latitude=12.9766, longitude=77.5993, Pin code=560001, Output='Yes', Feedback='Positive', _c12='Yes')
Row(Age=24, Gender='Female', Marital Status='Single', Occupation='Student', Monthly Income='Below Rs.10000', Educational Qualifications='Graduate', Family size=3, latitude=12.977, longitude=77.5773, Pin code=560009, Output='Yes', Feedback='Positive', _c12='Yes')


In [14]:
# Fetch only the first row of the DataFrame as a Row object
first_row = df.first()

# Convert the Row to a plain Python dictionary keyed by column name
row_dict = first_row.asDict()
print(row_dict)
# Last expression of the cell: displays the type of the converted row
type(row_dict)

{'Age': 20, 'Gender': 'Female', 'Marital Status': 'Single', 'Occupation': 'Student', 'Monthly Income': 'No Income', 'Educational Qualifications': 'Post Graduate', 'Family size': 4, 'latitude': 12.9766, 'longitude': 77.5993, 'Pin code': 560001, 'Output': 'Yes', 'Feedback': 'Positive', '_c12': 'Yes'}


dict

In [15]:
# Flag every customer whose education level is exactly 'Post Graduate'
is_post_graduate = df['Educational Qualifications'] == 'Post Graduate'
post_graduate_flag = when(is_post_graduate, 'Yes').otherwise('No')

# Attach the 'Yes'/'No' flag as a new column and display the result
df = df.withColumn('Post_Graduate_customer', post_graduate_flag)
df.show()

+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+----------------------+
|Age|Gender|Marital Status|    Occupation| Monthly Income|Educational Qualifications|Family size|latitude|longitude|Pin code|Output| Feedback|_c12|Post_Graduate_customer|
+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+----------------------+
| 20|Female|        Single|       Student|      No Income|             Post Graduate|          4| 12.9766|  77.5993|  560001|   Yes| Positive| Yes|                   Yes|
| 24|Female|        Single|       Student| Below Rs.10000|                  Graduate|          3|  12.977|  77.5773|  560009|   Yes| Positive| Yes|                    No|
| 22|  Male|        Single|       Student| Below Rs.10000|             Post Graduate|          3| 12.9551|  77.6593|  560017|   Yes|Negative | Ye

In [16]:
# drop() returns a new DataFrame without the column; `df` itself is left unchanged
without_flag_df = df.drop('Post_Graduate_customer')
without_flag_df.show()

+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+
|Age|Gender|Marital Status|    Occupation| Monthly Income|Educational Qualifications|Family size|latitude|longitude|Pin code|Output| Feedback|_c12|
+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+
| 20|Female|        Single|       Student|      No Income|             Post Graduate|          4| 12.9766|  77.5993|  560001|   Yes| Positive| Yes|
| 24|Female|        Single|       Student| Below Rs.10000|                  Graduate|          3|  12.977|  77.5773|  560009|   Yes| Positive| Yes|
| 22|  Male|        Single|       Student| Below Rs.10000|             Post Graduate|          3| 12.9551|  77.6593|  560017|   Yes|Negative | Yes|
| 22|Female|        Single|       Student|      No Income|                  Graduate|          6| 12.9473|  77.5

In [17]:
# distinct() returns a new DataFrame with duplicate rows removed
distinct_df = df.distinct()

print('DataFrame after removing duplicates:')
distinct_df.show()

DataFrame aftee removing duplicates:
+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+----------------------+
|Age|Gender|Marital Status|    Occupation| Monthly Income|Educational Qualifications|Family size|latitude|longitude|Pin code|Output| Feedback|_c12|Post_Graduate_customer|
+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+----------------------+
| 26|Female|       Married|      Employee| 25001 to 50000|                  Graduate|          3|  13.014|  77.5658|  560012|   Yes| Positive| Yes|                    No|
| 22|  Male|        Single|       Student|      No Income|             Post Graduate|          1| 13.0626|  77.5284|  560015|   Yes| Positive| Yes|                   Yes|
| 23|Female|        Single|       Student|      No Income|             Post Graduate|          2|  12.977|  

In [18]:
# Build a small DataFrame from scratch using in-memory Python data
columns = ['Name', 'Age', 'MaritalStatus']
data = [
    ('Alice', 34, 'Single'),
    ('Bob', 45, 'Married'),
    ('Charlier', 28, 'Single'),
]
df = spark.createDataFrame(data, schema=columns)

# Expose the DataFrame to Spark SQL under the view name 'people'
df.createOrReplaceTempView('people')

# Run a SQL query against the view and display the matching rows
query = 'SELECT * FROM people WHERE Age > 30'
result = spark.sql(query)
result.show()

+-----+---+-------------+
| Name|Age|MaritalStatus|
+-----+---+-------------+
|Alice| 34|       Single|
|  Bob| 45|      Married|
+-----+---+-------------+



In [19]:
# Summary statistics for every column; non-numeric columns get NULL for mean and stddev
df.describe().show()

+-------+--------+------------------+-------------+
|summary|    Name|               Age|MaritalStatus|
+-------+--------+------------------+-------------+
|  count|       3|                 3|            3|
|   mean|    NULL|35.666666666666664|         NULL|
| stddev|    NULL| 8.621678104251709|         NULL|
|    min|   Alice|                28|      Married|
|    max|Charlier|                45|       Single|
+-------+--------+------------------+-------------+



In [20]:
# Spark's dtypes is a list of (column name, type name) pairs using Spark SQL
# type names such as 'bigint'; compare this with the dtypes of a pandas DataFrame.
print(df.dtypes)

[('Name', 'string'), ('Age', 'bigint'), ('MaritalStatus', 'string')]


In [21]:
# Keep only the people who are older than 30 and whose marital status is 'Single'
older_than_30 = df['Age'] > 30
is_single = df['MaritalStatus'] == 'Single'

filtered_df = df.where(older_than_30 & is_single)
filtered_df.show()

+-----+---+-------------+
| Name|Age|MaritalStatus|
+-----+---+-------------+
|Alice| 34|       Single|
+-----+---+-------------+



In [22]:
# collect() brings every row back to the driver as a plain Python list of Row objects
collected_data = df.collect()
print(collected_data)
# Fine for a tiny example, but think twice with big data: would you really want
# a million full observations materialised in a single Python list?

[Row(Name='Alice', Age=34, MaritalStatus='Single'), Row(Name='Bob', Age=45, MaritalStatus='Married'), Row(Name='Charlier', Age=28, MaritalStatus='Single')]


In [23]:
# Build a five-person example DataFrame
columns = ['Name', 'Age', 'MaritalStatus']
data = [
    ('Alice', 34, 'Single'),
    ('Bob', 45, 'Married'),
    ('Charlier', 28, 'Single'),
    ('David', 37, 'Divorced'),
    ('Eve', 52, 'Married'),
]
df = spark.createDataFrame(data, schema=columns)

# Group by marital status and count the rows in each group via agg()
row_count = count('*').alias('Count')
grouped_by_status = df.groupBy('MaritalStatus')
marital_status_counts = grouped_by_status.agg(row_count)

# Display one row per marital status
marital_status_counts.show()

+-------------+-----+
|MaritalStatus|Count|
+-------------+-----+
|      Married|    2|
|       Single|    2|
|     Divorced|    1|
+-------------+-----+



In [24]:
# First DataFrame: people with their age and marital status
columns1 = ['Name', 'Age', 'MaritalStatus']
data1 = [
    ('Alice', 34, 'Single'),
    ('Bob', 45, 'Married'),
    ('Charlier', 28, 'Single'),
]
df1 = spark.createDataFrame(data1, schema=columns1)

# Second DataFrame: the city each person lives in
columns2 = ['Name', 'City']
data2 = [
    ('Alice', 'New York'),
    ('Bob', 'Los Angeles'),
    ('Charlier', 'Chicago'),
]
df2 = spark.createDataFrame(data2, schema=columns2)

# Full outer join on the shared 'Name' column, then display the result
joined_df = df1.join(df2, on='Name', how='outer')
joined_df.show()

+--------+---+-------------+-----------+
|    Name|Age|MaritalStatus|       City|
+--------+---+-------------+-----------+
|   Alice| 34|       Single|   New York|
|     Bob| 45|      Married|Los Angeles|
|Charlier| 28|       Single|    Chicago|
+--------+---+-------------+-----------+



In [25]:
# Example of data for the first DataFrame
data1 = [('Alice', 34, 'Single'),
        ('Bob', 45, 'Married'),
        ('Charlier', 28, 'Single')]

columns1 = ['Name', 'Age', 'MaritalStatus']
df1 = spark.createDataFrame(data1, columns1)

# Example of data for the second DataFrame
data2 = [('New York',),
         ('Los Angeles',),
         ('Chicago',)]

columns2 = ['City']
df2 = spark.createDataFrame(data2, columns2)

# Use crossJoin() to pair every row of df1 with every row of df2
# (Cartesian product: 3 people x 3 cities = 9 rows). This is not a union.
cross_join_df = df1.crossJoin(df2)

# Show the cross-joined DataFrame
cross_join_df.show()

+--------+---+-------------+-----------+
|    Name|Age|MaritalStatus|       City|
+--------+---+-------------+-----------+
|   Alice| 34|       Single|   New York|
|   Alice| 34|       Single|Los Angeles|
|   Alice| 34|       Single|    Chicago|
|     Bob| 45|      Married|   New York|
|Charlier| 28|       Single|   New York|
|     Bob| 45|      Married|Los Angeles|
|     Bob| 45|      Married|    Chicago|
|Charlier| 28|       Single|Los Angeles|
|Charlier| 28|       Single|    Chicago|
+--------+---+-------------+-----------+



In [26]:
# Example data for the first DataFrame (small)
data1 = [('Alice', 34),
        ('Bob', 45),
        ('Charlier', 28)]

columns1 = ['Name', 'Age']
df1 = spark.createDataFrame(data1, columns1)

# Example data for the second DataFrame (large)
data2 = [('Alice', 'New York'),
         ('Bob', 'Los Angeles'),
         ('Charlier', 'Chicago'),
         ('David', 'Houston'),
         ('Eve', 'San Francisco')]

columns2 = ['Name', 'City']
df2 = spark.createDataFrame(data2, columns2)

# Mark the small DataFrame with a broadcast hint; the data itself is unchanged
broadcast_df1 = broadcast(df1)

# show() prints the table itself and returns None, so wrapping it in print()
# would only add a stray "None" line after each table.
broadcast_df1.show()
df1.show()

# NOTE(review): Spark's broadcast hash join does not appear to support full
# outer joins, so the hint is probably not honoured for how='outer' - confirm
# with joined_df.explain().
joined_df = broadcast_df1.join(df2, 'Name', 'outer')

# Show the joined DataFrame
joined_df.show()

+--------+---+
|    Name|Age|
+--------+---+
|   Alice| 34|
|     Bob| 45|
|Charlier| 28|
+--------+---+

None
+--------+---+
|    Name|Age|
+--------+---+
|   Alice| 34|
|     Bob| 45|
|Charlier| 28|
+--------+---+

None
+--------+----+-------------+
|    Name| Age|         City|
+--------+----+-------------+
|   Alice|  34|     New York|
|     Bob|  45|  Los Angeles|
|Charlier|  28|      Chicago|
|   David|NULL|      Houston|
|     Eve|NULL|San Francisco|
+--------+----+-------------+



In [27]:
# alias() returns a DataFrame with the alias 'Person' set; the rows and columns are unchanged.
# NOTE(review): `df` here is whatever an earlier cell last assigned to that name
# (the output suggests the five-row people DataFrame) - confirm after a fresh Run All.
df_alias = df.alias('Person')
df_alias.show()

+--------+---+-------------+
|    Name|Age|MaritalStatus|
+--------+---+-------------+
|   Alice| 34|       Single|
|     Bob| 45|      Married|
|Charlier| 28|       Single|
|   David| 37|     Divorced|
|     Eve| 52|      Married|
+--------+---+-------------+



In [28]:
# Example salary data: one row per employee
columns = ['Name', 'Department', 'Salary']
data = [
    ('Alice', 'Sales', 5000),
    ('Bob', 'Sales', 6000),
    ('Charlier', 'Marketing', 4000),
    ('David', 'Marketing', 3000),
]
df = spark.createDataFrame(data, schema=columns)

# Roll up across 'Department' and 'Name': per-employee sums, per-department
# subtotals (Name is NULL) and a grand total (both NULL)
rollup_groups = df.rollup('Department', 'Name')
rollup_df = rollup_groups.sum('Salary')

# Display the hierarchical totals
rollup_df.show()


+----------+--------+-----------+
|Department|    Name|sum(Salary)|
+----------+--------+-----------+
|     Sales|     Bob|       6000|
|      NULL|    NULL|      18000|
|     Sales|   Alice|       5000|
|     Sales|    NULL|      11000|
| Marketing|Charlier|       4000|
| Marketing|    NULL|       7000|
| Marketing|   David|       3000|
+----------+--------+-----------+



In [29]:
# Example salary data: one row per employee
columns = ['Name', 'Department', 'Salary']
data = [
    ('Alice', 'Sales', 5000),
    ('Bob', 'Sales', 6000),
    ('Charlier', 'Marketing', 4000),
    ('David', 'Marketing', 3000),
]
df = spark.createDataFrame(data, schema=columns)

# Cube across 'Department' and 'Name': sums for every combination of the two
# columns, including the rows where either (or both) is NULL
cube_groups = df.cube('Department', 'Name')
cube_df = cube_groups.sum('Salary')

# Display all the combinations
cube_df.show()


+----------+--------+-----------+
|Department|    Name|sum(Salary)|
+----------+--------+-----------+
|      NULL|   Alice|       5000|
|     Sales|     Bob|       6000|
|      NULL|     Bob|       6000|
|      NULL|    NULL|      18000|
|     Sales|   Alice|       5000|
|     Sales|    NULL|      11000|
|      NULL|Charlier|       4000|
| Marketing|Charlier|       4000|
| Marketing|    NULL|       7000|
|      NULL|   David|       3000|
| Marketing|   David|       3000|
+----------+--------+-----------+



In [30]:
# Example salary data: one row per employee
columns = ['Name', 'Department', 'Salary']
data = [
    ('Alice', 'Sales', 5000),
    ('Bob', 'Sales', 6000),
    ('Charlier', 'Marketing', 4000),
    ('David', 'Marketing', 3000),
]
df = spark.createDataFrame(data, schema=columns)

# Pivot on 'Name': one row per department, one column per employee
by_department = df.groupBy('Department')
pivot_df1 = by_department.pivot('Name').sum('Salary')
pivot_df1.show()

# Pivot on 'Department': one row per employee, one column per department
by_name = df.groupBy('Name')
pivot_df2 = by_name.pivot('Department').sum('Salary')
pivot_df2.show()

+----------+-----+----+--------+-----+
|Department|Alice| Bob|Charlier|David|
+----------+-----+----+--------+-----+
|     Sales| 5000|6000|    NULL| NULL|
| Marketing| NULL|NULL|    4000| 3000|
+----------+-----+----+--------+-----+

+--------+---------+-----+
|    Name|Marketing|Sales|
+--------+---------+-----+
|Charlier|     4000| NULL|
|     Bob|     NULL| 6000|
|   Alice|     NULL| 5000|
|   David|     3000| NULL|
+--------+---------+-----+



In [31]:
# Example people data
columns = ['Name', 'Age', 'MaritalStatus']
data = [
    ('Alice', 34, 'Single'),
    ('Bob', 45, 'Married'),
    ('Charlier', 28, 'Single'),
]
df = spark.createDataFrame(data, schema=columns)

# sort() and orderBy() both order the rows by 'Age', ascending by default
sorted_df1 = df.sort('Age')
sorted_df2 = df.orderBy('Age')

# Display both results; the two tables are identical
sorted_df1.show()
sorted_df2.show()

+--------+---+-------------+
|    Name|Age|MaritalStatus|
+--------+---+-------------+
|Charlier| 28|       Single|
|   Alice| 34|       Single|
|     Bob| 45|      Married|
+--------+---+-------------+

+--------+---+-------------+
|    Name|Age|MaritalStatus|
+--------+---+-------------+
|Charlier| 28|       Single|
|   Alice| 34|       Single|
|     Bob| 45|      Married|
+--------+---+-------------+



In [32]:
# Example people data; the ('Alice', 34, 'Single') row appears twice on purpose
columns = ['Name', 'Age', 'MaritalStatus']
data = [
    ('Alice', 34, 'Single'),
    ('Bob', 45, 'Married'),
    ('Alice', 34, 'Single'),
    ('Charlier', 28, 'Single'),
]
df = spark.createDataFrame(data, schema=columns)

# With no column subset given, rows count as duplicates only when every column matches
deduplicated_df = df.dropDuplicates()
deduplicated_df.show()

+--------+---+-------------+
|    Name|Age|MaritalStatus|
+--------+---+-------------+
|   Alice| 34|       Single|
|     Bob| 45|      Married|
|Charlier| 28|       Single|
+--------+---+-------------+

