In [2]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812366 sha256=82faf4f43ef13c44e548fcbd64406c9eca9c93b173b4bcf97713a349f1ca0176
  Stored in directory: /Users/aravindh/Library/Caches/pip/wheels/9d/29/ee/3a756632ca3f0a6870933bac1c9db6e4af2c068f019aba0ee1
Successfully built pyspark
Installing collected pack

In [6]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, sum, min, max
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [8]:
import sys
sys.path.append("/path/to/spark/python")
sys.path.append("/path/to/spark/python/lib/py4j-<version>-src.zip")

In [11]:
# Create a SparkSession
spark = SparkSession.builder \
    .appName("BankingAnalysis") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/15 20:15:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
# Define the schema for the dataset
schema = StructType([
    StructField("customer_id", StringType(), True),
    StructField("surname", StringType(), True),
    StructField("credit_score", IntegerType(), True),
    StructField("geography", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("tenure", IntegerType(), True),
    StructField("balance", DoubleType(), True),
    StructField("num_of_products", IntegerType(), True),
    StructField("has_credit_card", IntegerType(), True),
    StructField("estimated_salary", DoubleType(), True),
    StructField("exited", IntegerType(), True)
])

In [15]:
#Read the CSV file into a DataFrame
df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .schema(schema) \
    .load("/Users/aravindh/Desktop/Data Engineering/Azure Mini Project/springboard-pyspark-project/pyspark-project/credit card.csv")

In [16]:
# Create a BankingAnalysis class
class BankingAnalysis:
    def __init__(self, dataframe):
        self.df = dataframe
    
    def total_customers(self):
        return self.df.count()
    
    def customers_by_geography(self):
        return self.df.groupBy("geography").agg(count("*").alias("num_customers"))
    
    def customers_by_gender(self):
        return self.df.groupBy("gender").agg(count("*").alias("num_customers"))
    
    def avg_age_by_geography(self):
        return self.df.groupBy("geography").agg(avg("age").alias("avg_age"))
    
    def avg_balance_by_geography(self):
        return self.df.groupBy("geography").agg(avg("balance").alias("avg_balance"))
    
    def min_max_tenure(self):
        return self.df.agg(min("tenure").alias("min_tenure"), max("tenure").alias("max_tenure"))
    
    def num_credit_card_holders(self):
        return self.df.filter(col("has_credit_card") == 1).count()
    
    def avg_salary_by_gender(self):
        return self.df.groupBy("gender").agg(avg("estimated_salary").alias("avg_salary"))
    
    def num_exited_customers(self):
        return self.df.filter(col("exited") == 1).count()
    
    def total_balance(self):
        return self.df.agg(sum("balance").alias("total_balance")).collect()[0][0]

In [17]:
# Create an instance of the BankingAnalysis class
analysis = BankingAnalysis(df)

In [18]:
# Perform analysis and print the results
print("Total number of customers:", analysis.total_customers())
print("Number of customers by geography:")
analysis.customers_by_geography().show()
print("Number of customers by gender:")
analysis.customers_by_gender().show()
print("Average age by geography:")
analysis.avg_age_by_geography().show()
print("Average balance by geography:")
analysis.avg_balance_by_geography().show()
print("Minimum and maximum tenure:", analysis.min_max_tenure().collect())
print("Number of credit card holders:", analysis.num_credit_card_holders())
print("Average salary by gender:")
analysis.avg_salary_by_gender().show()
print("Number of customers who have exited:", analysis.num_exited_customers())
print("Total balance across all customers:", analysis.total_balance())

Total number of customers: 10000
Number of customers by geography:


24/09/15 20:17:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: CreditScore
 Schema: geography
Expected: geography but found: CreditScore
CSV file: file:///Users/aravindh/Desktop/Data%20Engineering/Azure%20Mini%20Project/springboard-pyspark-project/pyspark-project/credit%20card.csv


+---------+-------------+
|geography|num_customers|
+---------+-------------+
|      829|            8|
|      675|           37|
|      691|           34|
|      467|            4|
|      800|           10|
|      451|            5|
|      666|           38|
|      591|           31|
|      447|            4|
|      574|           21|
|      475|            6|
|      718|           38|
|      613|           42|
|      577|           34|
|      581|           38|
|      544|           25|
|      747|           22|
|      740|           19|
|      647|           31|
|      711|           39|
+---------+-------------+
only showing top 20 rows

Number of customers by gender:
+-------+-------------+
| gender|num_customers|
+-------+-------------+
|Germany|         2509|
| France|         5014|
|  Spain|         2477|
+-------+-------------+

Average age by geography:


24/09/15 20:17:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Geography
 Schema: gender
Expected: gender but found: Geography
CSV file: file:///Users/aravindh/Desktop/Data%20Engineering/Azure%20Mini%20Project/springboard-pyspark-project/pyspark-project/credit%20card.csv
24/09/15 20:17:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: CreditScore, Gender
 Schema: geography, age
Expected: geography but found: CreditScore
CSV file: file:///Users/aravindh/Desktop/Data%20Engineering/Azure%20Mini%20Project/springboard-pyspark-project/pyspark-project/credit%20card.csv


CodeCache: size=131072Kb used=22294Kb max_used=22409Kb free=108777Kb
 bounds [0x00000001041a8000, 0x00000001057b8000, 0x000000010c1a8000]
 total_blobs=9117 nmethods=8174 adapters=855
 compilation: disabled (not enough contiguous free space left)
+---------+-------+
|geography|avg_age|
+---------+-------+
|      829|   NULL|
|      675|   NULL|
|      691|   NULL|
|      467|   NULL|
|      800|   NULL|
|      451|   NULL|
|      666|   NULL|
|      591|   NULL|
|      447|   NULL|
|      574|   NULL|
|      475|   NULL|
|      718|   NULL|
|      613|   NULL|
|      577|   NULL|
|      581|   NULL|
|      544|   NULL|
|      747|   NULL|
|      740|   NULL|
|      647|   NULL|
|      711|   NULL|
+---------+-------+
only showing top 20 rows

Average balance by geography:


24/09/15 20:17:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: CreditScore, Tenure
 Schema: geography, balance
Expected: geography but found: CreditScore
CSV file: file:///Users/aravindh/Desktop/Data%20Engineering/Azure%20Mini%20Project/springboard-pyspark-project/pyspark-project/credit%20card.csv


+---------+------------------+
|geography|       avg_balance|
+---------+------------------+
|      829|              6.25|
|      675| 5.216216216216216|
|      691| 5.617647058823529|
|      467|               6.5|
|      800|               4.5|
|      451|               6.6|
|      666|4.7105263157894735|
|      591| 5.419354838709677|
|      447|               5.0|
|      574| 4.333333333333333|
|      475| 5.333333333333333|
|      718|5.2894736842105265|
|      613| 5.761904761904762|
|      577| 4.823529411764706|
|      581| 4.026315789473684|
|      544|              5.24|
|      747| 5.454545454545454|
|      740| 5.473684210526316|
|      647| 4.903225806451613|
|      711| 5.051282051282051|
+---------+------------------+
only showing top 20 rows

Minimum and maximum tenure: [Row(min_tenure=18, max_tenure=92)]


24/09/15 20:17:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Age
 Schema: tenure
Expected: tenure but found: Age
CSV file: file:///Users/aravindh/Desktop/Data%20Engineering/Azure%20Mini%20Project/springboard-pyspark-project/pyspark-project/credit%20card.csv
24/09/15 20:17:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NumOfProducts
 Schema: has_credit_card
Expected: has_credit_card but found: NumOfProducts
CSV file: file:///Users/aravindh/Desktop/Data%20Engineering/Azure%20Mini%20Project/springboard-pyspark-project/pyspark-project/credit%20card.csv
24/09/15 20:17:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Geography, IsActiveMember
 Schema: gender, estimated_salary
Expected: gender but found: Geography
CSV file: file:///Users/aravindh/Desktop/Data%20Engineering/Azure%20Mini%20Project/springboard-pyspark-project/pyspark-project/credit%20card.csv


Number of credit card holders: 5084
Average salary by gender:
+-------+-------------------+
| gender|         avg_salary|
+-------+-------------------+
|Germany|0.49740932642487046|
| France| 0.5167530913442362|
|  Spain| 0.5296729915220024|
+-------+-------------------+

Number of customers who have exited: 0
Total balance across all customers: 50128.0


24/09/15 20:17:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: EstimatedSalary
 Schema: exited
Expected: exited but found: EstimatedSalary
CSV file: file:///Users/aravindh/Desktop/Data%20Engineering/Azure%20Mini%20Project/springboard-pyspark-project/pyspark-project/credit%20card.csv
24/09/15 20:17:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Tenure
 Schema: balance
Expected: balance but found: Tenure
CSV file: file:///Users/aravindh/Desktop/Data%20Engineering/Azure%20Mini%20Project/springboard-pyspark-project/pyspark-project/credit%20card.csv
