# Connect to Snowflake

In [2]:
# Populate os.environ before the connection cell reads the SNOWFLAKE_* settings.
# NOTE(review): presumably the values come from a local .env file - confirm it exists and is not committed.
from dotenv import load_dotenv
load_dotenv()     # loads keys into os.environ so the rest of your code sees them

True

In [3]:
# Authenticate into Snowflake using settings taken from SNOWFLAKE_* environment variables.
from snowflake.snowpark import Session
import os

# Connection option name -> environment variable that supplies its value.
_SNOWFLAKE_ENV_VARS = {
    "account": "SNOWFLAKE_ACCOUNT",
    "user": "SNOWFLAKE_USER",
    "password": "SNOWFLAKE_PASSWORD",
    "role": "SNOWFLAKE_ROLE",
    "warehouse": "SNOWFLAKE_WAREHOUSE",
    "database": "SNOWFLAKE_DATABASE",
    "schema": "SNOWFLAKE_SCHEMA",
}
connection_parameters = {
    option: os.getenv(env_name) for option, env_name in _SNOWFLAKE_ENV_VARS.items()
}

# os.getenv() yields None for an unset variable. Stop here with a message that
# names the missing variables instead of letting the session builder fail later.
_missing_env_vars = [
    _SNOWFLAKE_ENV_VARS[option]
    for option in ("account", "user", "password")
    if not connection_parameters[option]
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required Snowflake environment variable(s): "
        + ", ".join(_missing_env_vars)
    )

session = Session.builder.configs(connection_parameters).create()

In [4]:
# Confirm the session is live by asking it which account it is connected to.
current_account = session.get_current_account()
print("Session Current Account:", current_account)

Session Current Account: "WEVIRIP-NA38028"


# Data Grouping

In [5]:
# Create a DataFrame-style handle on the MARKETING_FINAL table through the session.
MARKETING_TABLE_NAME = 'MARKETING_FINAL'
marketing_final = session.table(MARKETING_TABLE_NAME)


In [5]:
# Aggregation: average INCOME for each EDUCATION value.
income_by_education = marketing_final.group_by('EDUCATION').mean('INCOME')
income_by_education.show()

-------------------------------
|"EDUCATION"  |"AVG(INCOME)"  |
-------------------------------
|Graduation   |52720.373656   |
|Master       |52917.534247   |
|PhD          |56145.313929   |
|Basic        |20306.259259   |
|2n Cycle     |47633.190000   |
-------------------------------



In [7]:
# Same average as above, expressed through agg() so the result column can be given an alias.
from snowflake.snowpark.functions import avg
avg_income_expr = avg('INCOME').alias('Avg_Income')
marketing_final.group_by('EDUCATION').agg(avg_income_expr).show()

------------------------------
|"EDUCATION"  |"AVG_INCOME"  |
------------------------------
|Graduation   |52720.373656  |
|Master       |52917.534247  |
|PhD          |56145.313929  |
|Basic        |20306.259259  |
|2n Cycle     |47633.190000  |
------------------------------



In [8]:
# function('<name>') selects the aggregate by name; the callable it returns is then
# applied to the column to aggregate (here: total Z_REVENUE per MARITAL_STATUS).
# NOTE: this import replaces Python's built-in sum in the notebook namespace.
# It is not needed by this cell, but later cells call sum() on column names.
from snowflake.snowpark.functions import sum
sum_per_status = marketing_final.group_by('MARITAL_STATUS').function('sum')
sum_per_status('Z_REVENUE').show()

---------------------------------------
|"MARITAL_STATUS"  |"SUM(Z_REVENUE)"  |
---------------------------------------
|Married           |9504              |
|Single            |5280              |
|Divorced          |2552              |
|Widow             |847               |
|Together          |6380              |
|YOLO              |22                |
|Absurd            |22                |
|Alone             |33                |
---------------------------------------



In [9]:
# Highest INCOME within each MARITAL_STATUS group.
# NOTE: this import replaces Python's built-in max in the notebook namespace.
from snowflake.snowpark.functions import max
(
    marketing_final
    .group_by('MARITAL_STATUS')
    .agg(max('INCOME'))
    .show()
)

------------------------------------
|"MARITAL_STATUS"  |"MAX(INCOME)"  |
------------------------------------
|Together          |666666         |
|Married           |160803         |
|YOLO              |48432          |
|Absurd            |79244          |
|Divorced          |153924         |
|Alone             |61331          |
|Single            |113734         |
|Widow             |85620          |
------------------------------------



In [11]:
# Per EDUCATION level: number of rows and the maximum INCOME.
from snowflake.snowpark.functions import col, count
count_all_rows = (col('*'), "count")   # (column, aggregate-name) pair form accepted by agg()
highest_income = max('INCOME')         # max here is the Snowpark function imported in an earlier cell
marketing_final.group_by('EDUCATION').agg(count_all_rows, highest_income).show()

----------------------------------------------------
|"EDUCATION"  |"COUNT(LITERAL())"  |"MAX(INCOME)"  |
----------------------------------------------------
|Graduation   |1127                |666666         |
|Master       |370                 |157733         |
|PhD          |486                 |162397         |
|Basic        |54                  |34445          |
|2n Cycle     |203                 |96547          |
----------------------------------------------------



In [12]:
# Group on two keys (EDUCATION, MARITAL_STATUS) and compute the average income
# and the total number of store purchases for each combination.
(
    marketing_final
    .group_by(['EDUCATION', 'MARITAL_STATUS'])
    .agg(
        avg('INCOME').alias('Avg_Income'),
        sum('NUMSTOREPURCHASES').alias('Sum_Purchase'),
    )
    .show()
)

------------------------------------------------------------------
|"EDUCATION"  |"MARITAL_STATUS"  |"AVG_INCOME"  |"SUM_PURCHASE"  |
------------------------------------------------------------------
|PhD          |Single            |53314.614583  |568             |
|2n Cycle     |Divorced          |49395.130435  |138             |
|PhD          |Together          |56041.422414  |698             |
|Graduation   |Divorced          |54526.042017  |697             |
|PhD          |YOLO              |48432.000000  |12              |
|Basic        |Widow             |22123.000000  |3               |
|Master       |Married           |53286.028986  |815             |
|PhD          |Married           |58138.031579  |1191            |
|2n Cycle     |Married           |46201.100000  |440             |
|Master       |Divorced          |50331.945946  |203             |
------------------------------------------------------------------



In [13]:
# Same two-key aggregation as the previous cell, kept in a variable so it can be
# ordered by EDUCATION and then by Sum_Purchase (both ascending) before display.
grouped_by_education_and_status = marketing_final.group_by(['EDUCATION', 'MARITAL_STATUS'])
aggregate_result = grouped_by_education_and_status.agg(
    avg('INCOME').alias('Avg_Income'),
    sum('NUMSTOREPURCHASES').alias('Sum_Purchase'),
)
aggregate_result.sort(
    col('EDUCATION').asc(),
    col('Sum_Purchase').asc(),
).show()

------------------------------------------------------------------
|"EDUCATION"  |"MARITAL_STATUS"  |"AVG_INCOME"  |"SUM_PURCHASE"  |
------------------------------------------------------------------
|2n Cycle     |Widow             |51392.200000  |37              |
|2n Cycle     |Divorced          |49395.130435  |138             |
|2n Cycle     |Single            |53673.944444  |194             |
|2n Cycle     |Together          |44736.410714  |309             |
|2n Cycle     |Married           |46201.100000  |440             |
|Basic        |Divorced          |9548.000000   |3               |
|Basic        |Widow             |22123.000000  |3               |
|Basic        |Together          |21240.071429  |34              |
|Basic        |Single            |18238.666667  |49              |
|Basic        |Married           |21960.500000  |65              |
------------------------------------------------------------------



# Data Analysis

In [6]:
# describe() is called on the Snowpark DataFrame here (not a pandas one); it
# produces a SUMMARY table of statistics for the table's columns.
summary_stats = marketing_final.describe()
summary_stats.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SUMMARY"  |"YEAR_BIRTH"        |"EDUCATION"  |"MARITAL_STATUS"  |"INCOME"            |"KIDHOME"           |"TEENHOME"          |"RECENCY"           |"MNTWINES"         |"MNTFRUITS"       |"MNTMEATPRODUCTS"   |"MNTFISHPRODUCTS"  |"MNTSWEETPRODUCTS"  |"MNTGOLDPRODS"     |"NUMDEALSPURCHASES"  |"NUMWEBPURCHASES"   |"NUMCATALOGPURCHASES"  |"NUMSTOREPURCHASES"  |"NUMWEBVISITSMONTH"  |"ID"                |"COMPLAIN"           |"Z_COSTCONTACT"  |"Z_REVENUE"  |
----------------------------------------------------------------------------------

In [7]:
# Count the rows that remain once exact duplicate rows are removed.
unique_rows = marketing_final.distinct()
unique_rows.count()

2240

In [8]:
# Keep only the two columns of interest, then drop_duplicates() to list each
# (Education, Marital_Status) combination once.
education_status_pairs = marketing_final.select(['Education','Marital_Status'])
education_status_pairs.drop_duplicates().show()

----------------------------------
|"EDUCATION"  |"MARITAL_STATUS"  |
----------------------------------
|2n Cycle     |Married           |
|Graduation   |Married           |
|Graduation   |Together          |
|Graduation   |Divorced          |
|PhD          |Single            |
|PhD          |Alone             |
|PhD          |Together          |
|Master       |Married           |
|PhD          |Married           |
|Master       |Together          |
----------------------------------



In [9]:
# Crosstab analysis.
# The previous cell listed which Education / Marital_Status combinations exist;
# the crosstab shows how many rows fall into each combination.
education_status_counts = marketing_final.stat.crosstab(col1='Education', col2='Marital_Status')
education_status_counts.show()

------------------------------------------------------------------------------------------------------------------------
|"EDUCATION"  |"'Together'"  |"'Married'"  |"'YOLO'"  |"'Absurd'"  |"'Divorced'"  |"'Alone'"  |"'Single'"  |"'Widow'"  |
------------------------------------------------------------------------------------------------------------------------
|Graduation   |286           |433          |0         |1           |119           |1          |252         |35         |
|Master       |106           |138          |0         |1           |37            |1          |75          |12         |
|PhD          |117           |192          |2         |0           |52            |1          |98          |24         |
|2n Cycle     |57            |81           |0         |0           |23            |0          |37          |5          |
|Basic        |14            |20           |0         |0           |1             |0          |18          |1          |
--------------------------------

In [10]:
# Pivot analysis: total INCOME per MARITAL_STATUS, with one column for each
# listed EDUCATION value.
education_levels = ['Graduation','PhD','Master','Basic','2n Cycle']
market_subset = marketing_final.select('EDUCATION','MARITAL_STATUS','INCOME')
market_pivot = market_subset.pivot('EDUCATION', education_levels).sum('INCOME')
market_pivot.show()

----------------------------------------------------------------------------------------
|"MARITAL_STATUS"  |"'Graduation'"  |"'PhD'"   |"'Master'"  |"'Basic'"  |"'2n Cycle'"  |
----------------------------------------------------------------------------------------
|YOLO              |NULL            |96864     |NULL        |NULL       |NULL          |
|Married           |21793311        |11046226  |7353472     |439210     |3696088       |
|Together          |15891167        |6500805   |5315119     |297361     |2505239       |
|Single            |12625257        |5118203   |4014792     |328296     |1932262       |
|Divorced          |6488599         |2761024   |1862282     |9548       |1136088       |
|Widow             |1924183         |1446914   |642417      |22123      |256961        |
|Alone             |34176           |35860     |61331       |NULL       |NULL          |
|Absurd            |79244           |NULL      |65487       |NULL       |NULL          |
---------------------

# Close Snowflake Session

In [11]:
# Close the Snowpark session created in the connection cell once the analysis is done.
# NOTE(review): if an earlier cell raises, this cell is never reached - run it manually in that case.
session.close()