# Salesforce ETL Project - Part 2 (Student & Class Participant)

In [1]:
#!pip install simple_salesforce
#!pip install pymysql
#!pip install sqlalchemy

In [2]:
import json
from sqlalchemy import create_engine
from datetime import date
import pandas as pd
import pymysql
pymysql.install_as_MySQLdb()

In [3]:
# Make sure to use your own `config.py` file. Consider ensuring that these variable names are in sync
from config import sf_username, sf_password, sf_security_token
from config import remote_db_endpoint, remote_db_port
from config import remote_db_name, remote_db_user, remote_db_pwd

In [4]:
from simple_salesforce import Salesforce

# Open an authenticated Salesforce session with the credentials imported from config.py.
sf = Salesforce(
    username=sf_username,
    password=sf_password,
    security_token=sf_security_token,
)

In [5]:
# Build the MySQL connection URL from the config.py settings, then open a connection
# that the `pd.read_sql` calls below reuse.
connection_url = f"mysql://{remote_db_user}:{remote_db_pwd}@{remote_db_endpoint}:{remote_db_port}/{remote_db_name}"
engine = create_engine(connection_url)
conn = engine.connect()

## Prepare ETL for the Student data

#### Student table on MySQL
![Salesforce ETL Project - MySQL Table Student](Images/MySQL_Table_Student.jpg)

In [6]:
# Read every row and column of the MySQL `student` table and preview the first rows.
student_data_df = pd.read_sql(sql="SELECT * FROM student", con=conn)
student_data_df.head(5)

Unnamed: 0,ID_Student,StudentID,LastName,FirstName,MiddleName,BirthDate,Gender
0,33,25004961,Dartling,Heather,Alice,,F
1,34,25003514,Dartling,Lana,Cecille,,F
2,35,25005833,Dartling,Jessica,Dorothy,,F
3,36,25002589,Dartling,Kimberly,Genevieve,,F
4,37,25007185,Dartling,Katherine,Lynnette,,F


In [7]:
# student_data_df
# student_data_df['BirthDate'] = pd.to_datetime(student_data_df['BirthDate']).dt.date
# student_data_df.head()

In [8]:
# Translate the MySQL column names into the field names used on the Salesforce
# Student__c object.
student_field_names = {
    'StudentID': 'Student_ID__c',
    'FirstName': 'First_Name__c',
    'LastName': 'Last_Name__c',
    'MiddleName': 'Middle_Name__c',
    'BirthDate': 'Birth_Date__c',
    'Gender': 'Gender__c',
}
student_data_df = student_data_df.rename(columns=student_field_names)
student_data_df.head()

Unnamed: 0,ID_Student,Student_ID__c,Last_Name__c,First_Name__c,Middle_Name__c,Birth_Date__c,Gender__c
0,33,25004961,Dartling,Heather,Alice,,F
1,34,25003514,Dartling,Lana,Cecille,,F
2,35,25005833,Dartling,Jessica,Dorothy,,F
3,36,25002589,Dartling,Kimberly,Genevieve,,F
4,37,25007185,Dartling,Katherine,Lynnette,,F


In [9]:
# Keep only the columns that are sent to Salesforce; the MySQL surrogate key
# (ID_Student) is left behind.
student_data_df = student_data_df.loc[
    :,
    [
        'Student_ID__c',
        'Last_Name__c',
        'First_Name__c',
        'Middle_Name__c',
        'Birth_Date__c',
        'Gender__c',
    ],
]
student_data_df.head()

Unnamed: 0,Student_ID__c,Last_Name__c,First_Name__c,Middle_Name__c,Birth_Date__c,Gender__c
0,25004961,Dartling,Heather,Alice,,F
1,25003514,Dartling,Lana,Cecille,,F
2,25005833,Dartling,Jessica,Dorothy,,F
3,25002589,Dartling,Kimberly,Genevieve,,F
4,25007185,Dartling,Katherine,Lynnette,,F


In [10]:
# Convert the frame into a list with one {field: value} dict per student.
student_data_records = student_data_df.to_dict(orient='records')
student_data_records

[{'Student_ID__c': '25004961',
  'Last_Name__c': 'Dartling',
  'First_Name__c': 'Heather',
  'Middle_Name__c': 'Alice',
  'Birth_Date__c': None,
  'Gender__c': 'F'},
 {'Student_ID__c': '25003514',
  'Last_Name__c': 'Dartling',
  'First_Name__c': 'Lana',
  'Middle_Name__c': 'Cecille',
  'Birth_Date__c': None,
  'Gender__c': 'F'},
 {'Student_ID__c': '25005833',
  'Last_Name__c': 'Dartling',
  'First_Name__c': 'Jessica',
  'Middle_Name__c': 'Dorothy',
  'Birth_Date__c': None,
  'Gender__c': 'F'},
 {'Student_ID__c': '25002589',
  'Last_Name__c': 'Dartling',
  'First_Name__c': 'Kimberly',
  'Middle_Name__c': 'Genevieve',
  'Birth_Date__c': None,
  'Gender__c': 'F'},
 {'Student_ID__c': '25007185',
  'Last_Name__c': 'Dartling',
  'First_Name__c': 'Katherine',
  'Middle_Name__c': 'Lynnette',
  'Birth_Date__c': None,
  'Gender__c': 'F'},
 {'Student_ID__c': '25006014',
  'Last_Name__c': 'Dartling',
  'First_Name__c': 'Precious',
  'Middle_Name__c': 'Mariette',
  'Birth_Date__c': None,
  'Gender_

## Insert `Student` Records into Salesforce

In [11]:
# Create one Student__c record in Salesforce for each MySQL student row.
# A failure is reported and skipped so that a single bad row does not abort the load.
for rec in student_data_records:

    birth_date = rec['Birth_Date__c']

    record = {
        'Student_ID__c': rec['Student_ID__c'],
        'Last_Name__c': rec['Last_Name__c'],
        'First_Name__c': rec['First_Name__c'],
        'Middle_Name__c': rec['Middle_Name__c'],
        # Send the date as text: str() of a datetime.date is 'YYYY-MM-DD', whereas a
        # raw date object cannot be serialized to JSON. A missing value stays None.
        # NOTE(review): assumes BirthDate arrives as datetime.date or None - confirm.
        'Birth_Date__c': None if pd.isna(birth_date) else str(birth_date),
        'Gender__c': rec['Gender__c']
    }

    try:
        sf.Student__c.create(record)
    except Exception as e:
        # Name the student so a failed row can be traced back to its source record.
        print(f"Student {rec['Student_ID__c']} was not created: {e}")

In [12]:
# Bulk 
#sf.bulk.Student__c.insert(student_data_records)

#### Student object / table on Salesforce
![Salesforce ETL Project - Salesforce Object Student](Images/SF_Object_Student.jpg)

## Create Student Lookup Table

Note that this lookup table is built by querying **Salesforce** (not MySQL), because the record IDs are retrieved from Salesforce.

In [13]:
# Fetch every Student__c record from Salesforce with SOQL (the Salesforce query
# language) and pair its `Name` with its Student_ID__c.
# `Name` is stored as ID_Student__c, the value later sent as the student reference
# when the class participants are inserted.
student_rows = sf.query_all_iter("SELECT Student_ID__c, Name FROM Student__c")

student_lookup_list = [
    {
        'ID_Student__c': row['Name'],  # this is a critical line of code
        'Student_ID__c': row['Student_ID__c'],
    }
    for row in student_rows
]

student_lookup_list

[{'ID_Student__c': 'a054x0000010eyb', 'Student_ID__c': '25007333'},
 {'ID_Student__c': 'a054x0000010exs', 'Student_ID__c': '25002311'},
 {'ID_Student__c': 'a054x0000010eyR', 'Student_ID__c': '25003936'},
 {'ID_Student__c': 'a054x0000010eyM', 'Student_ID__c': '25003773'},
 {'ID_Student__c': 'a054x0000010eyH', 'Student_ID__c': '25002325'},
 {'ID_Student__c': 'a054x0000010exn', 'Student_ID__c': '25003605'},
 {'ID_Student__c': 'a054x0000010erw', 'Student_ID__c': '25002056'},
 {'ID_Student__c': 'a054x0000010ey2', 'Student_ID__c': '25002714'},
 {'ID_Student__c': 'a054x0000010eup', 'Student_ID__c': '25002876'},
 {'ID_Student__c': 'a054x0000010etX', 'Student_ID__c': '25006805'},
 {'ID_Student__c': 'a054x0000010ey7', 'Student_ID__c': '25005284'},
 {'ID_Student__c': 'a054x0000010eio', 'Student_ID__c': '25005250'},
 {'ID_Student__c': 'a054x0000010exi', 'Student_ID__c': '25003778'},
 {'ID_Student__c': 'a054x0000010euB', 'Student_ID__c': '25002555'},
 {'ID_Student__c': 'a054x0000010exJ', 'Student_I

In [14]:
# Turn the list of lookup dicts into a DataFrame so it can be merged later.
student_lookup_df = pd.DataFrame(data=student_lookup_list)
student_lookup_df.head(5)

Unnamed: 0,ID_Student__c,Student_ID__c
0,a054x0000010eyb,25007333
1,a054x0000010exs,25002311
2,a054x0000010eyR,25003936
3,a054x0000010eyM,25003773
4,a054x0000010eyH,25002325


## Create Class & Course Joint Lookup Table

Note that these lookup tables are built by querying **Salesforce** (not MySQL), because the record IDs are retrieved from Salesforce.

In [15]:
# Fetch every Course__c record from Salesforce with SOQL (the Salesforce query
# language) and pair its `Name` with its Course_Code__c.
course_rows = sf.query_all_iter("SELECT Course_Code__c, Name FROM Course__c")

course_lookup_list = [
    {
        'ID_Course__c': row['Name'],  # this is a critical line of code
        # Course_Code__c is one of the two keys used to match the MySQL class
        # participant rows against the class/course lookup table.
        'Course_Code__c': row['Course_Code__c'],
    }
    for row in course_rows
]

course_lookup_list

[{'ID_Course__c': 'a004x000003VTav', 'Course_Code__c': 'BC-UIUX'},
 {'ID_Course__c': 'a004x000003VTbF', 'Course_Code__c': 'CIS-430'},
 {'ID_Course__c': 'a004x000003VTb5', 'Course_Code__c': 'CIS-405'},
 {'ID_Course__c': 'a004x000003VTbA', 'Course_Code__c': 'CIS-438'},
 {'ID_Course__c': 'a004x000003VTal', 'Course_Code__c': 'BC-DATAVIZ'},
 {'ID_Course__c': 'a004x000003VTb0', 'Course_Code__c': 'CIS-349'},
 {'ID_Course__c': 'a004x000003VTbK', 'Course_Code__c': 'CIS-435'},
 {'ID_Course__c': 'a004x000003VTaq', 'Course_Code__c': 'BC-WEBDEV'}]

In [16]:
# Turn the list of course lookup dicts into a DataFrame so it can be merged later.
course_lookup_df = pd.DataFrame(data=course_lookup_list)
course_lookup_df

Unnamed: 0,ID_Course__c,Course_Code__c
0,a004x000003VTav,BC-UIUX
1,a004x000003VTbF,CIS-430
2,a004x000003VTb5,CIS-405
3,a004x000003VTbA,CIS-438
4,a004x000003VTal,BC-DATAVIZ
5,a004x000003VTb0,CIS-349
6,a004x000003VTbK,CIS-435
7,a004x000003VTaq,BC-WEBDEV


In [17]:
# Fetch every Class__c record from Salesforce with SOQL (the Salesforce query
# language), keeping its `Name`, its course reference and its section.
class_rows = sf.query_all_iter("SELECT Name, ID_Course__c, Section__c FROM Class__c")

class_lookup_list = []
for row in class_rows:
    course_id = row['ID_Course__c']
    class_lookup_list.append({
        'ID_Class__c': row['Name'],  # this is a critical line of code
        # Workaround: the course reference comes back 3 characters longer than the
        # 15-character values in the course lookup table, so keep only the first 15
        # characters to make the two joinable. Slicing to a fixed length (rather than
        # stripping 3 characters) leaves an already-short ID untouched, and a class
        # without a course keeps its empty value instead of raising TypeError.
        # NOTE(review): presumably this is the 18- vs 15-character Salesforce ID form - confirm.
        'ID_Course__c': course_id[:15] if course_id else course_id,
        'Section__c': row['Section__c'],
    })

class_lookup_list

[{'ID_Class__c': 'a014x000008WraB',
  'ID_Course__c': 'a004x000003VTal',
  'Section__c': 'GWDC201805DATA3'},
 {'ID_Class__c': 'a014x000008Wra6',
  'ID_Course__c': 'a004x000003VTal',
  'Section__c': 'GWU-ARL-DATA-PT-09-0'},
 {'ID_Class__c': 'a014x000008WraL',
  'ID_Course__c': 'a004x000003VTav',
  'Section__c': 'GWARL201905UIUX3'},
 {'ID_Class__c': 'a014x000008WraG',
  'ID_Course__c': 'a004x000003VTaq',
  'Section__c': 'GWARL201905WEB3'}]

In [18]:
# Turn the list of class lookup dicts into a DataFrame so it can be merged later.
class_lookup_df = pd.DataFrame(data=class_lookup_list)
class_lookup_df

Unnamed: 0,ID_Class__c,ID_Course__c,Section__c
0,a014x000008WraB,a004x000003VTal,GWDC201805DATA3
1,a014x000008Wra6,a004x000003VTal,GWU-ARL-DATA-PT-09-0
2,a014x000008WraL,a004x000003VTav,GWARL201905UIUX3
3,a014x000008WraG,a004x000003VTaq,GWARL201905WEB3


In [19]:
# Attach each class to its course (inner join on the shared ID_Course__c), giving one
# row per class with its course code and section.
class_course_joint_lookup_df = course_lookup_df.merge(
    class_lookup_df,
    how='inner',
    on='ID_Course__c',
)
class_course_joint_lookup_df

Unnamed: 0,ID_Course__c,Course_Code__c,ID_Class__c,Section__c
0,a004x000003VTav,BC-UIUX,a014x000008WraL,GWARL201905UIUX3
1,a004x000003VTal,BC-DATAVIZ,a014x000008WraB,GWDC201805DATA3
2,a004x000003VTal,BC-DATAVIZ,a014x000008Wra6,GWU-ARL-DATA-PT-09-0
3,a004x000003VTaq,BC-WEBDEV,a014x000008WraG,GWARL201905WEB3


## Create Class Participant DataFrame

#### Class Participant table on MySQL
![Salesforce ETL Project - MySQL Table Class Participant](Images/MySQL_Table_Class_Participant.jpg)

In [20]:
# Query the `Class Participant` table from MySQL.
# The inner joins add StudentID (student), ID_Course and Section (class) and
# CourseCode (course) to every classparticipant column selected by `cp.*`.
query = '''
    SELECT 
        s.StudentID,        
        cl.ID_Course,
        co.CourseCode,
        cl.Section,
        cp.*
    FROM 
        classparticipant cp
        INNER JOIN student s
        ON cp.ID_Student = s.ID_Student
        INNER JOIN class cl
        ON cp.ID_Class = cl.ID_Class
        INNER JOIN course co
        ON cl.ID_Course = co.ID_Course

'''

# Run the query over the open MySQL connection and preview the first rows.
class_participant_data_df = pd.read_sql(query, conn)
class_participant_data_df.head()

Unnamed: 0,StudentID,ID_Course,CourseCode,Section,ID_ClassParticipant,ID_Student,ID_Class,StartDate,EndDate
0,25004961,1,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,1,33,1,2020-09-16,
1,25003514,1,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2,34,1,2020-09-16,
2,25005833,1,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,3,35,1,2020-09-16,
3,25007334,1,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,4,62,1,2020-09-16,
4,25002589,1,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,5,36,1,2020-09-16,


In [21]:
# Remove the MySQL surrogate keys; the Salesforce keys are looked up further below.
class_participant_data_df = class_participant_data_df.drop(
    columns=['ID_Course', 'ID_ClassParticipant', 'ID_Student', 'ID_Class']
)

class_participant_data_df.head()

Unnamed: 0,StudentID,CourseCode,Section,StartDate,EndDate
0,25004961,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,
1,25003514,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,
2,25005833,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,
3,25007334,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,
4,25002589,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,


In [22]:
# Translate the MySQL column names into the field names used on the Salesforce side.
class_participant_field_names = {
    'StudentID': 'Student_ID__c',
    'Section': 'Section__c',
    'CourseCode': 'Course_Code__c',
    'StartDate': 'Start_Date__c',
    'EndDate': 'End_Date__c',
}
class_participant_data_df = class_participant_data_df.rename(
    columns=class_participant_field_names
)

class_participant_data_df.head()

Unnamed: 0,Student_ID__c,Course_Code__c,Section__c,Start_Date__c,End_Date__c
0,25004961,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,
1,25003514,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,
2,25005833,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,
3,25007334,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,
4,25002589,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,


## Join the Class Participant DataFrame with the Course & Class Joint Lookup Table
This join is necessary to look up the foreign key for the Class table.

In [23]:
# Match each participant row to its Salesforce class via section + course code
# (inner join), which brings in ID_Class__c and ID_Course__c.
class_participant_data_df1 = class_participant_data_df.merge(
    class_course_joint_lookup_df,
    how='inner',
    on=['Section__c', 'Course_Code__c'],
)

class_participant_data_df1.head()

Unnamed: 0,Student_ID__c,Course_Code__c,Section__c,Start_Date__c,End_Date__c,ID_Course__c,ID_Class__c
0,25004961,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,,a004x000003VTal,a014x000008Wra6
1,25003514,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,,a004x000003VTal,a014x000008Wra6
2,25005833,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,,a004x000003VTal,a014x000008Wra6
3,25007334,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,,a004x000003VTal,a014x000008Wra6
4,25002589,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,,a004x000003VTal,a014x000008Wra6


## Join the Class Participant DataFrame with the Student Lookup Table
This join is necessary to look up the foreign key for the Student table.

In [24]:
# Match each participant row to its Salesforce student via Student_ID__c
# (inner join), which brings in ID_Student__c.
class_participant_data_df2 = class_participant_data_df1.merge(
    student_lookup_df,
    how='inner',
    on='Student_ID__c',
)

class_participant_data_df2.head()

Unnamed: 0,Student_ID__c,Course_Code__c,Section__c,Start_Date__c,End_Date__c,ID_Course__c,ID_Class__c,ID_Student__c
0,25004961,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,,a004x000003VTal,a014x000008Wra6,a054x0000010exJ
1,25003514,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,,a004x000003VTal,a014x000008Wra6,a054x0000010esj
2,25005833,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,,a004x000003VTal,a014x000008Wra6,a054x0000010erJ
3,25007334,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,,a004x000003VTal,a014x000008Wra6,a054x0000010eoX
4,25002589,BC-DATAVIZ,GWU-ARL-DATA-PT-09-0,2020-09-16,,a004x000003VTal,a014x000008Wra6,a054x0000010exO


In [25]:
# Keep only the fields that are sent to Salesforce for a class participant:
# the two dates plus the class and student references.
# `drop` returns a new frame here instead of mutating an alias in place, so
# `class_participant_data_df2` stays intact and this cell can safely be re-run.
class_participant_data_df = class_participant_data_df2.drop(
    columns=['Student_ID__c', 'Course_Code__c', 'Section__c', 'ID_Course__c']
)

class_participant_data_df.head()

Unnamed: 0,Start_Date__c,End_Date__c,ID_Class__c,ID_Student__c
0,2020-09-16,,a014x000008Wra6,a054x0000010exJ
1,2020-09-16,,a014x000008Wra6,a054x0000010esj
2,2020-09-16,,a014x000008Wra6,a054x0000010erJ
3,2020-09-16,,a014x000008Wra6,a054x0000010eoX
4,2020-09-16,,a014x000008Wra6,a054x0000010exO


In [26]:
# Convert the frame into a list with one {field: value} dict per class participant.
class_participant_data_records = class_participant_data_df.to_dict('records')
class_participant_data_records

[{'Start_Date__c': datetime.date(2020, 9, 16),
  'End_Date__c': None,
  'ID_Class__c': 'a014x000008Wra6',
  'ID_Student__c': 'a054x0000010exJ'},
 {'Start_Date__c': datetime.date(2020, 9, 16),
  'End_Date__c': None,
  'ID_Class__c': 'a014x000008Wra6',
  'ID_Student__c': 'a054x0000010esj'},
 {'Start_Date__c': datetime.date(2020, 9, 16),
  'End_Date__c': None,
  'ID_Class__c': 'a014x000008Wra6',
  'ID_Student__c': 'a054x0000010erJ'},
 {'Start_Date__c': datetime.date(2020, 9, 16),
  'End_Date__c': None,
  'ID_Class__c': 'a014x000008Wra6',
  'ID_Student__c': 'a054x0000010eoX'},
 {'Start_Date__c': datetime.date(2020, 9, 16),
  'End_Date__c': None,
  'ID_Class__c': 'a014x000008Wra6',
  'ID_Student__c': 'a054x0000010exO'},
 {'Start_Date__c': datetime.date(2020, 9, 16),
  'End_Date__c': None,
  'ID_Class__c': 'a014x000008Wra6',
  'ID_Student__c': 'a054x0000010exT'},
 {'Start_Date__c': datetime.date(2020, 9, 16),
  'End_Date__c': None,
  'ID_Class__c': 'a014x000008Wra6',
  'ID_Student__c': 'a054

## Insert `Class Participant` Records into Salesforce

In [27]:
# Create one Class_Participant__c record in Salesforce for each joined row.
# A failure is reported and skipped so that a single bad row does not abort the load.
for rec in class_participant_data_records:

    start_date = rec['Start_Date__c']
    end_date = rec['End_Date__c']

    record = {
        'ID_Student__c': rec['ID_Student__c'],
        'ID_Class__c': rec['ID_Class__c'],
        # Send both dates as text: str() of a datetime.date is 'YYYY-MM-DD', whereas a
        # raw date object cannot be serialized to JSON. A missing date stays None so it
        # is not sent as the literal string 'None'.
        'Start_Date__c': None if pd.isna(start_date) else str(start_date),
        'End_Date__c': None if pd.isna(end_date) else str(end_date),
    }

    try:
        # Object name spelled as in the delete section (`Class_Participant__c`).
        sf.Class_Participant__c.create(record)
    except Exception as e:
        # Name the student/class pair so a failed row can be traced back.
        print(
            f"Class participant (student {rec['ID_Student__c']}, "
            f"class {rec['ID_Class__c']}) was not created: {e}"
        )

In [28]:
# Bulk 
#sf.bulk.Class_Participant__c.insert(class_participant_data_records)

#### Class Participant object / table on Salesforce
![Salesforce ETL Project - Salesforce Object Class Participant](Images/SF_Object_Class_Participant.jpg)

## Example of Deleting Records

Select the IDs of the records first and then process the results.

Ultimately, you want a list of IDs in the end.


#### Delete Student object/table records in Salesforce databootcamp/gwsis

In [None]:
# Collect the Id of every Student__c record as a list of {'Id': ...} dicts.
# `query_all` keeps fetching until the whole result set is retrieved, whereas `query`
# returns only the first batch and would leave later records undeleted.
student_records = sf.query_all("SELECT Id FROM Student__c")
recs_to_delete = [{'Id': r['Id']} for r in student_records['records']]
recs_to_delete

In [None]:
# sf.bulk.Student__c.delete(recs_to_delete)

In [None]:
# Delete the collected Student__c records one at a time; a failure is printed
# and the loop carries on with the next record.
for entry in recs_to_delete:
    try:
        record_id = entry['Id']
        sf.Student__c.delete(record_id)
    except Exception as err:
        print(err)

#### Delete Class Participant object/table records in Salesforce databootcamp/gwsis

In [None]:
# Collect the Id of every Class_Participant__c record as a list of {'Id': ...} dicts.
# `query_all` keeps fetching until the whole result set is retrieved, whereas `query`
# returns only the first batch and would leave later records undeleted.
# Note: this rebinds `recs_to_delete`, which the Student section above also uses.
class_participant_records = sf.query_all("SELECT Id FROM Class_Participant__c")
recs_to_delete = [{'Id': r['Id']} for r in class_participant_records['records']]
recs_to_delete

In [None]:
# sf.bulk.Class_Participant__c.delete(recs_to_delete)

In [None]:
# Delete the collected Class_Participant__c records one at a time; a failure is
# printed and the loop carries on with the next record.
for entry in recs_to_delete:
    try:
        record_id = entry['Id']
        sf.Class_Participant__c.delete(record_id)
    except Exception as err:
        print(err)