In [None]:
## Visual Flow for DataShip Pipeline

# 1. Authenticate (Colab, Drive, Sheets)
        # ↓
# 2. Access Validation (Permissions, Owner Check)
        # ↓
# 3. Extract Raw Data (Sheets, Drive)
        # ↓
# 4. Clean Data (Missing values, duplicates, wrong formats)
        # ↓
# 5. Transform & Aggregate (KPI Calculations, Rollups, Metrics)
        # ↓
# 6. Output Final Data (Google Sheets, Looker Dashboard)
        # ↓
# 7. Automate Everything (Schedule / Notify / Refresh)

## Install Necessary Packages and Libraries

In [None]:
!pip install emoji
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [None]:
# import Libraries
import pandas as pd
import numpy as np
import re

In [15]:
# Google Colab & Google API imports

''' These libraries are needed to authenticate with Google and interact with Google Sheets and Google Drive. '''

from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.auth.transport.requests import Request
import gspread
from gspread.exceptions import SpreadsheetNotFound
from google.oauth2.credentials import Credentials
from google.auth import default
from gspread.exceptions import SpreadsheetNotFound, WorksheetNotFound


## Authentication for Google Services and Connect to Data Sources


→ This code handles Google authentication inside a Google Colab notebook so that the user can programmatically access the project's Google Drive folder (e.g. 12 - Dataship Client Success Reporting) and its Google Sheets data (e.g. 00 - Mentee Data).

This is common in projects like DataShip Client Success Reporting, where much of the data is stored in Google Sheets/Drive and interaction with it needs to be automated.

In [None]:
# Authentication for Google Services setup (specifically Google Drive and Google Sheets)

# 1. Authenticate the user in Google Colab
auth.authenticate_user()

# 2. Get the Google credentials made available by the authentication step
import google.auth
creds, _ = google.auth.default()

# 3. Authorize the Google Sheets client (read, write and update sheets through gspread)
gc = gspread.authorize(creds)

# 4. Initialize the Google Drive API client.
#    The credentials are passed explicitly so that Drive and Sheets calls are
#    guaranteed to run as the same identity instead of relying on implicit
#    default-credential discovery inside build().
service = build('drive', 'v3', credentials=creds)

# 5. Look up the email address of the authenticated user
about = service.about().get(fields="user").execute()
user_email = about['user']['emailAddress']

print(f"Authenticated User Email: {user_email}")


Authenticated User Email: abubakaralfaki@gmail.com


## Checking folder permissions

In [None]:
def check_folder_access(service, folder_id, user_email):
    """
    Check folder owner, detailed permissions, and whether authenticated user has access.

    Prints the owner list and every permission entry, then reports whether
    ``user_email`` is an owner or is explicitly listed as a user permission.
    Email addresses are compared case-insensitively, because the same account
    can be reported with different capitalisation in the owner list and in the
    permission list.

    Args:
        service: Drive API client exposing ``files().get(...)`` and
            ``permissions().list(...)``.
        folder_id: ID of the Drive folder to inspect.
        user_email: Email address of the authenticated user.

    Returns:
        dict with keys:
            'owners'            - list of owner email addresses,
            'users_with_access' - emails of permissions whose type is 'user',
            'permissions'       - the raw permission entries,
            'has_direct_access' - True when ``user_email`` is an owner or is
                                  explicitly listed (case-insensitive).
        An empty dict is returned if any step raises.
    """

    try:
        # Get Folder Owner (entries without an email address are skipped
        # instead of aborting the whole report)
        folder_metadata = service.files().get(fileId=folder_id, fields='owners').execute()
        owners = [
            owner['emailAddress']
            for owner in folder_metadata.get('owners', [])
            if owner.get('emailAddress')
        ]

        # Get Permissions
        permissions = service.permissions().list(fileId=folder_id, fields='permissions').execute()
        permission_entries = permissions.get('permissions', [])

        print("\nFolder Owner(s):")
        for owner_email in owners:
            print(owner_email)

        print("\nPermissions List (Type & Details):")
        users_with_access = []
        for permission in permission_entries:
            print(f"Type: {permission.get('type')} | Role: {permission.get('role')} | Email: {permission.get('emailAddress', 'N/A')}")
            # Grab emails only if it's a user
            if permission.get('type') == 'user' and permission.get('emailAddress'):
                users_with_access.append(permission['emailAddress'])

        # Check if user is Owner or Explicitly Shared.
        # casefold() makes 'Jane@x.com' and 'jane@x.com' compare equal.
        normalized_user = (user_email or '').strip().casefold()
        known_emails = {email.strip().casefold() for email in owners + users_with_access}
        has_direct_access = bool(normalized_user) and normalized_user in known_emails

        if has_direct_access:
            print(f"\nAuthenticated user {user_email} has direct access to the folder.")
        else:
            print(f"\nAuthenticated user {user_email} might have indirect access (link/domain share), but is not directly listed.")

        return {
            'owners': owners,
            'users_with_access': users_with_access,
            'permissions': permission_entries,
            'has_direct_access': has_direct_access,
        }

    except Exception as e:
        # Best-effort diagnostic helper: report the problem and return an
        # empty result rather than stopping the notebook.
        print(f"An error occurred: {e}")
        return {}


In [None]:
# ID of the Google Drive folder whose sharing settings are inspected
folder_id = '1r2EiMWNFVpV9WNsNwvu79WRuAHmzxKNs'

# Run access check: prints owners and permissions, and keeps the returned dict in `users`
users = check_folder_access(service, folder_id, user_email)


Folder Owner(s):
jaretandre1869@gmail.com

Permissions List (Type & Details):
Type: user | Role: writer | Email: imrelor@gmail.com
Type: user | Role: commenter | Email: daniel.r.marino1@gmail.com
Type: user | Role: commenter | Email: stern.elliot@gmail.com
Type: user | Role: writer | Email: alpha@avebagroup.com
Type: user | Role: writer | Email: vaibhavjain38@gmail.com
Type: user | Role: writer | Email: HanadSharmarke@gmail.com
Type: user | Role: writer | Email: jda.digital.corp.ltd@gmail.com
Type: user | Role: writer | Email: oluwatoke50@gmail.com
Type: user | Role: commenter | Email: amnaa.msvirtuals@gmail.com
Type: user | Role: commenter | Email: linetbaraka@gmail.com
Type: user | Role: writer | Email: jaretdandre@gmail.com
Type: user | Role: writer | Email: thealphadigitalph@gmail.com
Type: user | Role: commenter | Email: kushluthra15@gmail.com
Type: user | Role: writer | Email: abubakaralfaki@gmail.com
Type: user | Role: owner | Email: Jaretandre1869@gmail.com
Type: user | Role: 

# Extract Google Sheet data (00 - Mentee Data)

In [16]:
def replicate_google_sheet(gc, drive_service, original_sheet_id, new_sheet_name, target_folder_id):
    """
    Replicate a Google Sheet from a source to a clean working copy.

    The copy is looked up by title; it is created when it does not exist or
    when the existing one is in the trash. Every worksheet of the source is
    then written into a worksheet of the same title in the copy (cleared first
    if it already exists), and the copy is placed in ``target_folder_id``.

    Args:
        gc: gspread client used to open/create spreadsheets.
        drive_service: Drive API client used to read the trashed flag and
            parents of the copy and to move it.
        original_sheet_id: Key of the source spreadsheet.
        new_sheet_name: Title of the working copy.
        target_folder_id: ID of the Drive folder the copy must end up in.

    Returns:
        None. Progress is reported with print().

    Raises:
        Whatever the gspread / Drive clients raise, except that a missing copy
        (SpreadsheetNotFound) and missing worksheets (WorksheetNotFound) are
        handled here.
    """

    try:
        # Open the copy once and reuse the handle for the trash check.
        new_sheet = gc.open(new_sheet_name)
        file = drive_service.files().get(fileId=new_sheet.id, fields='trashed').execute()
        if file.get('trashed'):
            print(f"The sheet '{new_sheet_name}' is in the trash. Creating a new one.")
            new_sheet = gc.create(new_sheet_name)
        else:
            print(f"The sheet '{new_sheet_name}' already exists. Updating its content.")

    except SpreadsheetNotFound:
        new_sheet = gc.create(new_sheet_name)
        print(f"The sheet '{new_sheet_name}' has been created.")

    # Open original sheet
    original_sheet = gc.open_by_key(original_sheet_id)

    # Titles copied from the source; used below so that a genuine source tab
    # is never removed by the 'Sheet1' clean-up.
    source_titles = set()

    for ws in original_sheet.worksheets():
        source_titles.add(ws.title)
        try:
            existing_ws = new_sheet.worksheet(ws.title)
            print(f"Worksheet '{ws.title}' found in '{new_sheet_name}'. Updating its content.")
            existing_ws.clear()
        except WorksheetNotFound:
            existing_ws = new_sheet.add_worksheet(title=ws.title, rows=ws.row_count, cols=ws.col_count)
            print(f"Worksheet '{ws.title}' created in '{new_sheet_name}'.")

        # Update values properly to avoid DeprecationWarning
        values = ws.get_all_values()
        existing_ws.update(range_name='A1', values=values)

    # Move to target folder.
    # Only parents other than the target are removed, and the target is only
    # added when missing, so a re-run on a copy that is already in place does
    # not add and remove the same folder in one request.
    file_metadata = drive_service.files().get(fileId=new_sheet.id, fields='parents').execute()
    current_parents = file_metadata.get('parents', [])
    stale_parents = [parent for parent in current_parents if parent != target_folder_id]
    needs_target = target_folder_id not in current_parents

    if needs_target or stale_parents:
        move_kwargs = {'fileId': new_sheet.id}
        if needs_target:
            move_kwargs['addParents'] = target_folder_id
        if stale_parents:
            move_kwargs['removeParents'] = ",".join(stale_parents)
        drive_service.files().update(**move_kwargs).execute()

    print(f"The sheet '{new_sheet_name}' is up-to-date and in the specified folder.")

    # Delete 'Sheet1' if it exists and is not one of the source worksheets
    # (presumably the default tab of a newly created spreadsheet - TODO confirm).
    if 'Sheet1' in source_titles:
        print("Sheet1 is part of the source sheet. Keeping it.")
    else:
        try:
            sheet1 = new_sheet.worksheet('Sheet1')
            new_sheet.del_worksheet(sheet1)
            print("Sheet1 deleted successfully.")
        except WorksheetNotFound:
            print("Sheet1 not found. No changes made.")


In [17]:
# Refresh the 'mentee_data_copy' working copy from the source sheet and
# place it in the target Drive folder.
replicate_google_sheet(
    gc,
    service,
    original_sheet_id='1hisTEqOD4VBJFB06kxnAHcCU02DuQ5moPcARELcTxV8',
    new_sheet_name='mentee_data_copy',
    target_folder_id='1QEDqJYty4_ZAPBRGrhYbdODXHTPYcO34',
)

The sheet 'mentee_data_copy' already exists. Updating its content.
Worksheet 'Overview' found in 'mentee_data_copy'. Updating its content.
Worksheet 'Client Overview' found in 'mentee_data_copy'. Updating its content.
Worksheet 'Client Job Search Data' found in 'mentee_data_copy'. Updating its content.
Worksheet 'Booked Calls' found in 'mentee_data_copy'. Updating its content.
Worksheet 'To deleteMentor Booked Calls' found in 'mentee_data_copy'. Updating its content.
Worksheet 'Client Module Progress' found in 'mentee_data_copy'. Updating its content.
Worksheet 'Module Feedback' found in 'mentee_data_copy'. Updating its content.
Worksheet 'Calendly Feedback' found in 'mentee_data_copy'. Updating its content.
Worksheet 'Call Feedback' found in 'mentee_data_copy'. Updating its content.
Worksheet 'Pulse Check Experiment' found in 'mentee_data_copy'. Updating its content.
Worksheet 'Graduation Survey' found in 'mentee_data_copy'. Updating its content.
Worksheet 'Cancellation Survery' found