Tyler Beaulieu  
Tim Paylor
DS5110, Fall 2025  
December 9, 2025  

<h4 style="text-align:center;">Final Project</h4>

In [10]:
#Initial Imports
import os
import pandas as pd
from dotenv import load_dotenv
from googleapiclient.discovery import build

### Phase 1: Look at file types.

In [None]:
#Load Google Drive API Key from .env file
load_dotenv()
API_KEY = os.getenv('GOOGLE_DRIVE_API_KEY')
if not API_KEY:
    raise ValueError("Please set GOOGLE_DRIVE_API_KEY")

#List files in Google Drive
service = build('drive', 'v3', developerKey=API_KEY)
folder_id = '12qMZKDEWn71JrN8Het5-a_NUWV0eE1ZF'

#Get the name of a folder by its ID
def get_folder_name(service, folder_id):
    try:
        folder = service.files().get(fileId=folder_id, fields='name').execute()
        return folder.get('name', '')
    except:
        return ''

#Return a list of dictionaries with file information for all files and folders
def list_files_recursively(service, folder_id, current_path="", parent_folder=""):
    all_files = []
    
    # List all items in the current folder
    results = service.files().list(
        q=f"'{folder_id}' in parents",
        pageSize=1000,
        fields="files(id, name, mimeType, size, modifiedTime)"
    ).execute()
    
    items = results.get('files', [])
    
    for item in items:
        file_info = {
            'id': item['id'],
            'name': item['name'],
            'mimeType': item['mimeType'],
            'size': item.get('size', 'N/A'),
            'modifiedTime': item.get('modifiedTime', 'N/A'),
            'path': current_path,
            'parent': parent_folder
        }
        
        # Check if it's a folder
        if item['mimeType'] == 'application/vnd.google-apps.folder':
            # Add folder to list
            all_files.append(file_info)
            
            # Recursively get files from subfolder
            subfolder_path = os.path.join(current_path, item['name'])
            subfolder_files = list_files_recursively(service, item['id'], subfolder_path, item['name'])
            all_files.extend(subfolder_files)
        else:
            # Add the file
            all_files.append(file_info)
    
    return all_files

# Get the root folder name for the path
parent_folder_name = get_folder_name(service, folder_id)

# Get all files recursively
file_list = list_files_recursively(service, folder_id, parent_folder_name, parent_folder_name)

# Convert to pandas DataFrame
df = pd.DataFrame(file_list)

# Create separate dataframes for folders and files
df_files = df[df['mimeType'] != 'application/vnd.google-apps.folder']
df_folders = df[df['mimeType'] == 'application/vnd.google-apps.folder']

# Display results
print(f"\nFound {len(df)} total items (files and folders)")
print(f"\n# Files: {len(df_files)}")
print(f"\n# Folders: {len(df_folders)}")


In [None]:
#We look at our folders from the dataset.
df_folders.head(10)

Unnamed: 0,id,name,mimeType,size,modifiedTime,path,parent
0,11zxmvcEq_l1d3GgszcV63Xnk5rmuJw7j,Engineering Documents,application/vnd.google-apps.folder,,2025-11-19T21:30:03.158Z,MPA,MPA
1,1Xw0LX4fdiEcS4rgt-cINK5TonnnNpOHG,2023 Reefer Yard Relocation,application/vnd.google-apps.folder,,2025-11-19T21:30:05.299Z,MPA/Engineering Documents,Engineering Documents
4,1xwHiq1adtOcWU5eX7rrPJNg4KdTViCb4,AS BUILTS - USE THIS FOLDER,application/vnd.google-apps.folder,,2025-11-19T21:30:05.272Z,MPA/Engineering Documents,Engineering Documents
5,1QiK8SGhrziEkduFONVkX1B0JqgKB2r79,IMT H2O SEWER Asbuilts C-08,application/vnd.google-apps.folder,,2025-11-19T21:30:06.287Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,AS BUILTS - USE THIS FOLDER
13,1oAGBfPM8VSmCyy81a3piWGAA2jB1xf1z,IMT GAS Asbuilts C-08,application/vnd.google-apps.folder,,2025-11-19T21:30:06.285Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,AS BUILTS - USE THIS FOLDER
21,1qTrvSkK3sRuB-rXCGnSl4fIgEWIygx7O,IMT AsBuilt RR 12_2015,application/vnd.google-apps.folder,,2025-11-19T21:30:06.127Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,AS BUILTS - USE THIS FOLDER
22,1U70yEKfLWcdV2Mu9EHngnHn7RQ0JM4Tz,Railroad Redline pdfs,application/vnd.google-apps.folder,,2025-11-19T21:30:06.759Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT AsBuilt RR 12_2015
54,1ucu39hCx99VlOauWxc8zNEmjYZvDFv_1,IMT AsBuilt Fencing 12_2015,application/vnd.google-apps.folder,,2025-11-19T21:30:06.124Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,AS BUILTS - USE THIS FOLDER
55,1B2m3D-GIlJW4TvnyQmASQFllFjQPB6xL,Redline Fencing pdf,application/vnd.google-apps.folder,,2025-11-19T21:30:06.763Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT AsBuilt Fencing 12_2015
72,1y_v1wVz886na3MwQr8vAE5RiqgvfNQFM,IMT AsBuilt Electrical 12_2015,application/vnd.google-apps.folder,,2025-11-19T21:30:05.743Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,AS BUILTS - USE THIS FOLDER


In [None]:
#We look at our files from the dataset.
df_files.head(10)

Unnamed: 0,id,name,mimeType,size,modifiedTime,path,parent
2,15WbZHuFUhe79g-cm7sQp-0v5EZSH0NMQ,E2X93900_Draft 60% Review Set_2023.01.20.pdf,application/pdf,12869342,2025-11-19T18:58:43.000Z,MPA/Engineering Documents/2023 Reefer Yard Rel...,2023 Reefer Yard Relocation
3,1kPD2Jg3TSk1mfqYGqVozfVUqvEZvZvBJ,E2X93900_60% Drawings_20230224.pdf,application/pdf,12652508,2025-11-19T18:58:43.000Z,MPA/Engineering Documents/2023 Reefer Yard Rel...,2023 Reefer Yard Relocation
6,1gI7VVGbaQjP-6T0xB-OWOdwW6Ae2FCxT,Image C08 site plan.tif,image/tiff,1209995,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08
7,16gh23fs6ID6WSIxk4nrkbJO-1CM_a0HZ,IMT AsBuilts H2O and Sanitary.pdf,application/pdf,4843831,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08
8,1E7TYkQjsML1ckSFMJE-oThzGYtBhI7uH,IMT AsBuilts H2O and Sanitary.bak,application/octet-stream,7165614,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08
9,1dJST5hK9G_4QjAoxy2X6fWIg445SI-fF,IMT AsBuilts H2O and Sanitary.ini,application/octet-stream,34,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08
10,1INTRSX59xNSczsQWpZ7zAsCBXauCQIIG,H2O and Sanitary Asbuilt Points.csv,text/csv,3623,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08
11,1RnnmegRZIrUbrKpsgjtkkJ8OiU5bu4xm,IMT AsBuilts H2O and Sanitary.dwg,image/vnd.dwg,7493239,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08
12,1kNv8fTffolaPOklDiOXW31DhDn4mdh6n,Shaw Brothers logo 1x1.jpg,image/jpeg,18601,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08
14,1TRI-GgGQOKSIw_uiyJyPc85SHN6l-jGW,sbc IMT GAS Asbuilts.pdf,application/pdf,4873136,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT GAS Asbuilts C-08


In [None]:
#Look at the top file types
df_files.groupby('mimeType')['id'].count().sort_values(ascending=False)

mimeType
application/msword                                                           2820
application/pdf                                                              1739
image/jpeg                                                                   1497
application/vnd.ms-excel                                                      698
application/vnd.openxmlformats-officedocument.wordprocessingml.document       533
application/vnd.ms-powerpoint                                                 345
image/vnd.dwg                                                                 251
application/octet-stream                                                      184
application/vnd.openxmlformats-officedocument.presentationml.presentation      94
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet              85
application/vnd.ms-outlook                                                     65
image/tiff                                                                     57
text/cs

For our files, lets divide type and subtype from mimeType

In [None]:
df_files[['type', 'subtype']] = df['mimeType'].str.split('/', expand=True)
df_files.head(10)

# Export results to csv
df_files.to_csv('df_files.csv', index = False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_files[['type', 'subtype']] = df['mimeType'].str.split('/', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_files[['type', 'subtype']] = df['mimeType'].str.split('/', expand=True)


Unnamed: 0,id,name,mimeType,size,modifiedTime,path,parent,type,subtype
2,15WbZHuFUhe79g-cm7sQp-0v5EZSH0NMQ,E2X93900_Draft 60% Review Set_2023.01.20.pdf,application/pdf,12869342,2025-11-19T18:58:43.000Z,MPA/Engineering Documents/2023 Reefer Yard Rel...,2023 Reefer Yard Relocation,application,pdf
3,1kPD2Jg3TSk1mfqYGqVozfVUqvEZvZvBJ,E2X93900_60% Drawings_20230224.pdf,application/pdf,12652508,2025-11-19T18:58:43.000Z,MPA/Engineering Documents/2023 Reefer Yard Rel...,2023 Reefer Yard Relocation,application,pdf
6,1gI7VVGbaQjP-6T0xB-OWOdwW6Ae2FCxT,Image C08 site plan.tif,image/tiff,1209995,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08,image,tiff
7,16gh23fs6ID6WSIxk4nrkbJO-1CM_a0HZ,IMT AsBuilts H2O and Sanitary.pdf,application/pdf,4843831,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08,application,pdf
8,1E7TYkQjsML1ckSFMJE-oThzGYtBhI7uH,IMT AsBuilts H2O and Sanitary.bak,application/octet-stream,7165614,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08,application,octet-stream
9,1dJST5hK9G_4QjAoxy2X6fWIg445SI-fF,IMT AsBuilts H2O and Sanitary.ini,application/octet-stream,34,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08,application,octet-stream
10,1INTRSX59xNSczsQWpZ7zAsCBXauCQIIG,H2O and Sanitary Asbuilt Points.csv,text/csv,3623,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08,text,csv
11,1RnnmegRZIrUbrKpsgjtkkJ8OiU5bu4xm,IMT AsBuilts H2O and Sanitary.dwg,image/vnd.dwg,7493239,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08,image,vnd.dwg
12,1kNv8fTffolaPOklDiOXW31DhDn4mdh6n,Shaw Brothers logo 1x1.jpg,image/jpeg,18601,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT H2O SEWER Asbuilts C-08,image,jpeg
14,1TRI-GgGQOKSIw_uiyJyPc85SHN6l-jGW,sbc IMT GAS Asbuilts.pdf,application/pdf,4873136,2025-11-19T18:58:47.000Z,MPA/Engineering Documents/AS BUILTS - USE THIS...,IMT GAS Asbuilts C-08,application,pdf


Lets look at how many files are in each folder.

Tim's addition to tyler_notes.ipynb 11/28@11:00am
import random_sample
(after df_files is split into type,subtype:)
call df_sample = random_sample(df_files,id,50)
print df_sample

In [None]:
from random_sample import random_sample

df_sample = random_sample(df_files,'pdf',50)
print(df_sample)


                                   id  \
0   1DbPI0NoETR0RWWbY4Wt7UVqEDHdtx1Wh   
1   1bXWFpL_96xYwDbdHNQfFNc8KK6bOYg6s   
2   1alxBVIOpDxrnc6WylvxEJEJCX4q9fRh9   
3   1a1ZByMVQ5VjUOIXlqwcB0s2M-rBBcFyo   
4   1JNhBzg3rxjArDBmd9LXyXSZ3pfYRqrBC   
5   1h1sDz-ACpllSwe8XLee8aK2uvfzGE-r7   
6   1oZ6v4IfPePzWNyjuzNq05grSggahHLio   
7   1pD3s5fppqsYgF3jBiHz0P_PuUw_z9C2b   
8   1bWjJo3oa0fZFSSUagEnAG0m_IwZ_rGQI   
9   1oGqY_3QP-lMhs_Bst72-l0OsCaNTYgqn   
10  19pdvzG8MoW4C5dwOq9jYqoraRwelZR8R   
11  16ORr4IdJvBrRXwsRrUZE5cjDhPD2bi1k   
12  15WbZHuFUhe79g-cm7sQp-0v5EZSH0NMQ   
13  1pZtQnNPhG_j_NGcOk_jK_reTjzhyVNGv   
14  1PmzjoBTV_exHwjOm9kYr9w2mibnW7Nw-   
15  1Py4r13Xb1U8tJytkWMe-1g1dPFbESRlf   
16  1Bjmn7RVrHi5scKpfqYUUuYRAseZvXJ_h   
17  19I6LAVL8jrR6nrBjxJfyuTVhVcjCHLaZ   
18  18LiIoVhDXqM7yTT6fVMOInLuxpMsO5fg   
19  1TRI-GgGQOKSIw_uiyJyPc85SHN6l-jGW   
20  1WEK_RinzBIRxZOSraHZTa2mBtRJjgCcV   
21  1kfhX7Z1G4Nybt1DOEHnSrdP25Bo_NTvo   
22  1xIfoJs6gxUZNLqUdWHnJF23AyESSYpu7   
23  14F_Dl6jvTq8

WIP - sample_corpus(df_sample) method will do the following:
    input:
        df_sample - df output by random_sample()
    output:
        df_corpus
            - in feature vector form
            - columns = terms in corpus
            - rows = documents, IDF of each term


In [None]:
from sample_corpus import sample_corpus

df_corpus = sample_corpus(df_sample)




['1DbPI0NoETR0RWWbY4Wt7UVqEDHdtx1Wh', '1bXWFpL_96xYwDbdHNQfFNc8KK6bOYg6s', '1alxBVIOpDxrnc6WylvxEJEJCX4q9fRh9', '1a1ZByMVQ5VjUOIXlqwcB0s2M-rBBcFyo', '1JNhBzg3rxjArDBmd9LXyXSZ3pfYRqrBC', '1h1sDz-ACpllSwe8XLee8aK2uvfzGE-r7', '1oZ6v4IfPePzWNyjuzNq05grSggahHLio', '1pD3s5fppqsYgF3jBiHz0P_PuUw_z9C2b', '1bWjJo3oa0fZFSSUagEnAG0m_IwZ_rGQI', '1oGqY_3QP-lMhs_Bst72-l0OsCaNTYgqn', '19pdvzG8MoW4C5dwOq9jYqoraRwelZR8R', '16ORr4IdJvBrRXwsRrUZE5cjDhPD2bi1k', '15WbZHuFUhe79g-cm7sQp-0v5EZSH0NMQ', '1pZtQnNPhG_j_NGcOk_jK_reTjzhyVNGv', '1PmzjoBTV_exHwjOm9kYr9w2mibnW7Nw-', '1Py4r13Xb1U8tJytkWMe-1g1dPFbESRlf', '1Bjmn7RVrHi5scKpfqYUUuYRAseZvXJ_h', '19I6LAVL8jrR6nrBjxJfyuTVhVcjCHLaZ', '18LiIoVhDXqM7yTT6fVMOInLuxpMsO5fg', '1TRI-GgGQOKSIw_uiyJyPc85SHN6l-jGW', '1WEK_RinzBIRxZOSraHZTa2mBtRJjgCcV', '1kfhX7Z1G4Nybt1DOEHnSrdP25Bo_NTvo', '1xIfoJs6gxUZNLqUdWHnJF23AyESSYpu7', '14F_Dl6jvTq81pyqd2j6Y1RyLH8T5-M18', '1oJh6PMXLKt04HHVj35VtR1chECBO-2N6', '1uTNZa0wKWcjcsT7GKmv3ZqkNo3A_Rzq8', '1qq7O-CK46XovnmIHZtFDuEYZjwVkwa0E', 