In [1]:
import numpy as np
import pandas as pd
from pandas import ExcelFile
from pandas import ExcelWriter
import requests, zipfile, io
import pyodbc
import sqlalchemy
from sqlalchemy import create_engine, MetaData, Table, Column, delete, insert, select, func, sql
from sqlalchemy.types import SmallInteger, Integer, String, Float, NVARCHAR
from datetime import datetime
import pprint
import urllib
import urllib.request
import copy
import os
import urllib3
urllib3.disable_warnings()

# autobuildicdb NPD - Download and QC Core Photo data

In [2]:
# IC database folder
dbdir = 'C:\ICData\Test3'

# Output folder
outdir = '{}\core_photo_jpgs'.format(dbdir)

# Create outdir folder within IC database folder
if not os.path.exists(outdir):
    os.mkdir(outdir)

## Download Core Photos to file

In [3]:
# Main script outputs three files:
    # wellbore_core_photo_ERRONEOUS_withURL.csv (erroneous 'Core photo title' columns)
    # wellbore_core_photo_withURL.csv (final data with URLs)
    # wellbore_core_photo.csv (final data for IC)

### Core Photos (one folder per well)

In [4]:
# df_core_photo_withURL

df = pd.read_csv('{}\output_data\wellbore_core_photo_withURL.csv'.format(dbdir))
df.head()

Unnamed: 0,Well,Core sample number,Core photo title,Core photo URL,NPDID wellbore,Top depth,Base depth,Unit,Folder,Legend
0,1/2-1,1,"10208,10228,ft",https://factpages.npd.no/pbl/core_photo_jpgs/1...,1382,3111.3984,3117.4944,ft,1_2-1,.\core_photo_jpgs\1_2-1\1382_01_10208-10228ft.jpg
1,1/2-1,2,"19228,10262,ft",https://factpages.npd.no/pbl/core_photo_jpgs/1...,1382,5860.6944,3127.8576,ft,1_2-1,.\core_photo_jpgs\1_2-1\1382_02_19228-10262ft.jpg
2,1/2-1,3,"10262,10277,ft",https://factpages.npd.no/pbl/core_photo_jpgs/1...,1382,3127.8576,3132.4296,ft,1_2-1,.\core_photo_jpgs\1_2-1\1382_03_10262-10277ft.jpg
3,1/2-1,4,"10277,10292,ft",https://factpages.npd.no/pbl/core_photo_jpgs/1...,1382,3132.4296,3137.0016,ft,1_2-1,.\core_photo_jpgs\1_2-1\1382_04_10277-10292ft.jpg
4,1/2-1,5,"10292,10307,ft",https://factpages.npd.no/pbl/core_photo_jpgs/1...,1382,3137.0016,3141.5736,ft,1_2-1,.\core_photo_jpgs\1_2-1\1382_05_10292-10307ft.jpg


In [5]:
# Create subset of wells for which to save Core Photos

filt = (df['Well'].str.startswith('1/3'))
df = df[filt]
df

Unnamed: 0,Well,Core sample number,Core photo title,Core photo URL,NPDID wellbore,Top depth,Base depth,Unit,Folder,Legend
12,1/3-3,1,"4129,4135,m",https://factpages.npd.no/pbl/core_photo_jpgs/8...,87,4129.0,4135.0,m,1_3-3,.\core_photo_jpgs\1_3-3\87_01_4129-4135m.jpg
13,1/3-3,2,"4135,4141,m",https://factpages.npd.no/pbl/core_photo_jpgs/8...,87,4135.0,4141.0,m,1_3-3,.\core_photo_jpgs\1_3-3\87_02_4135-4141m.jpg
14,1/3-3,3,"4141,4148,m",https://factpages.npd.no/pbl/core_photo_jpgs/8...,87,4141.0,4148.0,m,1_3-3,.\core_photo_jpgs\1_3-3\87_03_4141-4148m.jpg
15,1/3-3,4,"4147,4186,m",https://factpages.npd.no/pbl/core_photo_jpgs/8...,87,4147.0,4186.0,m,1_3-3,.\core_photo_jpgs\1_3-3\87_04_4147-4186m.jpg
16,1/3-3,5,"4186,4192,m",https://factpages.npd.no/pbl/core_photo_jpgs/8...,87,4186.0,4192.0,m,1_3-3,.\core_photo_jpgs\1_3-3\87_05_4186-4192m.jpg
...,...,...,...,...,...,...,...,...,...,...
124,1/3-9 S,17,"4353,4358,m",https://factpages.npd.no/pbl/core_photo_jpgs/3...,3362,4353.0,4358.0,m,1_3-9 S,.\core_photo_jpgs\1_3-9 S\3362_17_4353-4358m.jpg
125,1/3-9 S,18,"4358,4363,m",https://factpages.npd.no/pbl/core_photo_jpgs/3...,3362,4358.0,4363.0,m,1_3-9 S,.\core_photo_jpgs\1_3-9 S\3362_18_4358-4363m.jpg
126,1/3-9 S,20,"4368,4373,m",https://factpages.npd.no/pbl/core_photo_jpgs/3...,3362,4368.0,4373.0,m,1_3-9 S,.\core_photo_jpgs\1_3-9 S\3362_20_4368-4373m.jpg
127,1/3-9 S,21,"4373,4378,m",https://factpages.npd.no/pbl/core_photo_jpgs/3...,3362,4373.0,4378.0,m,1_3-9 S,.\core_photo_jpgs\1_3-9 S\3362_21_4373-4378m.jpg


In [6]:
# See https://stackabuse.com/download-files-with-python/
# Using the request Module

# Would also be useful to create folders for each wellbore

def save_core_photo():
    
    for index, row in df.iterrows(): 
        # Using filterered dataframe for speed
        
        url = row['Core photo URL']
        filename = url.split('/')[-1]

        print('Beginning file download with requests: ', url)
        r = requests.get(url, verify=False)
        # See https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
        
        folder_per_well = row['Well'].replace('/', '_')
        
        filedir = outdir + '\\' + folder_per_well
        
        if not os.path.exists(filedir):
            os.mkdir(filedir)
        
        with open('{}\{}'.format(filedir, filename), 'wb') as f:
            f.write(r.content)
        print('Saved to {}\{}'.format(filedir, filename))

        # Retrieve HTTP meta-data
        #print(r.status_code)
        #print(r.headers['content-type'])
        #print(r.encoding)
        
save_core_photo()

Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/87_01_4129-4135m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-3\87_01_4129-4135m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/87_02_4135-4141m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-3\87_02_4135-4141m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/87_03_4141-4148m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-3\87_03_4141-4148m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/87_04_4147-4186m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-3\87_04_4147-4186m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/87_05_4186-4192m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-3\87_05_4186-4192m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/87_06_4192-4198m.jpg
Saved to C:\ICData\Test3

Saved to C:\ICData\Test3\core_photo_jpgs\1_3-7\2505_032_3215-3216m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/2505_033_3216-3217m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-7\2505_033_3216-3217m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/2505_034_3217-3218m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-7\2505_034_3217-3218m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/2505_035_3218-3219m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-7\2505_035_3218-3219m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/2505_036_3219-3220m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-7\2505_036_3219-3220m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/2505_037_3220-3221m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-7\2505_037_3220-3221m.jpg
Beginning file download with 

Saved to C:\ICData\Test3\core_photo_jpgs\1_3-7\2505_117_3244-3245m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/2829_01_5024-5029m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-8\2829_01_5024-5029m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/2829_02_5029-5034m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-8\2829_02_5029-5034m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/2829_03_5034-5039m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-8\2829_03_5034-5039m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/3362_01_4273-4278m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-9 S\3362_01_4273-4278m.jpg
Beginning file download with requests:  https://factpages.npd.no/pbl/core_photo_jpgs/3362_02_4278-4283m.jpg
Saved to C:\ICData\Test3\core_photo_jpgs\1_3-9 S\3362_02_4278-4283m.jpg
Beginning file download with reques

### Erroneous Core Photo Titles

In [8]:
# Import pre-conditioned file with Core Photo URLs

df = df_core_photo_ERRONEOUS_withURL

df = pd.read_csv('{}\output_data\wellbore_core_photo_ERRONEOUS_withURL.csv'.format(dbdir))
df

NameError: name 'df_core_photo_ERRONEOUS_withURL' is not defined

In [None]:
# See https://stackabuse.com/download-files-with-python/
# Using the request Module

# Would also be useful to create folders for each wellbore

def save_core_photo():
    
    for index, row in df.iterrows(): 
        # Using filterered dataframe for speed
        
        url = row['Core photo URL']
        filename = url.split('/')[-1]

        print('Beginning file download with requests: ', url)
        r = requests.get(url, verify=False)
        # See https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
        
        folder_per_well = row['Wellbore'].replace('/', '_')
        
        
        if not os.path.exists(outdir + '\Erroneous titles\\'):
            os.mkdir(outdir + '\Erroneous titles\\')
        
        filedir = outdir + '\Erroneous titles\\' + folder_per_well
        
        if not os.path.exists(filedir):
            os.mkdir(filedir)
        
        with open('{}\{}'.format(filedir, filename), 'wb') as f:
            f.write(r.content)
        print('Saved to {}\{}'.format(filedir, filename))

        # Retrieve HTTP meta-data
        #print(r.status_code)
        #print(r.headers['content-type'])
        #print(r.encoding)
        
save_core_photo()

In [None]:
# 

def save_core_photo():
    
    for index, row in df.iterrows(): 
        # Using filterered dataframe for speed
        
        url = row['Core photo URL']
        filename = url.split('/')[-1]

        print('Beginning file download with requests: ', url)
        r = requests.get(url, verify=False)
        
        with open('{}\{}'.format(outdir, filename), 'wb') as f:
            f.write(r.content)
        print('Saved to: {}\{}'.format(outdir, filename))

        # Retrieve HTTP meta-data
        print(r.status_code)
        print(r.headers['content-type'])
        print(r.encoding)
        
save_core_photo()

## Check for overlap in Core Photo data

In [None]:
# IMPORTANT: This reads the pre-conditioned file "wellbore_core_photo.csv", generated by main script.

df_core_photo  = pd.read_csv('{}\output_data\wellbore_core_photo.csv'.format(dbdir))
print(df_core_photo.shape)
df_core_photo.head()

In [None]:
df_core_photo.isna().sum()

In [None]:
# Note overlapping depths in Core Photos
# For example in well 15/9-1

filt = df_core_photo['Well'] == '15/9-1'
df_core_photo[filt].head()

In [None]:
# Calculate overlap
# Zero overlap for first test in each well, no negative overlaps (representing gaps)
# Iterate over DataFrame rows as (index, Series) pairs.

for index, row in df_core_photo[1:].iterrows():
    
    current_row = df_core_photo.loc[index]
    last_row = df_core_photo.iloc[df_core_photo.index.get_loc(index) - 1]
    
    # Zero overlap for first test in each well
    if current_row['Well'] != last_row['Well']:
        df_core_photo.loc[index, 'Overlap'] = 0
    
    else:
        # Difference between base of last row and top of current row
        if current_row['Top depth'] < last_row['Base depth']:
            df_core_photo.loc[index, 'Overlap'] = last_row['Base depth'] - current_row['Top depth']
        else:
            df_core_photo.loc[index, 'Overlap'] = 0 

df_core_photo_overlap = df_core_photo.round(1)

In [None]:
# Where overlap > 0
df_core_photo_overlap[df_core_photo_overlap['Overlap'] > 0]

In [None]:
# Output to file
file = "output data/calc_corephoto_overlap_and_equal.xlsx"
df_core_photo_overlap.to_excel(file, index=False)
pd.read_excel(file).head(20)

In [None]:
# Count wells with overlap (say, over 10 m)

num_unique = len(df_core_photo_overlap['Well'].unique())

num_unique_problem = len(df_core_photo_overlap['Well'][df_core_photo_overlap['Overlap'] > 10].unique())

print('{} out of {} wells have overlap'.format(num_unique_problem, num_unique))

In [None]:
df_core_photo_overlap['Well'][df_core_photo_overlap['Overlap'] > 10].unique()

In [None]:
# Re-use code to find wells where depth values are same for many records
# i.e. where core photo depths do not increasy incrementally (and maybe share depth of full cored interval)
# e.g well 6507/7-4

filt = df_core_photo_overlap['Well'] == '6507/7-4'
df_core_photo_overlap[filt].head()

In [None]:
df_core_photo_overlap.sort_values('Overlap', ascending=False).head(30)

In [None]:
# 

for index, row in df_core_photo_overlap[1:].iterrows():
    
    current_row = df_core_photo_overlap.loc[index]
    last_row = df_core_photo_overlap.iloc[df_core_photo_overlap.index.get_loc(index) - 1]
    
    # Zero overlap for first test in each well
    if current_row['Well'] != last_row['Well']:
        df_core_photo_overlap.loc[index, 'Equal'] = '0'
    
    else:
        # If top of current row and top of last row are equal, then 1
        if current_row['Top depth'] == last_row['Top depth']:
            df_core_photo_overlap.loc[index, 'Equal'] = '1'
        else:
            df_core_photo_overlap.loc[index, 'Equal'] = '0'

df_core_photo_overlap_and_equal = df_core_photo_overlap.round(1)

In [None]:
# Output to file
file = "output data/calc_corephoto_overlap_and_equal.xlsx"
df_core_photo_overlap_and_equal.to_excel(file, index=False)
pd.read_excel(file).head(20)

In [None]:
filt = df_core_photo_overlap_and_equal['Well'] == '6507/7-4'
df_core_photo_overlap_and_equal[filt].head(10)

In [None]:
# Count number of equal rows per well
# E.g. Well 2/7-20 has 58 rows showing the same depths

test = df_core_photo_overlap_and_equal.groupby(['Well', 'Equal']).count()
test.head(50)

In [None]:
# Output to file
file = "output data/test.xlsx"
test.to_excel(file, index=False)
pd.read_excel(file).head(20)