In [1]:
from pathlib import Path
import sqlalchemy as sa
import pandas as pd

db_path = Path("./eqr.db")
engine = sa.create_engine(f"sqlite:///{db_path}")

## Contracts

In [4]:
with engine.connect() as conn:
    contracts = pd.read_sql_table("contracts", conn)

ValueError: could not convert string to float: 'M'

In [None]:
contracts.info()

In [None]:
contracts.head()

In [18]:
contracts.contract_unique_id.value_counts().describe()

count    20079.000000
mean         8.841576
std        101.556610
min          1.000000
25%          1.000000
50%          2.000000
75%          5.000000
max      10288.000000
Name: contract_unique_id, dtype: float64

In [35]:
contracts.contract_unique_id.value_counts()

C4999      10288
C1          8188
C3          3679
C2          1760
C744         988
           ...  
C40336         1
C40337         1
C40338         1
C40339         1
C767208        1
Name: contract_unique_id, Length: 20079, dtype: int64

In [28]:
pip install -U pandas-profiling[notebook]

Collecting pandas-profiling[notebook]
  Downloading pandas_profiling-3.2.0-py2.py3-none-any.whl (262 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.6/262.6 KB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
[?25hCollecting visions[type_image_path]==0.7.4
  Downloading visions-0.7.4-py3-none-any.whl (102 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting tangled-up-in-unicode==0.2.0
  Downloading tangled_up_in_unicode-0.2.0-py3-none-any.whl (4.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting missingno>=0.4.2
  Downloading missingno-0.5.1-py3-none-any.whl (8.7 kB)
Collecting phik>=0.11.1
  Downloading phik-0.12.2-cp310-cp310-macosx_10_13_x86_64.whl (812 kB)
[2K     [38;2;114;156;31m━━━━

In [30]:
from pandas_profiling import ProfileReport

df = contracts.query("contract_unique_id == 'C4999'")

profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [48]:
unique_id_counts = contracts.contract_unique_id.value_counts()
unique_id_counts[unique_id_counts ==  2]

C10107        2
C6068         2
C10163        2
C10100        2
C10162        2
             ..
C200118183    2
C200115476    2
C200115482    2
C200115477    2
C200121240    2
Name: contract_unique_id, Length: 3024, dtype: int64

In [52]:
df = contracts.query("contract_unique_id == 'C6068'")
df

Unnamed: 0,contract_unique_id,seller_company_name,seller_history_name,customer_company_name,contract_affiliate,ferc_tariff_reference,contract_service_agreement_id,contract_execution_date,commencement_date_of_contract_term,contract_termination_date,...,rate_minimum,rate_maximum,rate_description,rate_units,point_of_receipt_balancing_authority,point_of_receipt_specific_location,point_of_delivery_balancing_authority,point_of_delivery_specific_location,begin_date,end_date
21823,C6068,California Independent System Operator Corpora...,,"20SD 8me, LLC",N,"FERC Electric Tariff, Fifth Replacement",4742-GISPA,20180629,20180629,,...,0.0,0.0,No Cost,,,,,,,
34245,C6068,"Midcontinent Independent System Operator, Inc.",,"Flat Fork Solar, LLC",N,FERC Electric Tariff Fifth Revised Volume No. 1,3429-MPFCA-Batesville,20200224,20200224,,...,,,No Rates in this Contract,,,,,,,


##  Each Column's  uniqueness

In [61]:
(~contracts.isna()).sum()

contract_unique_id                       177530
seller_company_name                      177530
seller_history_name                           0
customer_company_name                    177530
contract_affiliate                       177530
ferc_tariff_reference                    177523
contract_service_agreement_id            177510
contract_execution_date                  177530
commencement_date_of_contract_term       177530
contract_termination_date                 65348
actual_termination_date                    3766
extension_provision_description          164675
class_name                               103587
term_name                                155677
increment_name                            94498
increment_peaking_name                    70393
product_type_name                        177530
product_name                             177530
quantity                                  74644
units                                     97161
rate                                    

In [64]:
(contracts.nunique() / (~contracts.isna()).sum()).sort_values(ascending=False)

contract_service_agreement_id            0.250493
end_date                                 0.171085
rate_description                         0.163154
rate                                     0.131964
begin_date                               0.131224
contract_unique_id                       0.113102
customer_company_name                    0.081750
actual_termination_date                  0.075412
point_of_delivery_specific_location      0.067488
contract_termination_date                0.060721
point_of_receipt_specific_location       0.054249
quantity                                 0.046260
contract_execution_date                  0.043103
commencement_date_of_contract_term       0.040624
ferc_tariff_reference                    0.021699
extension_provision_description          0.014501
rate_maximum                             0.013378
seller_company_name                      0.012899
rate_minimum                             0.009402
point_of_receipt_balancing_authority     0.002001


## Identities

In [83]:
with engine.connect() as conn:
    identities = pd.read_sql_table("identities", conn)

In [84]:
identities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5617 entries, 0 to 5616
Data columns (total 14 columns):
 #   Column                                           Non-Null Count  Dtype 
---  ------                                           --------------  ----- 
 0   filer_unique_id                                  5617 non-null   object
 1   company_name                                     5617 non-null   object
 2   company_identifier                               5617 non-null   object
 3   contact_name                                     5617 non-null   object
 4   contact_title                                    5617 non-null   object
 5   contact_address                                  5617 non-null   object
 6   contact_city                                     5617 non-null   object
 7   contact_state                                    5617 non-null   object
 8   contact_zip                                      5617 non-null   object
 9   contact_country_name                     

In [85]:
(identities.company_identifier + identities.filer_unique_id).value_counts().describe()

count    5537.000000
mean        1.014448
std         0.745997
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max        53.000000
dtype: float64

In [86]:
identities.company_identifier.value_counts()

D004308    53
D003672    20
C001801     6
C002062     4
C004572     4
           ..
C010180     1
C002315     1
C002314     1
C004935     1
C001742     1
Name: company_identifier, Length: 2759, dtype: int64

In [87]:
identities.query("company_identifier == 'D004308'").drop_duplicates()

Unnamed: 0,filer_unique_id,company_name,company_identifier,contact_name,contact_title,contact_address,contact_city,contact_state,contact_zip,contact_country_name,contact_phone,contact_email,transactions_reported_to_index_price_publishers,filing_quarter
682,FA1,Customized Energy Solutions,D004308,Aaron R Jacobs-Smith,Associate Counsel,1221 2nd Street,Santa Monica,CA,90401,US,(424) 272-1998,regulatory@helloinspire.com,N,202003
1083,FA1,Customized Energy Solutions,D004308,Harry Dessender,Senior Consultant,"1528 Walnut Street, 22nd Floor",Philadelphia,PA,19102,US,267-331-4392,hdessender@ces-ltd.com,N,202003
3469,FA1,Customized Energy Solutions,D004308,Caleb Gaddes,Consultant,"1528 Walnut Street, 22nd Floor",Philadelphia,PA,19102,US,215-964-6237,cgaddes@ces-ltd.com,N,202003


In [88]:
identities.filer_unique_id.value_counts()

FA1    2753
FS1    2753
FS2     103
FA2       4
FS3       3
FA3       1
Name: filer_unique_id, dtype: int64

In [89]:
company_identifier_with_multiple_codes = identities.query("filer_unique_id == 'FS2' | filer_unique_id == 'FS3'").company_identifier

In [90]:
company_identifier_with_multiple_codes = identities.loc[identities.company_identifier.isin(company_identifier_with_multiple_codes)]

In [93]:
company_identifier_with_multiple_codes.company_identifier.value_counts()

C001801    6
C004572    4
C002062    4
C007658    3
C010095    3
          ..
C007615    3
C002467    3
C005299    3
C000319    3
C000315    3
Name: company_identifier, Length: 103, dtype: int64

In [99]:
company_identifier_with_multiple_codes.query("company_identifier == 'C001801'")

Unnamed: 0,filer_unique_id,company_name,company_identifier,contact_name,contact_title,contact_address,contact_city,contact_state,contact_zip,contact_country_name,contact_phone,contact_email,transactions_reported_to_index_price_publishers,filing_quarter
2804,FS1,"Merrill Lynch Commodities, Inc.",C001801,Suzette Guerrero,Vice President,20 East Greenway Plaza,Houston,TX,77046,US,8326815699,suzette.guerrero@baml.com,N,202003
2805,FA1,"Merrill Lynch Commodities, Inc.",C001801,Suzette Guerrero,Vice President,20 East Greenway Plaza,Houston,TX,77046,US,8326815699,suzette.guerrero@baml.com,N,202003
2806,FS2,"Merrill Lynch Commodities, Inc.",C001801,Lindsay Thomas,AVP,20 East Greenway Plaza,Houston,TX,77046,US,8326815333,lthomas6@bofa.com,N,202003
2807,FA2,"Merrill Lynch Commodities, Inc.",C001801,Lindsay Thomas,AVP,20 East Greenway Plaza,Houston,TX,77046,US,8326815333,lthomas6@bofa.com,N,202003
2808,FS3,"Merrill Lynch Commodities, Inc.",C001801,LaKeisha Rowe,AVP,20 East Greenway Plaza,Houston,TX,77046,US,8326815978,lakeisha.d.rowe@bofa.com,N,202003
2809,FA3,"Merrill Lynch Commodities, Inc.",C001801,LaKeisha Rowe,AVP,20 East Greenway Plaza,Houston,TX,77046,US,8326815978,lakeisha.d.rowe@bofa.com,N,202003


In [102]:
other_cols = list(filter(lambda x: x != "filer_unique_id", company_identifier_with_multiple_codes.columns))
company_identifier_with_multiple_codes.query("company_identifier == 'C001801'").drop_duplicates(subset=other_cols)

Unnamed: 0,filer_unique_id,company_name,company_identifier,contact_name,contact_title,contact_address,contact_city,contact_state,contact_zip,contact_country_name,contact_phone,contact_email,transactions_reported_to_index_price_publishers,filing_quarter
2804,FS1,"Merrill Lynch Commodities, Inc.",C001801,Suzette Guerrero,Vice President,20 East Greenway Plaza,Houston,TX,77046,US,8326815699,suzette.guerrero@baml.com,N,202003
2806,FS2,"Merrill Lynch Commodities, Inc.",C001801,Lindsay Thomas,AVP,20 East Greenway Plaza,Houston,TX,77046,US,8326815333,lthomas6@bofa.com,N,202003
2808,FS3,"Merrill Lynch Commodities, Inc.",C001801,LaKeisha Rowe,AVP,20 East Greenway Plaza,Houston,TX,77046,US,8326815978,lakeisha.d.rowe@bofa.com,N,202003
