# Imports

In [1]:
import os
import pandas as pd
from google.cloud import bigquery

In [2]:
# Register the service-account key file (path relative to this notebook) under
# the environment variable that holds the Google application credentials.
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': "../documents/key.json"})
# BigQuery client used by every API call in the cells below.
client = bigquery.Client()

# Loading Data

In [3]:
# Build references to the public CFPB complaints dataset and table.
# `Client.dataset()` is deprecated, so the dataset reference is constructed
# directly; DatasetReference takes (project, dataset_id).
ds_ref = bigquery.DatasetReference('bigquery-public-data', 'cfpb_complaints')
df_ref = ds_ref.table('complaint_database')
# API request: fetch the table metadata. Note that `df` is a BigQuery table
# object (not a pandas DataFrame); the name is kept because later cells use it.
df = client.get_table(df_ref)
# Display the column definitions (name, type, mode, description).
df.schema

[SchemaField('date_received', 'DATE', 'NULLABLE', 'Date the complaint was received by the CPFB', (), None),
 SchemaField('product', 'STRING', 'NULLABLE', 'The type of product the consumer identified in the complaint', (), None),
 SchemaField('subproduct', 'STRING', 'NULLABLE', 'The type of sub-product the consumer identified in the complaint', (), None),
 SchemaField('issue', 'STRING', 'NULLABLE', 'The issue the consumer identified in the complaint', (), None),
 SchemaField('subissue', 'STRING', 'NULLABLE', 'The sub-issue the consumer identified in the complaint', (), None),
 SchemaField('consumer_complaint_narrative', 'STRING', 'NULLABLE', 'A description of the complaint provided by the consumer', (), None),
 SchemaField('company_public_response', 'STRING', 'NULLABLE', "The company's optional, public-facing response to a consumer's complaint", (), None),
 SchemaField('company_name', 'STRING', 'NULLABLE', 'Name of the company identified in the complaint by the consumer', (), None),
 Sc

In [4]:
# Preview the first five rows of the table as a DataFrame.
# `max_results` forces the REST endpoint, so explicitly opt out of creating a
# BigQuery Storage client; this avoids the UserWarning emitted otherwise.
client.list_rows(df, max_results=5).to_dataframe(create_bqstorage_client=False)

  if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client):


Unnamed: 0,date_received,product,subproduct,issue,subissue,consumer_complaint_narrative,company_public_response,company_name,state,zip_code,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed,complaint_id
0,2021-03-10,Credit card or prepaid card,Government benefit card,Advertising,Confusing or misleading advertising about the ...,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",VA,236XX,,,Web,2021-03-10,In progress,True,,4202423
1,2021-01-28,Credit card or prepaid card,General-purpose prepaid card,Advertising,Confusing or misleading advertising about the ...,,,"Populus Financial Group, Inc.",MI,,,,Web,2021-01-28,Closed with explanation,True,,4099910
2,2021-01-31,Credit card or prepaid card,General-purpose prepaid card,Advertising,Changes in terms from what was offered or adve...,,,AMERICAN EXPRESS COMPANY,NY,11803,,,Web,2021-01-31,Closed with monetary relief,True,,4106307
3,2021-02-21,Credit card or prepaid card,General-purpose prepaid card,Advertising,Confusing or misleading advertising about the ...,,,CARD Corporation,CA,932XX,,,Web,2021-02-21,Closed with explanation,True,,4155271
4,2021-01-26,"Money transfer, virtual currency, or money ser...",Virtual currency,Fraud or scam,,,,"Coinbase, Inc.",,,,,Web,2021-01-26,Closed with explanation,True,,4096429


In [35]:
# SQL for the working dataset: ten columns from the public CFPB complaints
# table, ordered from the most recent `date_received` to the oldest.
# NOTE(review): the WHERE clause joins its two conditions with OR, so rows whose
# response is 'In progress' are kept even when received before 2018 — confirm
# this is intended rather than AND.
query = """
        SELECT date_received, product, subproduct, issue, company_name, 
        state, date_sent_to_company, company_response_to_consumer, timely_response, 
        complaint_id
        FROM `bigquery-public-data.cfpb_complaints.complaint_database`
        WHERE EXTRACT(YEAR FROM date_received) >= 2018 OR company_response_to_consumer = 'In progress'
        ORDER BY date_received DESC
        """

In [36]:
# Cost guard: cap the bytes a query run with this config may bill
# at 10**10 (ten billion) bytes.
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 10_000_000_000

In [37]:
# Submit the SQL with the byte-capped job configuration.
query_job = client.query(query=query, job_config=safe_config)
# API request: download the query result into a pandas DataFrame.
raw_df = query_job.to_dataframe()
# Show the first five rows as a sanity check.
raw_df.head(n=5)

Unnamed: 0,date_received,product,subproduct,issue,company_name,state,date_sent_to_company,company_response_to_consumer,timely_response,complaint_id
0,2021-03-20,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",PA,2021-03-20,In progress,True,4231432
1,2021-03-20,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",IL,2021-03-20,In progress,True,4232518
2,2021-03-20,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",WI,2021-03-20,In progress,True,4231457
3,2021-03-20,Debt collection,Auto debt,False statements or representation,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",UT,2021-03-20,In progress,True,4232490
4,2021-03-20,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",PA,2021-03-20,In progress,True,4231241


In [38]:
# Keep an independent snapshot of the freshly downloaded data.
# A plain assignment would only alias `raw_df`, so the in-place column
# conversions performed on `raw_df` later would also change the "backup";
# `.copy()` gives the backup its own data.
backup_raw_df = raw_df.copy()

# Data Preparation

In [29]:
# Summary statistics for every column (object, bool and datetime alike).
# `include='all'` is explicit on purpose: a bare `describe()` only shows all
# columns when the frame has no numeric columns, and newer pandas versions
# treat datetime columns as numeric, which would hide the other columns.
raw_df.describe(include='all')

  raw_df.describe()
  raw_df.describe()


Unnamed: 0,date_received,product,subproduct,issue,company_name,state,date_sent_to_company,company_response_to_consumer,timely_response,complaint_id
count,1999428,1999428,1764263,1999428,1999428,1962520,1999428,1999428,1999428,1999428.0
unique,3397,18,76,165,5929,63,3346,7,2,1999428.0
top,2017-09-08 00:00:00,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,"EQUIFAX, INC.",CA,2017-09-08 00:00:00,Closed with explanation,True,3011770.0
freq,3553,658719,650334,428250,243174,261927,3387,1630782,1959158,1.0
first,2011-12-01 00:00:00,,,,,,2011-12-01 00:00:00,,,
last,2021-03-20 00:00:00,,,,,,2021-03-20 00:00:00,,,


In [30]:
# Show the dtype pandas assigned to each column of raw_df.
raw_df.dtypes

date_received                   datetime64[ns]
product                                 object
subproduct                              object
issue                                   object
company_name                            object
state                                   object
date_sent_to_company            datetime64[ns]
company_response_to_consumer            object
timely_response                           bool
complaint_id                            object
dtype: object

In [31]:
# Columns that hold calendar dates.
dates = ['date_received', 'date_sent_to_company']
# Parse each listed column as a YYYY-MM-DD datetime and write the converted
# columns back into raw_df in place.
raw_df[dates] = raw_df[dates].apply(pd.to_datetime, format='%Y-%m-%d')

# Descriptive Statistics

In [None]:
# Frequencies
# Proportions
# Marginals - Cross tabulation
# 

# Graphs and Insights