# SQL - API Request and Write data

## Imports

In [2]:
import os
import pandas as pd
from google.cloud import bigquery
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import seaborn as sns
import numpy as np

In [3]:
# set credentials
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "../documents/key.json"
# creates a client
client = bigquery.Client()

## Loading Data

In [4]:
# References
ds_ref = client.dataset('cfpb_complaints', project='bigquery-public-data')
df_ref = ds_ref.table('complaint_database')
# API - request Fetch the table
df = client.get_table(df_ref)
df.schema

[SchemaField('date_received', 'DATE', 'NULLABLE', 'Date the complaint was received by the CPFB', (), None),
 SchemaField('product', 'STRING', 'NULLABLE', 'The type of product the consumer identified in the complaint', (), None),
 SchemaField('subproduct', 'STRING', 'NULLABLE', 'The type of sub-product the consumer identified in the complaint', (), None),
 SchemaField('issue', 'STRING', 'NULLABLE', 'The issue the consumer identified in the complaint', (), None),
 SchemaField('subissue', 'STRING', 'NULLABLE', 'The sub-issue the consumer identified in the complaint', (), None),
 SchemaField('consumer_complaint_narrative', 'STRING', 'NULLABLE', 'A description of the complaint provided by the consumer', (), None),
 SchemaField('company_public_response', 'STRING', 'NULLABLE', "The company's optional, public-facing response to a consumer's complaint", (), None),
 SchemaField('company_name', 'STRING', 'NULLABLE', 'Name of the company identified in the complaint by the consumer', (), None),
 Sc

In [5]:
client.list_rows(df, max_results=5).to_dataframe()

  if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client):


Unnamed: 0,date_received,product,subproduct,issue,subissue,consumer_complaint_narrative,company_public_response,company_name,state,zip_code,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed,complaint_id
0,2017-03-26,Credit card,,Late fee,,,,Alliance Data Card Services,IL,60074,,,Web,2017-03-26,Closed with explanation,True,False,2404586
1,2021-01-21,Credit card or prepaid card,General-purpose prepaid card,Advertising,Confusing or misleading advertising about the ...,,,NETSPEND CORPORATION,NM,871XX,,,Web,2021-02-05,In progress,True,,4083912
2,2021-02-06,"Money transfer, virtual currency, or money ser...",Domestic (US) money transfer,Fraud or scam,,,,TD BANK US HOLDING COMPANY,MN,55449,Older American,,Web,2021-02-06,Closed with explanation,True,,4120827
3,2021-02-26,"Money transfer, virtual currency, or money ser...",International money transfer,Fraud or scam,,,,JPMORGAN CHASE & CO.,TX,770XX,,,Web,2021-03-01,In progress,True,,4169537
4,2021-03-09,"Money transfer, virtual currency, or money ser...",Domestic (US) money transfer,Fraud or scam,,,,TD BANK US HOLDING COMPANY,NY,,,,Web,2021-03-09,Closed with explanation,True,,4195384


In [6]:
query = """
        SELECT date_received, product, subproduct, issue, company_name, 
            company_response_to_consumer, timely_response, complaint_id
        FROM `bigquery-public-data.cfpb_complaints.complaint_database`
        WHERE EXTRACT(YEAR FROM date_received) >= 2018 OR company_response_to_consumer = 'In progress'
        ORDER BY date_received DESC
        """

In [7]:
# setting limits
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)

In [8]:
query_job = client.query(query, job_config=safe_config)
# API request
raw_df = query_job.to_dataframe()
raw_df.head()

Unnamed: 0,date_received,product,subproduct,issue,company_name,company_response_to_consumer,timely_response,complaint_id
0,2021-03-27,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",In progress,True,4251013
1,2021-03-27,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",In progress,True,4251000
2,2021-03-27,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",In progress,True,4250942
3,2021-03-27,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",In progress,True,4250975
4,2021-03-27,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",In progress,True,4251186


## Data Preparation

In [9]:
raw_df.describe()

Unnamed: 0,date_received,product,subproduct,issue,company_name,company_response_to_consumer,timely_response,complaint_id
count,1083403,1083403,1083403,1083403,1083403,1083401,1083403,1083403
unique,1182,9,48,81,4675,5,2,1083403
top,2021-01-05,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",Closed with explanation,True,3053145
freq,2309,599283,592330,398462,178011,918100,1069006,1


In [10]:
raw_df.dtypes

date_received                   object
product                         object
subproduct                      object
issue                           object
company_name                    object
company_response_to_consumer    object
timely_response                   bool
complaint_id                    object
dtype: object

In [12]:
# adjusting the date format
dates = ['date_received']
for date in dates:
    raw_df[date] = pd.to_datetime(raw_df[date], format='%Y-%m-%d')

In [13]:
raw_df.set_index('date_received', inplace=True)

In [14]:
raw_df.head()

Unnamed: 0_level_0,product,subproduct,issue,company_name,company_response_to_consumer,timely_response,complaint_id
date_received,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-03-27,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",In progress,True,4251013
2021-03-27,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",In progress,True,4251000
2021-03-27,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",In progress,True,4250942
2021-03-27,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",In progress,True,4250975
2021-03-27,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",In progress,True,4251186


In [15]:
raw_df.company_response_to_consumer.unique()

array(['In progress', 'Closed with explanation',
       'Closed with non-monetary relief', 'Closed with monetary relief',
       'Untimely response', None], dtype=object)

## Filter data

### 1 - Which companies have the most complaints?

In [16]:
comp_most_complaints = raw_df[['company_name', 'complaint_id']] \
.groupby('company_name').count().sort_values(by='complaint_id', ascending=False).head(20).index.to_list()

### 2 - Which companies have the most complaints this year?

In [17]:
comp_most_complaints_2021 = raw_df.loc['2021' , ['company_name', 'complaint_id']] \
.groupby('company_name').count().sort_values(by='complaint_id', ascending=False).head(20).index.to_list()

### 3 - Which companies have the most complaints 'in progress' or 'Untimely response'?

In [18]:
comp_most_complaints_unsolved = raw_df.loc[raw_df.company_response_to_consumer \
                                           .isin(['In progress', 'Untimely response']),
                                           ['company_name', 'complaint_id']] \
.groupby('company_name').count().sort_values(by='complaint_id', ascending=False).head(20).index.to_list()

### 4 - What are the most recurring issues per company? 

In [19]:
comp_recurring_issue = raw_df[['company_name', 'issue', 'complaint_id']] \
.groupby(['issue', 'company_name']).count().sort_values(by='complaint_id', 
                                                        ascending=False).head(20).index \
.get_level_values('company_name').to_list()

### Initial list of potential customers

In [21]:
potential_customers = list(set(comp_most_complaints) | 
                        set(comp_most_complaints_2021) |
                        set(comp_most_complaints_unsolved)|
                          set (comp_recurring_issue))
len(potential_customers)

31

In [22]:
potential_customers

['AES/PHEAA',
 'Mobiloans, LLC',
 'PORTFOLIO RECOVERY ASSOCIATES INC',
 'SYNCHRONY FINANCIAL',
 'ENCORE CAPITAL GROUP INC.',
 'Premier\xa0Student\xa0Loan\xa0Center',
 'JPMORGAN CHASE & CO.',
 'CAPITAL ONE FINANCIAL CORPORATION',
 'AMERICAN EXPRESS COMPANY',
 'PNC Bank N.A.',
 'Coinbase, Inc.',
 'Ameritech Financial',
 'Experian Information Solutions Inc.',
 'Santander Consumer USA Holdings Inc.',
 'Alliance Data Card Services',
 'ALLY FINANCIAL INC.',
 'CITIBANK, N.A.',
 'CITIZENS FINANCIAL GROUP, INC.',
 'U.S. BANCORP',
 'DISCOVER BANK',
 'TD BANK US HOLDING COMPANY',
 'WELLS FARGO & COMPANY',
 'EQUIFAX, INC.',
 'Colony Brands, Inc.',
 'Chime Financial Inc',
 'BARCLAYS BANK DELAWARE',
 'Navient Solutions, LLC.',
 'Paypal Holdings, Inc',
 'TRANSUNION INTERMEDIATE HOLDINGS, INC.',
 'NAVY FEDERAL CREDIT UNION',
 'BANK OF AMERICA, NATIONAL ASSOCIATION']

**The first step was to access large numbers. But there are few insights and that list is still large (31 companies), so the next step is to analyse especifically those companies.**

## Saving data

Saves shortlist data for further analysis

In [23]:
shortlist_1 = raw_df[raw_df.company_name.isin(potential_customers)]
compression_opts = dict(method='zip', archive_name='shortlist.csv') 
shortlist_1.to_csv('../raw_data/shortlist.zip', compression=compression_opts)

In [4]:
# Complaints by company per year
df_1 = shortlist_1[['company_name']].groupby(['company_name']).resample('Y').count().unstack(level=1)
data_1 = df_1.sort_values(by=('company_name', '2021-12-31'), ascending=False).values
index_1 = df_1.sort_values(by=('company_name', '2021-12-31'), ascending=False).index

# CSV (streamlit)
pd.DataFrame(data=data_1, index=index_1, columns=[2018, 2019, 2020, 2021]).to_csv('../tidy_data/fc_per_y.zip',
                                                                                 compression=compression_opts)