In [50]:
import numpy as np
import pandas as pd
import requests

# Complaint-search endpoint on consumerfinance.gov; the trailing path segment
# is the complaint ID being looked up.
COMPLAINT_URL = 'https://www.consumerfinance.gov/data-research/consumer-complaints/search/api/v1/3398126'

# A timeout keeps the cell from hanging forever if the service never answers.
resp = requests.get(COMPLAINT_URL, timeout=30)
if resp.status_code != 200:
    # The previous error path raised an undefined `ApiError` (so it died with a
    # NameError) and reported a '/tasks/' path that is not the endpoint called.
    # Raise requests' own HTTP error instead and name the real URL.
    raise requests.HTTPError(
        'GET {} returned HTTP {}'.format(COMPLAINT_URL, resp.status_code),
        response=resp)
print(resp.json())

{'hits': {'hits': [{'_score': 1.0, '_type': 'complaint', '_id': '3398126', '_source': {':created_at': 1575703677, 'sub_product': 'Mobile or digital wallet', 'date_sent_to_company': '2019-10-07T12:00:00-05:00', 'date_indexed_formatted': '12/16/19', 'complaint_id': '3398126', 'date_received_formatted': '10/07/19', ':updated_at': 1575703677, 'tags': None, 'state': 'PA', 'date_received': '2019-10-07T12:00:00-05:00', 'consumer_disputed': 'N/A', 'issue': 'Unauthorized transactions or other transaction problem', 'company_response': 'Closed with non-monetary relief', 'date_indexed': '2019-12-16T12:00:00-05:00', 'zip_code': '191XX', 'timely': 'Yes', 'product': 'Money transfer, virtual currency, or money service', 'complaint_what_happened': 'I was using Venmo, a PayPal company, to transfer {$180.00} into my bank account via their instant transfer option on XX/XX/XXXX at XXXX. I have contacted both my bank ( XXXX XXXX XXXX ) and Venmo numerous times. Both saying that the other is responsible for 

In [72]:
# Display the '_source' dict of the first hit: the flat complaint record
# (one key per field) that the following cell turns into a DataFrame.
resp.json()['hits']['hits'][0]['_source']

{':created_at': 1575703677,
 'sub_product': 'Mobile or digital wallet',
 'date_sent_to_company': '2019-10-07T12:00:00-05:00',
 'date_indexed_formatted': '12/16/19',
 'complaint_id': '3398126',
 'date_received_formatted': '10/07/19',
 ':updated_at': 1575703677,
 'tags': None,
 'state': 'PA',
 'date_received': '2019-10-07T12:00:00-05:00',
 'consumer_disputed': 'N/A',
 'issue': 'Unauthorized transactions or other transaction problem',
 'company_response': 'Closed with non-monetary relief',
 'date_indexed': '2019-12-16T12:00:00-05:00',
 'zip_code': '191XX',
 'timely': 'Yes',
 'product': 'Money transfer, virtual currency, or money service',
 'complaint_what_happened': 'I was using Venmo, a PayPal company, to transfer {$180.00} into my bank account via their instant transfer option on XX/XX/XXXX at XXXX. I have contacted both my bank ( XXXX XXXX XXXX ) and Venmo numerous times. Both saying that the other is responsible for the money not being received into my account.',
 'date_sent_to_compan

In [129]:
# Unwrap the response envelope once: the first hit's '_source' is the flat
# complaint record (field name -> value).
complaint_record = resp.json()['hits']['hits'][0]['_source']

# The payload carries the complaint ID as a string; use it as an integer row label.
complaint_id = int(complaint_record['complaint_id'])

# One-row DataFrame: every key of the record becomes a column.
df1 = pd.DataFrame(complaint_record, index=[complaint_id])

# Fields removed before the columns are renamed for the preprocessor.
droplist1 = [
    ':created_at',
    'date_indexed_formatted',
    'complaint_id',
    'date_received_formatted',
    ':updated_at',
    'date_indexed',
    'date_sent_to_company_formatted',
    'has_narrative',
]
df_drop1 = df1.drop(columns=droplist1)

# Column names expected by the preprocessor, listed in the order the remaining
# API fields appear in df_drop1.
# NOTE(review): the rename below is purely positional, so it silently mislabels
# columns if the API ever returns its keys in a different order. The displayed
# record confirms the order of the first twelve names only (the output is
# truncated after that) -- confirm the last five against a full response.
corrected_cols= ['Sub-product',
                 'Date sent to company',
                 'Tags',
                 'State',
                 'Date received',
                 'Consumer disputed?',
                 'Issue',
                 'Company response to consumer',
                 'ZIP code',
                 'Timely response?',
                 'Product',
                 'Consumer complaint narrative',
                 'Company',
                 'Sub-issue',
                 'Consumer consent provided?',
                 'Company public response',
                 'Submitted via']

# Positional rename: raises if the number of columns differs from the list
# length, but cannot detect a changed order.
df_drop1.columns = corrected_cols

# Target column order. Per the original note this was generated with
# list(df.columns.values) in the other notebooks.
reordered_cols= ['Date received',
                 'Product',
                 'Sub-product',
                 'Issue',
                 'Sub-issue',
                 'Consumer complaint narrative',
                 'Company public response',
                 'Company',
                 'State',
                 'ZIP code',
                 'Tags',
                 'Consumer consent provided?',
                 'Submitted via',
                 'Date sent to company',
                 'Company response to consumer',
                 'Timely response?',
                 'Consumer disputed?']
# Select the columns in that order (a KeyError here means a name is missing).
df_reordered = df_drop1[reordered_cols]

# Name the index (the integer complaint ID) 'Complaint ID'.
df_reordered.index.name='Complaint ID'

# dtypes to enforce: four categorical columns and two datetime columns
# ('<M8[ns]' is NumPy's spelling of datetime64[ns]).
dtype_dict = {'Product':"category",
             'Consumer consent provided?': "category",
             'Submitted via': "category",
             'Consumer disputed?': "category",
             'Date received':'<M8[ns]',
             'Date sent to company':'<M8[ns]'}

# Cast; columns not listed in dtype_dict keep their current dtype.
# NOTE(review): the date strings in the response carry a UTC offset
# ('...-05:00'); check how the installed pandas handles that in astype.
df = df_reordered.astype(dtype_dict)

# Transformations carried over from the earlier preprocessing code.
#
# Two compatibility fixes compared with the old version:
#   * Series.str.replace is given regex=True explicitly. Since pandas 2.0 the
#     default is a literal match, which would leave every ZIP code unchanged.
#   * The removed alias np.float is replaced by the builtin float.

# A trailing '-' becomes '5' (midpoint of the ten values the dash stands for).
regexReplaceDash = r"(\d+)(-)$"
df['ZIP code'] = df['ZIP code'].str.replace(regexReplaceDash, r'\g<1>5', regex=True)

# Three digits followed by 'XX' become those digits plus '50' (midpoint of 100).
regex_XX = r'(\d{3})(XX)'
df['ZIP code'] = df['ZIP code'].str.replace(regex_XX, r'\g<1>50', regex=True)

# Any entry that still contains a non-digit character is treated as missing.
regexRemove = r'\D+'
df['ZIP code'] = df['ZIP code'].replace(regexRemove, np.nan, regex=True)

# Convert to float and fill the missing entries with the column mean.
# NOTE(review): with a single-row frame a missing ZIP stays NaN, because the
# mean of an all-missing column is NaN as well.
imputeMean = df['ZIP code'].astype(float).mean()
df['ZIP code'] = df['ZIP code'].astype(float).fillna(imputeMean)

# Encode the Yes/No column as 1.0 / 0.0; any other value maps to NaN.
booleanize = {'Yes': 1, 'No': 0}
df['Timely response?'] = pd.Series(df['Timely response?'].map(booleanize), dtype=float)

#function to apply to column to convert less common results to 'Other', as well as NaN
def convertToOther(value, keepList):
    """Return `value` if it is a kept category, otherwise the string "Other".

    The empty string always maps to "Other", even when it is in `keepList`.
    """
    if not (value == '') and value in keepList:
        return value
    return "Other"
    
#Lists top 23 value counts (allowed to exclude values), turns NaN to '' to others, converts to category dtype
def cleanReduceConvert(df, column, blackList=[], topN=23):
    """Collapse a high-cardinality text column into its most common values.

    The `topN` most frequent values of ``df[column]`` are kept, except values
    whose first word is "other" (case-insensitive), blank values, and anything
    listed in `blackList`. Every other entry, including missing ones, becomes
    the string "Other".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is not modified.
    column : str
        Name of the column to reduce.
    blackList : list, optional
        Values to send to "Other" even if they are frequent. The default list
        is only read, never mutated.
    topN : int, optional
        How many of the most frequent values to consider (default 23).

    Returns
    -------
    pandas.Series
        Series of dtype 'category' with the same index as ``df[column]``.
    """
    keepList = []
    for category in df[column].value_counts().head(topN).index.tolist():
        words = str(category).lower().split()
        # A blank label has no first word; indexing [0] used to raise
        # IndexError here. Blank labels are treated like missing values.
        if words and words[0] != "other":
            keepList.append(category)
    keepList = [category for category in keepList if category not in blackList]

    # Work on a filled copy rather than fillna(inplace=True) on df[column]:
    # the chained in-place call changes the caller's frame only in some pandas
    # versions, and not at all under copy-on-write.
    filled = df[column].fillna('')
    reduced = filled.where(filled.isin(keepList), "Other")
    return reduced.astype('category')

# 'Sub-product' additionally sends the placeholder answer 'I do not know' to "Other".
df['Sub-product'] = cleanReduceConvert(df, 'Sub-product', blackList=['I do not know'])

# The remaining high-cardinality columns use the default settings.
for column_name in ('Issue', 'Sub-issue', 'Company'):
    df[column_name] = cleanReduceConvert(df, column_name)

def entryOrNull(strVal):
    """Return 1.0 if `strVal` holds a value and 0.0 if it is missing.

    Missing means None, NaN, or any other scalar that pandas treats as null.
    The old identity test (``is not np.nan``) matched only the np.nan
    singleton, so a None from the JSON payload counted as a present value.
    """
    return 0.0 if pd.isna(strVal) else 1.0

# New 1.0 / 0.0 feature: whether the complaint has a narrative, as decided
# element by element by entryOrNull.
df['Consumer complaint narrative submitted?'] = df['Consumer complaint narrative'].apply(entryOrNull)

def dtToCols(df, dtcolumn):
    """Add '<dtcolumn> day', '<dtcolumn> month' and '<dtcolumn> year' columns.

    The values come from the `.dt` accessor of ``df[dtcolumn]``, so that column
    must already be datetime-like. `df` is modified in place; nothing is returned.
    """
    for part in ("day", "month", "year"):
        df["{} {}".format(dtcolumn, part)] = getattr(df[dtcolumn].dt, part)
    
# Expand both datetime columns into numeric day / month / year features.
dtToCols(df, "Date received")
dtToCols(df, "Date sent to company")

# Missing consent becomes the explicit category "Not recorded". The category
# is added only if absent, because add_categories raises on a duplicate.
_consent = df["Consumer consent provided?"]
if "Not recorded" not in _consent.cat.categories:
    _consent = _consent.cat.add_categories("Not recorded")
df["Consumer consent provided?"] = _consent.fillna("Not recorded")

# Rows without a company response cannot be used as labelled examples.
df = df.drop(df[df["Company response to consumer"].isna()].index)

# Set aside complaints that are still in progress.
dfInProgress = df[df["Company response to consumer"] == "In progress"]
df = df[df["Company response to consumer"] != "In progress"]

# Set aside untimely responses as well. The .copy() makes `df` an independent
# frame, so the column assignment below does not write to a slice of the
# previous frame (which triggers SettingWithCopyWarning).
dfUntimelyResponse = df[df["Company response to consumer"] == "Untimely response"]
df = df[df["Company response to consumer"] != "Untimely response"].copy()

# Collapse the remaining responses into two classes. A response that is not
# a key of this dict maps to NaN.
twoOutputsDict = {"Closed with explanation":"Closed without relief",
                  "Closed with non-monetary relief":"Closed with relief",
                  "Closed with monetary relief":"Closed with relief",
                  "Closed without relief":"Closed without relief",
                  "Closed":"Closed without relief",
                  "Closed with relief":"Closed with relief"}
df["Company response to consumer"] = df["Company response to consumer"].map(twoOutputsDict)


# Columns not used by the model; the last entry is the target itself.
dropList = ["Consumer complaint narrative",
            "Company public response",
            "State",
            "Tags",
            "Consumer disputed?",
            "Date received",
            "Date sent to company",
            "Company response to consumer"]
X = df.drop(dropList, axis=1)
Y = df["Company response to consumer"]

# NOTE(review): Pipeline, SimpleImputer, StandardScaler, OneHotEncoder and
# ColumnTransformer are not imported in any visible cell -- presumably they come
# from scikit-learn imports executed elsewhere; confirm and move the imports to
# the top of the notebook so it survives Restart & Run All.

# Numeric columns: median imputation followed by standard scaling.
numeric_features = ['ZIP code',
                    'Date received day',
                    'Date received month',
                    'Date received year',
                    'Date sent to company day',
                    'Date sent to company month',
                    'Date sent to company year']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: one-hot encoded; categories unseen during fit are ignored
# at transform time (handle_unknown='ignore').
# Note that 'Timely response?' was converted to 1.0 / 0.0 floats earlier but is
# still one-hot encoded here.
categorical_features = ['Product',
           'Sub-product',
           'Issue',
           'Sub-issue',
           'Company',
           'Consumer consent provided?',
           'Submitted via',
           'Timely response?']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine both transformers; each one is applied to its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit on X, then transform the training and test sets.
# NOTE(review): X_train and X_test are not defined in any visible cell, and the
# recorded ValueError mentions a train/test split (n_samples=1, test_size=0.3)
# that does not appear in this cell, so the saved output looks stale. X here
# holds the single complaint fetched above; fitting scalers/encoders on one row
# is probably not intended -- presumably a preprocessor fitted on the training
# data should be loaded and only .transform(X) applied. Confirm.
preprocessor.fit(X)
encX_train = preprocessor.transform(X_train)
encX_test = preprocessor.transform(X_test)

ValueError: With n_samples=1, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
TODO: adapt the code below so that it saves and loads the fitted models.

In [None]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn.
# Prefer the standalone joblib package (installed as a scikit-learn dependency)
# and fall back to the old location only on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

scaler_filename = "scaler.save"
# NOTE(review): `scaler` is not defined in any visible cell -- confirm which
# fitted object is meant to be persisted here.
joblib.dump(scaler, scaler_filename)

# And now to load...
# joblib files are pickle-based: only load files that you created yourself.
scaler = joblib.load(scaler_filename)