In [2]:
import numpy as np
import pandas as pd
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')
import math
from sklearn.model_selection import train_test_split,KFold

In [3]:
train_file = "Consumer_Complaints_train.csv"
test_file  = "Consumer_Complaints_test.csv"

In [4]:
train=pd.read_csv(train_file)
test=pd.read_csv(test_file)

In [5]:
for col in ['Date received','Date sent to company']:
    train[col]=pd.to_datetime(train[col],infer_datetime_format=True)
    test[col]=pd.to_datetime(test[col],infer_datetime_format=True)

In [6]:
train['day_diff']=pd.to_numeric(train['Date sent to company']-train['Date received'])
test['day_diff']=pd.to_numeric(test['Date sent to company']-test['Date received'])

In [7]:
for col in ['Date received','Date sent to company']:
    train.drop([col],1,inplace=True)
    test.drop([col],1,inplace=True)

In [8]:
train['day_diff'].describe()

count    4.784210e+05
mean     3.847541e+14
std      1.435619e+15
min     -8.640000e+13
25%      0.000000e+00
50%      8.640000e+13
75%      3.456000e+14
max      8.579520e+16
Name: day_diff, dtype: float64

In [9]:
for col in train.select_dtypes(['object']).columns:
    print(col,':',train[col].nunique())

Product : 12
Sub-product : 47
Issue : 95
Sub-issue : 68
Consumer complaint narrative : 74019
Company public response : 10
Company : 3276
State : 62
ZIP code : 25962
Tags : 3
Consumer consent provided? : 4
Submitted via : 6
Company response to consumer : 7
Timely response? : 2
Consumer disputed? : 2


In [10]:
train.isnull().sum()

Product                              0
Sub-product                     138473
Issue                                0
Sub-issue                       292625
Consumer complaint narrative    403327
Company public response         388029
Company                              0
State                             3839
ZIP code                          3848
Tags                            411215
Consumer consent provided?      342934
Submitted via                        0
Company response to consumer         0
Timely response?                     0
Consumer disputed?                   0
Complaint ID                         0
day_diff                             0
dtype: int64

In [11]:
for col in ['Sub-product','Sub-issue','Consumer complaint narrative',
            'Company public response','Tags','Consumer consent provided?']:
    varname=col.replace('-','_').replace('?','').replace(" ",'_')+'_isNan'
    train[varname]=np.where(pd.isnull(train[col]),1,0)
    train.drop([col],1,inplace=True)
    test[varname]=np.where(pd.isnull(test[col]),1,0)
    test.drop([col],1,inplace=True)

In [12]:
train.head()

Unnamed: 0,Product,Issue,Company,State,ZIP code,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,day_diff,Sub_product_isNan,Sub_issue_isNan,Consumer_complaint_narrative_isNan,Company_public_response_isNan,Tags_isNan,Consumer_consent_provided_isNan
0,Credit card,Billing statement,Wells Fargo & Company,MI,48342,Web,Closed with explanation,Yes,No,856103,86400000000000,1,1,1,1,0,1
1,Bank account or service,"Making/receiving payments, sending money",Santander Bank US,PA,18042,Referral,Closed,Yes,No,1034666,518400000000000,0,1,1,1,1,1
2,Credit reporting,Incorrect information on credit report,Equifax,CA,92427,Referral,Closed with non-monetary relief,Yes,No,756363,1814400000000000,1,0,1,1,1,1
3,Credit card,Billing statement,U.S. Bancorp,GA,305XX,Web,Closed with monetary relief,Yes,No,1474177,0,1,1,0,0,0,0
4,Credit card,Transaction issue,Bank of America,MA,02127,Web,Closed with explanation,Yes,No,1132572,691200000000000,1,1,1,1,1,1


In [13]:
test.head()

Unnamed: 0,Product,Issue,Company,State,ZIP code,Submitted via,Company response to consumer,Timely response?,Complaint ID,day_diff,Sub_product_isNan,Sub_issue_isNan,Consumer_complaint_narrative_isNan,Company_public_response_isNan,Tags_isNan,Consumer_consent_provided_isNan
0,Bank account or service,Deposits and withdrawals,Bank of America,CA,95691,Web,Closed with explanation,Yes,675956,-86400000000000,0,1,1,1,1,1
1,Debt collection,Cont'd attempts collect debt not owed,"National Credit Adjusters, LLC",FL,32086,Web,Closed with explanation,Yes,1858795,0,0,0,1,1,1,0
2,Mortgage,"Loan servicing, payments, escrow account",Wells Fargo & Company,CA,94618,Web,Closed without relief,Yes,32637,86400000000000,0,1,1,1,1,1
3,Credit reporting,Unable to get credit report/credit score,"TransUnion Intermediate Holdings, Inc.",FL,33584,Postal mail,Closed with non-monetary relief,Yes,1731374,432000000000000,1,0,1,0,0,1
4,Mortgage,"Loan modification,collection,foreclosure",Bank of America,FL,33543,Web,Closed with explanation,Yes,501487,0,0,1,1,1,1,1


In [14]:
for col in train.select_dtypes(['object']).columns:
    print(col,':',train[col].nunique())

Product : 12
Issue : 95
Company : 3276
State : 62
ZIP code : 25962
Submitted via : 6
Company response to consumer : 7
Timely response? : 2
Consumer disputed? : 2


In [15]:
for col in ['ZIP code','Company']:
    train.drop([col],1,inplace=True)
    test.drop([col],1,inplace=True)

In [16]:
train['Consumer disputed?']=np.where(train['Consumer disputed?']=="Yes",1,0)

In [17]:
train['Timely response?']=np.where(train['Timely response?']=="Yes",1,0)

In [18]:
test['Timely response?']=np.where(test['Timely response?']=="Yes",1,0)

In [19]:
k=train['Issue'].value_counts()
k

Loan modification,collection,foreclosure    80302
Incorrect information on credit report      58527
Loan servicing, payments, escrow account    51403
Cont'd attempts collect debt not owed       36367
Account opening, closing, or management     23568
Disclosure verification of debt             16235
Communication tactics                       15312
Deposits and withdrawals                    14721
Application, originator, mortgage broker    11201
Billing disputes                             9600
Credit reporting company's investigation     9492
Other                                        9442
Managing the loan or lease                   8905
Problems caused by my funds being low        7758
False statements or representation           7074
Unable to get credit report/credit score     7060
Dealing with my lender or servicer           6460
Improper contact or sharing of info          6182
Problems when you are unable to pay          5921
Settlement process and costs                 5834


In [20]:
k=train['Issue'].value_counts()
for val in k.axes[0][0:10]:
    varname='Issue_'+val.replace(',','_').replace(' ','_')
    train[varname]=np.where(train['Issue']==val,1,0)
    test[varname]=np.where(test['Issue']==val,1,0)
del train['Issue']
del test['Issue']

In [21]:
train.head()

Unnamed: 0,Product,State,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,day_diff,Sub_product_isNan,Sub_issue_isNan,...,Issue_Loan_modification_collection_foreclosure,Issue_Incorrect_information_on_credit_report,Issue_Loan_servicing__payments__escrow_account,Issue_Cont'd_attempts_collect_debt_not_owed,Issue_Account_opening__closing__or_management,Issue_Disclosure_verification_of_debt,Issue_Communication_tactics,Issue_Deposits_and_withdrawals,Issue_Application__originator__mortgage_broker,Issue_Billing_disputes
0,Credit card,MI,Web,Closed with explanation,1,0,856103,86400000000000,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Bank account or service,PA,Referral,Closed,1,0,1034666,518400000000000,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Credit reporting,CA,Referral,Closed with non-monetary relief,1,0,756363,1814400000000000,1,0,...,0,1,0,0,0,0,0,0,0,0
3,Credit card,GA,Web,Closed with monetary relief,1,0,1474177,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,Credit card,MA,Web,Closed with explanation,1,0,1132572,691200000000000,1,1,...,0,0,0,0,0,0,0,0,0,0


In [22]:
for col in ["Timely response?"]:
    train[col]=pd.to_numeric(train[col],errors="coerce")

In [23]:
train["Company response to consumer"]=[x.replace("Closed with explanation","1") for x in train["Company response to consumer"]]
train["Company response to consumer"]=[x.replace("Closed with non-monetary relief","2") for x in train["Company response to consumer"]]
train["Company response to consumer"]=[x.replace("Closed with monetary relief","3") for x in train["Company response to consumer"]]
train["Company response to consumer"]=[x.replace("Closed without relief","4") for x in train["Company response to consumer"]]
train["Company response to consumer"]=[x.replace("Closed with relief","6") for x in train["Company response to consumer"]] 
train["Company response to consumer"]=[x.replace("Closed","5") for x in train["Company response to consumer"]] 

In [24]:
train['Company response to consumer'].value_counts()

1                    354310
2                     61491
3                     32925
4                     14145
5                     11365
6                      4184
Untimely response         1
Name: Company response to consumer, dtype: int64

In [25]:
train["Company response to consumer"]=[x.replace("Untimely response","7") for x in train["Company response to consumer"]]

In [26]:
train["Submitted via"]=[x.replace("Web","1") for x in train["Submitted via"]]
train["Submitted via"]=[x.replace("Referral","2") for x in train["Submitted via"]]
train["Submitted via"]=[x.replace("Phone","3") for x in train["Submitted via"]]
train["Submitted via"]=[x.replace("Postal mail","4") for x in train["Submitted via"]]
train["Submitted via"]=[x.replace("Fax","5") for x in train["Submitted via"]]
train["Submitted via"]=[x.replace("Email","6") for x in train["Submitted via"]]

In [27]:
train["Product"]=[x.replace("Mortgage","1") for x in train["Product"]]
train["Product"]=[x.replace("Debt collection","2") for x in train["Product"]]
train["Product"]=[x.replace("Credit reporting","3") for x in train["Product"]]
train["Product"]=[x.replace("Credit card","4") for x in train["Product"]]
train["Product"]=[x.replace("Bank account or service","5") for x in train["Product"]]
train["Product"]=[x.replace("Consumer Loan","6") for x in train["Product"]]
train["Product"]=[x.replace("Student loan","7") for x in train["Product"]]
train["Product"]=[x.replace("Payday loan","8") for x in train["Product"]]
train["Product"]=[x.replace("Money transfers","9") for x in train["Product"]]
train["Product"]=[x.replace("Prepaid card","10") for x in train["Product"]]
train["Product"]=[x.replace("Other financial service","11") for x in train["Product"]]
train["Product"]=[x.replace("Virtual currency","12") for x in train["Product"]]

In [28]:
for col in ["Product","Submitted via","Company response to consumer"]:
    train[col]=pd.to_numeric(train[col],errors="coerce")

In [29]:
k=train['State'].value_counts()
for val in k.axes[0][0:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    train[varname]=np.where(train['State']==val,1,0)
    test[varname]=np.where(test['State']==val,1,0)
del train['State']
del test['State']

In [30]:
test["Company response to consumer"]=[x.replace("Closed with explanation","1") for x in test["Company response to consumer"]]
test["Company response to consumer"]=[x.replace("Closed with non-monetary relief","2") for x in test["Company response to consumer"]]
test["Company response to consumer"]=[x.replace("Closed with monetary relief","3") for x in test["Company response to consumer"]]
test["Company response to consumer"]=[x.replace("Closed without relief","4") for x in test["Company response to consumer"]]
test["Company response to consumer"]=[x.replace("Closed with relief","6") for x in test["Company response to consumer"]]
test["Company response to consumer"]=[x.replace("Closed","5") for x in test["Company response to consumer"]]

In [31]:
test["Company response to consumer"]=[x.replace("Untimely response","7") for x in test["Company response to consumer"]]

In [32]:
test["Product"]=[x.replace("Mortgage","1") for x in test["Product"]]
test["Product"]=[x.replace("Debt collection","2") for x in test["Product"]]
test["Product"]=[x.replace("Credit reporting","3") for x in test["Product"]]
test["Product"]=[x.replace("Credit card","4") for x in test["Product"]]
test["Product"]=[x.replace("Bank account or service","5") for x in test["Product"]]
test["Product"]=[x.replace("Consumer Loan","6") for x in test["Product"]]
test["Product"]=[x.replace("Student loan","7") for x in test["Product"]]
test["Product"]=[x.replace("Payday loan","8") for x in test["Product"]]
test["Product"]=[x.replace("Money transfers","9") for x in test["Product"]]
test["Product"]=[x.replace("Prepaid card","10") for x in test["Product"]]
test["Product"]=[x.replace("Other financial service","11") for x in test["Product"]]
test["Product"]=[x.replace("Virtual currency","12") for x in test["Product"]]

In [33]:
test['Submitted via'] = test['Submitted via'].fillna(" ")

In [34]:
test.dtypes

Product                                           object
Submitted via                                     object
Company response to consumer                      object
Timely response?                                   int32
Complaint ID                                       int64
day_diff                                           int64
Sub_product_isNan                                  int32
Sub_issue_isNan                                    int32
Consumer_complaint_narrative_isNan                 int32
Company_public_response_isNan                      int32
Tags_isNan                                         int32
Consumer_consent_provided_isNan                    int32
Issue_Loan_modification_collection_foreclosure     int32
Issue_Incorrect_information_on_credit_report       int32
Issue_Loan_servicing__payments__escrow_account     int32
Issue_Cont'd_attempts_collect_debt_not_owed        int32
Issue_Account_opening__closing__or_management      int32
Issue_Disclosure_verification_o

In [35]:
test["Submitted via"]=[x.replace("Web","1") for x in test["Submitted via"]]
test["Submitted via"]=[x.replace("Referral","2") for x in test["Submitted via"]]
test["Submitted via"]=[x.replace("Phone","3") for x in test["Submitted via"]]
test["Submitted via"]=[x.replace("Postal mail","4") for x in test["Submitted via"]]
test["Submitted via"]=[x.replace("Fax","5") for x in test["Submitted via"]]
test["Submitted via"]=[x.replace("Email","6") for x in test["Submitted via"]]

In [36]:
test["Submitted via"]=[x.replace(" ","7") for x in test["Submitted via"]]

In [37]:
test['Submitted via'].value_counts()

1    78448
2    22879
3     8541
4     7900
5     1756
6       81
7        1
Name: Submitted via, dtype: int64

In [38]:
for col in ["Product","Submitted via","Company response to consumer"]:
    test[col]=pd.to_numeric(test[col],errors="coerce")

In [39]:
train.isnull().sum()

Product                                           0
Submitted via                                     0
Company response to consumer                      0
Timely response?                                  0
Consumer disputed?                                0
Complaint ID                                      0
day_diff                                          0
Sub_product_isNan                                 0
Sub_issue_isNan                                   0
Consumer_complaint_narrative_isNan                0
Company_public_response_isNan                     0
Tags_isNan                                        0
Consumer_consent_provided_isNan                   0
Issue_Loan_modification_collection_foreclosure    0
Issue_Incorrect_information_on_credit_report      0
Issue_Loan_servicing__payments__escrow_account    0
Issue_Cont'd_attempts_collect_debt_not_owed       0
Issue_Account_opening__closing__or_management     0
Issue_Disclosure_verification_of_debt             0
Issue_Commun

In [40]:
x=train.drop(['Consumer disputed?','Complaint ID'],1)
y=train['Consumer disputed?']

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
clf = RandomForestClassifier(verbose=1,n_jobs=-1)
# this here is the base classifier we are going to try
# we will be supplying different parameter ranges to our randomSearchCV which in turn
# will pass it on to this classifier

# Utility function to report best scores. This simply accepts grid scores from 
# our randomSearchCV/GridSearchCV and picks and gives top few combination according to 
# their scores

def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    # above line selects top n grid scores
    # for loop below , prints the rank, score and parameter combination
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")
    
# RandomSearchCV/GridSearchCV accept parameters values as dictionaries.
# In example given below we have constructed dictionary for 
#different parameter values that we want to
# try for randomForest model
param_dist = {"n_estimators":[10,100,500,700],
              "max_depth": [3,5, None],
              "max_features": sp_randint(5, 11),
              "min_samples_split": sp_randint(5, 11),
              "min_samples_leaf": sp_randint(5, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# run randomized search
n_iter_search = 5
# n_iter parameter of RandomizedSeacrhCV controls, how many 
# parameter combination will be tried; out of all possible given values

random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
random_search.fit(x,y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   13.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 t

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs

In [43]:
rf=RandomForestClassifier(n_estimators=500,verbose=1,criterion='entropy',min_samples_split=7,
                         bootstrap=False,max_depth=None,max_features=8,min_samples_leaf=5,
                          class_weight="balanced")

In [44]:
rf.fit(x,y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 26.1min finished


RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=None, max_features=8,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=5,
                       min_samples_split=7, min_weight_fraction_leaf=0.0,
                       n_estimators=500, n_jobs=None, oob_score=False,
                       random_state=None, verbose=1, warm_start=False)

In [45]:
y = rf.predict(test.drop(['Complaint ID'],1))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   23.1s finished


In [46]:
submission=pd.DataFrame(list(zip(test['Complaint ID'],list(y))),
                       columns=['Complaint ID','Consumer disputed?'])

In [47]:
submission.head()

Unnamed: 0,Complaint ID,Consumer disputed?
0,675956,0
1,1858795,0
2,32637,1
3,1731374,0
4,501487,1


In [62]:
submission.to_csv('Bharath_Reddy_Python_Project.csv',index=False)