In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score, train_test_split, KFold
import scipy
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read a capped number of rows from each raw complaints file.
train_data, test_data = (
    pd.read_csv(csv_path, nrows=row_cap)
    for csv_path, row_cap in (
        ('Consumer_Complaints_train.csv', 70000),
        ('Consumer_Complaints_test_share.csv', 20000),
    )
)

In [4]:
# Display the column labels of the training frame.
train_data.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [17]:
# Print the number of distinct values held by every training column.
for column_name in train_data.columns:
    distinct_count = train_data[column_name].nunique()
    print(column_name, ' :', distinct_count)

Company response to consumer_Closed with explanation  : 2
Company response to consumer_Closed with monetary relief  : 2
Company response to consumer_Closed with non-monetary relief  : 2
Company response to consumer_Closed with relief  : 2
Company response to consumer_Closed without relief  : 2
Submitted via_Fax  : 2
Submitted via_Phone  : 2
Submitted via_Postal mail  : 2
Submitted via_Referral  : 2
Submitted via_Web  : 2
Product  : 12
Issue  : 93
Consumer complaint narrative  : 11003
Company public response  : 2
Consumer disputed?  : 2
day_diff  : 247
Timely_response  : 2
date_1  : 2
date_2  : 2
date_3  : 2
date_4  : 2
Sub_product_isNan  : 2
Sub_issue_isNan  : 2
Consumer consent provided?_isNan  : 2
State_CA  : 2
State_FL  : 2
State_TX  : 2
State_NY  : 2
State_GA  : 2
State_NJ  : 2
State_PA  : 2
State_IL  : 2
State_MD  : 2
State_VA  : 2
State_OH  : 2
State_NC  : 2
State_MI  : 2
State_AZ  : 2
State_WA  : 2


In [None]:
# Print the number of distinct values held by every test column.
for column_name in test_data.columns:
    distinct_count = test_data[column_name].nunique()
    print(column_name, ' :', distinct_count)

In [5]:
# Convert both date columns of the train and test frames to datetimes.
for frame in (train_data, test_data):
    for col in ['Date received', 'Date sent to company']:
        frame[col] = pd.to_datetime(frame[col], infer_datetime_format=True)

In [6]:
# Time between receiving a complaint and forwarding it to the company, as a number.
# NOTE(review): pd.to_numeric on a timedelta presumably yields nanoseconds rather
# than days, despite the column name; confirm before interpreting the values.
for frame in (train_data, test_data):
    elapsed = frame['Date sent to company'] - frame['Date received']
    frame['day_diff'] = pd.to_numeric(elapsed)

In [7]:
# Binary target: 1 where the consumer answered "Yes", 0 for everything else.
disputed_mask = train_data['Consumer disputed?'] == "Yes"
train_data['Consumer disputed?'] = np.where(disputed_mask, 1, 0)

In [8]:
# Encode timeliness as 0 for an explicit 'No' and 1 for anything else, then
# discard the raw text column in both frames.
# The column is dropped by label: the original went through the `.ix` indexer
# (removed from pandas) and passed a whole DataFrame as the labels argument.
train_data['Timely_response']=np.where(train_data['Timely response?']=='No',0,1)
train_data.drop(['Timely response?'], axis=1, inplace=True)
test_data['Timely_response']=np.where(test_data['Timely response?']=='No',0,1)
test_data.drop(['Timely response?'], axis=1, inplace=True)

In [9]:
# Replace the received date with its calendar month number in both frames.
for frame in (train_data, test_data):
    received = pd.to_datetime(frame['Date received'])
    frame['Date received'] = received.dt.month

In [None]:
# Mean of the dispute flag per received month, rounded to two decimals.
train_data.groupby("Date received")['Consumer disputed?'].mean().round(2)

In [10]:
# Collapse the received month into four buckets and one-hot encode them.
# A single vectorized lookup replaces the per-row `.loc` loop, and the dummy
# columns are pinned to date_1..date_4 so the cell no longer raises KeyError
# when no August row exists (the original dropped a literal column named 8)
# and the column order does not depend on the data.
month_buckets = {
    1: "date_1", 2: "date_1", 3: "date_1", 4: "date_1", 11: "date_1", 12: "date_1",
    5: "date_2", 10: "date_2",
    6: "date_3", 9: "date_3",
    7: "date_4",
}
bucket_names = ["date_1", "date_2", "date_3", "date_4"]

# Months without a bucket (August) map to NaN and so get all-zero indicators,
# which is what dropping the `8` column achieved before.
bucketed = train_data["Date received"].map(month_buckets)
temp = pd.get_dummies(bucketed).reindex(columns=bucket_names, fill_value=0).astype(np.int64)
train_data = pd.concat([train_data, temp], axis=1)
train_data.drop(['Date received'], axis=1, inplace=True)

In [None]:
#train_data.shape

In [11]:
# Collapse the received month into four buckets and one-hot encode them.
# A single vectorized lookup replaces the per-row `.loc` loop, and the dummy
# columns are pinned to date_1..date_4 so the cell no longer raises KeyError
# when no August row exists (the original dropped a literal column named 8)
# and the column order does not depend on the data.
month_buckets = {
    1: "date_1", 2: "date_1", 3: "date_1", 4: "date_1", 11: "date_1", 12: "date_1",
    5: "date_2", 10: "date_2",
    6: "date_3", 9: "date_3",
    7: "date_4",
}
bucket_names = ["date_1", "date_2", "date_3", "date_4"]

# Months without a bucket (August) map to NaN and so get all-zero indicators.
bucketed = test_data["Date received"].map(month_buckets)
temp = pd.get_dummies(bucketed).reindex(columns=bucket_names, fill_value=0).astype(np.int64)
test_data = pd.concat([test_data, temp], axis=1)
test_data.drop(['Date received'], axis=1, inplace=True)

In [11]:
# Remove columns that are not turned into features, from both frames.
unused_columns = ['Date sent to company','Company', 'ZIP code', 'Tags']
for frame in (train_data, test_data):
    for name in unused_columns:
        del frame[name]

In [12]:
# The identifier is removed from the training frame only.
del train_data['Complaint ID']

In [13]:
# One-hot encode the submission channel and the company response.
# The training frame defines the dummy layout (first level dropped as the
# baseline). The test frame is encoded over all of its levels and then
# reindexed to exactly the training columns: levels missing from test become
# all-zero columns and levels unseen in training are discarded, so both
# frames end up with the same indicator columns in the same order.
for col in ['Submitted via', 'Company response to consumer']:
    temp=pd.get_dummies(train_data[col],prefix=col,drop_first=True).astype(np.int64)
    dummy_columns=temp.columns
    train_data=pd.concat([temp,train_data],axis=1)
    train_data.drop([col],axis=1,inplace=True)

    temp=pd.get_dummies(test_data[col],prefix=col).astype(np.int64)
    temp=temp.reindex(columns=dummy_columns,fill_value=0)
    test_data=pd.concat([temp,test_data],axis=1)
    test_data.drop([col],axis=1,inplace=True)

In [14]:
# Replace the public-response text with a flag: 1 when it is missing, else 0.
for frame in (train_data, test_data):
    response_missing = frame['Company public response'].isnull()
    frame['Company public response'] = np.where(response_missing, 1, 0)

In [15]:
# Swap each sparse text column for a "<name>_isNan" missing-value indicator.
for col in ['Sub-product','Sub-issue', 'Consumer consent provided?']:
    varname = col.replace('-','_') + '_isNan'
    for frame in (train_data, test_data):
        frame[varname] = np.where(frame[col].isnull(), 1, 0)
        frame.drop([col], axis=1, inplace=True)

In [16]:
# Indicator columns for the 15 most frequent training states, added to both frames.
k = train_data['State'].value_counts()
top_states = k.index[0:15]
for val in top_states:
    varname = 'State_' + val.replace(',','_').replace(' ','_')
    for frame in (train_data, test_data):
        frame[varname] = np.where(frame['State'] == val, 1, 0)
# The raw state column is no longer needed once the indicators exist.
train_data.drop(['State'], axis=1, inplace=True)
test_data.drop(['State'], axis=1, inplace=True)

In [18]:
# le=LabelEncoder()

In [19]:
# le.fit(train_data['Product'].values)
# train_data['Product']=le.transform(train_data['Product'])
# test_data['Product']=le.transform(test_data['Product'])

In [18]:
# One-hot encode Product on the training frame and append the indicators at the end.
temp = pd.get_dummies(train_data["Product"]).applymap(np.int64)
train_data = pd.concat([train_data.drop(['Product'], axis=1), temp], axis=1)

In [19]:
# Missing narratives become empty strings in both frames.
for frame in (train_data, test_data):
    narrative = frame['Consumer complaint narrative']
    frame['Consumer complaint narrative'] = narrative.fillna(value='')

In [None]:
# Display the current test column labels as an array.
test_data.columns.values

In [None]:
# Display the current training column labels as an array.
train_data.columns.values

In [20]:
from textblob import TextBlob
from nltk.corpus import stopwords
from string import punctuation
# Tokens to discard: English stop words plus every punctuation character.
stop = set(stopwords.words('english')).union(punctuation)
from nltk.stem.lancaster import LancasterStemmer

In [21]:
# Shared stemmer instance used by split_into_lemmas.
st = LancasterStemmer()

In [22]:
def split_into_lemmas(message):
    """Tokenize a message into stemmed words.

    The message is stringified and lower-cased, split into words with
    TextBlob, and filtered: tokens in ``stop``, tokens of four characters or
    fewer, and tokens starting with 'x' are dropped. The survivors are
    stemmed with ``st`` and returned as a list.
    """
    tokens = TextBlob(str(message).lower()).words
    long_tokens = [tok for tok in tokens if tok not in stop and len(tok) > 4]
    return [st.stem(tok) for tok in long_tokens if not tok.startswith('x')]

In [23]:
def split_into_lemma(message):
    """Tokenize a message into lower-cased words, dropping those in ``stop``.

    Unlike split_into_lemmas, no length filter or stemming is applied and the
    message must already be a string (``.lower()`` is called on it directly).
    """
    return [tok for tok in TextBlob(message.lower()).words if tok not in stop]

In [24]:
# Vectorizer later fitted on 'Issue': tokenizes with split_into_lemma, capped at 80 features.
tfidf= TfidfVectorizer(min_df=1, max_df=0.7, analyzer=split_into_lemma, max_features=80)

In [25]:
# Vectorizer later fitted on the complaint narrative: tokenizes with split_into_lemmas, capped at 300 features.
tfidf1= TfidfVectorizer(min_df=0.002, max_df=0.7, analyzer=split_into_lemmas, max_features=300)

In [26]:
# Feature matrix: every training column except the target.
x = train_data.drop(labels=['Consumer disputed?'], axis=1)

In [27]:
# Target vector: the 0/1 dispute flag.
y=train_data['Consumer disputed?']

In [28]:
# Fit the Issue vectorizer on the training issues and transform them in one step.
X_transformed= tfidf.fit_transform(x['Issue'])

In [29]:
# Fit the narrative vectorizer on the training narratives and transform them in one step.
X_transformed1 = tfidf1.fit_transform(x['Consumer complaint narrative'])

In [30]:
# Dense frame of the Issue tf-idf weights, one column per vocabulary term.
tfidf_data = pd.DataFrame(X_transformed.toarray(), columns=tfidf.get_feature_names())

In [31]:
# Dense frame of the narrative tf-idf weights, one column per vocabulary term.
tfidf_data1 = pd.DataFrame(X_transformed1.toarray(), columns=tfidf1.get_feature_names())

In [33]:
#x.reset_index(drop=True, inplace=True)

In [32]:
# Append the Issue tf-idf columns to the features; rows are matched on the index.
x = pd.concat([x, tfidf_data], axis=1)

In [33]:
# Append the narrative tf-idf columns to the features; rows are matched on the index.
x = pd.concat([x, tfidf_data1], axis=1)

In [34]:
# The raw text columns are replaced by their tf-idf features.
x = x.drop(['Issue', 'Consumer complaint narrative'], axis=1)

In [35]:
# Transform the test issues with the already-fitted Issue vectorizer.
X_transform= tfidf.transform(test_data['Issue'])

In [36]:
# Transform the test narratives with the already-fitted narrative vectorizer.
X_transform1 = tfidf1.transform(test_data['Consumer complaint narrative'])

In [37]:
# Dense tf-idf frames for the test rows, labelled with each vectorizer's vocabulary.
tfidf_data = pd.DataFrame(X_transform.toarray(), columns=tfidf.get_feature_names())
tfidf_data1 = pd.DataFrame(X_transform1.toarray(), columns=tfidf1.get_feature_names())

In [38]:
# Append the Issue tf-idf columns to the test frame; rows are matched on the index.
test_data = pd.concat([test_data, tfidf_data], axis=1)

In [39]:
# Append the narrative tf-idf columns to the test frame; rows are matched on the index.
test_data = pd.concat([test_data, tfidf_data1], axis=1)

In [40]:
# The raw text columns are replaced by their tf-idf features.
test_data = test_data.drop(['Issue', 'Consumer complaint narrative'], axis=1)

In [41]:
# Display (rows, columns) of the processed training frame.
train_data.shape

(70000, 50)

In [None]:
from scipy.stats import randint as sp_randint
from time import time
from operator import itemgetter
from sklearn.grid_search import RandomizedSearchCV

In [None]:
# Base estimator for the hyper-parameter search. The seed is fixed so repeated
# runs of the notebook grow the same forests and report the same scores.
clf = RandomForestClassifier(verbose=1,n_jobs=-1,random_state=42)

In [None]:
# Number of parameter settings to sample, and the space they are drawn from:
# lists are sampled uniformly, sp_randint(5, 11) objects are sampled as distributions.
n_iter_search = 20
param_dist = dict(
    n_estimators=[10,100,500,700],
    max_depth=[3,5, None],
    max_features=sp_randint(5, 11),
    min_samples_split=sp_randint(5, 11),
    min_samples_leaf=sp_randint(5, 11),
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

In [None]:
# Randomized hyper-parameter search over param_dist. random_state pins which
# candidates are sampled, so a re-run evaluates the same n_iter_search settings.
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, random_state=42)

In [None]:
# Run the search on the full training features and target.
random_search.fit(x,y)

In [None]:
def report(grid_scores, n_top=3):
    """Print the n_top best entries of a list of search scores.

    Entries are ranked by their element at position 1, highest first. Each
    entry must expose ``mean_validation_score``, ``cv_validation_scores``
    and ``parameters``; for each one the rank, the mean score with the
    standard deviation of the per-fold scores, and the parameters are printed.
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        fold_spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              entry.mean_validation_score, fold_spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")

In [None]:
# Print the three best parameter settings found by the search (report's default n_top).
report(random_search.grid_scores_)

In [None]:
# Final model with hand-set hyper-parameters and balanced class weights.
# random_state makes the fitted forest (and the predictions) reproducible;
# n_jobs=-1 trains the 700 trees in parallel, as the search estimator does,
# without affecting the result.
rf=RandomForestClassifier(n_estimators=700,verbose=1,criterion='gini',min_samples_split=6,
                         bootstrap=False,max_depth=None,max_features=5,min_samples_leaf=5,
                          class_weight="balanced",n_jobs=-1,random_state=42)

In [None]:
# Train the final forest on the full training features and target.
rf.fit(x, y)

In [None]:
# Load one batch of the test file: file rows 1..100000 are skipped, row 0 (the header) is kept.
test_data = pd.read_csv('Consumer_Complaints_test_share.csv', skiprows=range(1, 100001))

In [None]:
# Convert both date columns of the test batch to datetimes.
date_columns = ('Date received', 'Date sent to company')
for col in date_columns:
    parsed = pd.to_datetime(test_data[col], infer_datetime_format=True)
    test_data[col] = parsed

In [None]:
# Time between receiving a complaint and forwarding it to the company, as a number.
# NOTE(review): presumably nanoseconds rather than days — must stay in the same
# unit as the training frame's day_diff.
elapsed = test_data['Date sent to company'] - test_data['Date received']
test_data['day_diff'] = pd.to_numeric(elapsed)

In [None]:
# Encode timeliness as 0 for an explicit 'No' and 1 for anything else, then
# discard the raw text column.
# The column is dropped by label: the original went through the `.ix` indexer
# (removed from pandas) and passed a whole DataFrame as the labels argument.
test_data['Timely_response']=np.where(test_data['Timely response?']=='No',0,1)
test_data.drop(['Timely response?'], axis=1, inplace=True)

In [None]:
# Replace the received date with its calendar month number.
received = pd.to_datetime(test_data['Date received'])
test_data['Date received'] = received.dt.month

In [None]:
# Collapse the received month into four buckets and one-hot encode them.
# A single vectorized lookup replaces the per-row `.loc` loop, and the dummy
# columns are pinned to date_1..date_4 so the cell no longer raises KeyError
# when this batch has no August row (the original dropped a literal column
# named 8) and the column order does not depend on the batch contents.
month_buckets = {
    1: "date_1", 2: "date_1", 3: "date_1", 4: "date_1", 11: "date_1", 12: "date_1",
    5: "date_2", 10: "date_2",
    6: "date_3", 9: "date_3",
    7: "date_4",
}
bucket_names = ["date_1", "date_2", "date_3", "date_4"]

# Months without a bucket (August) map to NaN and so get all-zero indicators.
bucketed = test_data["Date received"].map(month_buckets)
temp = pd.get_dummies(bucketed).reindex(columns=bucket_names, fill_value=0).astype(np.int64)
test_data = pd.concat([test_data, temp], axis=1)
test_data.drop(['Date received'], axis=1, inplace=True)

In [None]:
# Remove columns that are not turned into features.
# The loop variable is deliberately not called `x`: that name holds the
# training feature matrix, and rebinding it to a column label here would make
# any later re-run of the model-fitting cells fail.
for unused_column in ['Date sent to company', 'Company', 'ZIP code', 'Tags']:
    del test_data[unused_column]

In [None]:
# One-hot encode the submission channel and the company response for this batch.
# The indicators are aligned to the dummy columns already present in the
# processed training frame, so a level missing from (or new in) this batch
# cannot shift or add feature columns relative to what the model was fit on.
for col in ['Submitted via', 'Company response to consumer']:
    prefix = col + '_'
    train_dummy_columns = [c for c in train_data.columns if str(c).startswith(prefix)]
    temp = pd.get_dummies(test_data[col], prefix=col).astype(np.int64)
    if train_dummy_columns:
        temp = temp.reindex(columns=train_dummy_columns, fill_value=0)
    else:
        # No processed training columns available: fall back to dropping the
        # first level, which is what drop_first=True did originally.
        temp = temp.iloc[:, 1:]
    test_data=pd.concat([temp,test_data],axis=1)
    test_data.drop([col],axis=1,inplace=True)

In [None]:
# One-hot encode Product the same way the training frame does.
# The original line called `le.transform`, but no `le` encoder is ever created
# in this notebook (its cells are commented out), so it raised NameError, and
# the training features use one indicator column per product, not a label code.
# The product levels are taken from the same training sample the model was
# fit on, so the indicator set matches even if this batch lacks a product.
train_products = pd.read_csv('Consumer_Complaints_train.csv', nrows=70000,
                             usecols=['Product'])['Product']
product_levels = sorted(train_products.dropna().unique())
temp = pd.get_dummies(test_data['Product']).reindex(columns=product_levels, fill_value=0).astype(np.int64)
test_data = pd.concat([test_data, temp], axis=1)
test_data.drop(['Product'], axis=1, inplace=True)
# NOTE(review): the model consumes features by position; confirm the final
# test column order matches the training matrix before predicting.

In [None]:
# Replace the public-response text with a flag: 1 when it is missing, else 0.
response_missing = test_data['Company public response'].isnull()
test_data['Company public response'] = np.where(response_missing, 1, 0)

In [None]:
# Swap each sparse text column for a "<name>_isNan" missing-value indicator.
for col in ('Sub-product', 'Sub-issue', 'Consumer consent provided?'):
    varname = col.replace('-','_') + '_isNan'
    is_missing = test_data[col].isnull()
    test_data[varname] = np.where(is_missing, 1, 0)
    del test_data[col]

In [None]:
# test_data.info()

In [None]:
# Rebuild the state indicators for this batch from the training state frequencies.
# The raw training file is reloaded with the same 70000-row sample the model
# was trained on: the original read only 60000 rows, so the 15 most frequent
# states, or their order, could differ from the State_* columns used in training.
train_data = pd.read_csv('Consumer_Complaints_train.csv', nrows=70000)
k=train_data['State'].value_counts()
for val in k.index[0:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    test_data[varname]=np.where(test_data['State']==val,1,0)
del test_data['State']

In [None]:
# Missing narratives become empty strings.
narrative = test_data['Consumer complaint narrative']
test_data['Consumer complaint narrative'] = narrative.fillna(value='')

In [None]:
#test_data.shape

In [None]:
# Transform this batch's issues with the already-fitted Issue vectorizer.
X_transform= tfidf.transform(test_data['Issue'])

In [None]:
# Transform this batch's narratives with the already-fitted narrative vectorizer.
X_transform1 = tfidf1.transform(test_data['Consumer complaint narrative'])

In [None]:
# Dense tf-idf frames for this batch, labelled with each vectorizer's vocabulary.
tfidf_data = pd.DataFrame(X_transform.toarray(), columns=tfidf.get_feature_names())
tfidf_data1 = pd.DataFrame(X_transform1.toarray(), columns=tfidf1.get_feature_names())

In [None]:
# Append the Issue tf-idf columns to the batch; rows are matched on the index.
test_data = pd.concat([test_data, tfidf_data], axis=1)

In [None]:
# Append the narrative tf-idf columns to the batch; rows are matched on the index.
test_data = pd.concat([test_data, tfidf_data1], axis=1)

In [None]:
# The raw text columns are replaced by their tf-idf features.
test_data = test_data.drop(['Issue', 'Consumer complaint narrative'], axis=1)

In [None]:
#test_data.columns.values

In [None]:
#test_data.drop('Company response to consumer_Untimely response',1, inplace=True)

In [None]:
# Predict on every column except the identifier and map class 1 to "Yes", anything else to "No".
batch_features = test_data.drop(['Complaint ID'], axis=1)
predicted_classes = rf.predict(batch_features)
prediction = np.where(predicted_classes == 1, "Yes", "No")

In [None]:
# Two-column submission frame: complaint identifier and predicted Yes/No label.
submission = pd.DataFrame(
    {
        'Complaint ID': list(test_data['Complaint ID']),
        'Consumer disputed?': list(prediction),
    },
    columns=['Complaint ID','Consumer disputed?'],
)

In [None]:
# Append this batch's rows to submission2.csv, writing neither a header row nor the index.
submission.to_csv('submission2.csv', mode='a', header=False, index=False)

In [None]:
#submission.to_csv('submission2.csv',index=False)