In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Data Cleaning

In [2]:
train=pd.read_csv('C:/Users/Hp/OneDrive/Documents/IIT KANPUR PYTHON/1st PROJECT/Consumer_Complaints_train.csv')

In [3]:
test=pd.read_csv('C:/Users/Hp/OneDrive/Documents/IIT KANPUR PYTHON/1st PROJECT/Consumer_Complaints_test_share.csv')

In [4]:
for col in ['Date received','Date sent to company']:
    train[col]=pd.to_datetime(train[col],infer_datetime_format=True)
    test[col]=pd.to_datetime(test[col],infer_datetime_format=True)

In [5]:
train['day_diff']=pd.to_numeric(train['Date sent to company']-train['Date received'])
test['day_diff']=pd.to_numeric(test['Date sent to company']-test['Date received'])

In [6]:
for col in ['Date received','Date sent to company']:
    train.drop([col],1,inplace=True)
    test.drop([col],1,inplace=True)

In [7]:
for col in train.select_dtypes(['object']).columns:
    print(col,':',train[col].nunique())

Product : 12
Sub-product : 47
Issue : 95
Sub-issue : 68
Consumer complaint narrative : 74019
Company public response : 10
Company : 3276
State : 62
ZIP code : 25962
Tags : 3
Consumer consent provided? : 4
Submitted via : 6
Company response to consumer : 7
Timely response? : 2
Consumer disputed? : 2


In [8]:
for col in ['Sub-product','Sub-issue','Consumer complaint narrative',
            'Company public response','Tags','Consumer consent provided?']:
    varname=col.replace('-','_').replace('?','').replace(" ",'_')+'_isNan'
    train[varname]=np.where(pd.isnull(train[col]),1,0)
    train.drop([col],1,inplace=True)
    test[varname]=np.where(pd.isnull(test[col]),1,0)
    test.drop([col],1,inplace=True)

In [9]:
for col in train.select_dtypes(['object']).columns:
    print(col,':',train[col].nunique())

Product : 12
Issue : 95
Company : 3276
State : 62
ZIP code : 25962
Submitted via : 6
Company response to consumer : 7
Timely response? : 2
Consumer disputed? : 2


In [10]:
for col in ['ZIP code','Company']:
    train.drop([col],1,inplace=True)
    test.drop([col],1,inplace=True)

In [11]:
train['Consumer disputed?']=np.where(train['Consumer disputed?']=="Yes",1,0)

In [12]:
k=train['Issue'].value_counts()
for val in k.axes[0][0:10]:
    varname='Issue_'+val.replace(',','_').replace(' ','_')
    train[varname]=np.where(train['Issue']==val,1,0)
    test[varname]=np.where(test['Issue']==val,1,0)
del train['Issue']
del test['Issue']

In [13]:
for col in train.select_dtypes(['object']).columns:
    print(col,':',train[col].nunique())

Product : 12
State : 62
Submitted via : 6
Company response to consumer : 7
Timely response? : 2


In [14]:
k=train['State'].value_counts()
for val in k.axes[0][0:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    train[varname]=np.where(train['State']==val,1,0)
    test[varname]=np.where(test['State']==val,1,0)
del train['State']
del test['State']

In [15]:
for col in ['Product','Submitted via','Company response to consumer','Timely response?']:
    
    temp=pd.get_dummies(train[col],prefix=col,drop_first=True)
    train=pd.concat([temp,train],1)
    train.drop([col],1,inplace=True)
    
    temp=pd.get_dummies(test[col],prefix=col,drop_first=True)
    test=pd.concat([temp,test],1)
    test.drop([col],1,inplace=True)

# Logistic Regression

In [16]:
x=train.drop(['Consumer disputed?','Complaint ID'],1)
y=train['Consumer disputed?']

In [17]:
params={'class_weight':['balanced',None],
        'penalty':['l1','l2'],
        'C':[.0001,.0005,.001,.005,.01,.05,.1,1,2,5]}

In [18]:
params={'class_weight':['balanced',None],
        'penalty':['l1','l2'],
        'C':[.0001,.0005,.001,.005,.01]}

In [19]:
model=LogisticRegression()

In [20]:
grid_search=GridSearchCV(model,
                         param_grid=params,
                         cv=10,
                         scoring="roc_auc",
                         n_jobs=-1,
                         verbose=20)

In [21]:
grid_search.fit(x,y)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.0001, 0.0005, 0.001, 0.005, 0.01],
                         'class_weight': ['balanced', None],
                         'penalty': ['l1', 'l2']},
             scoring='roc_auc', verbose=20)

In [22]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [23]:
report(grid_search.cv_results_,5)

Model with rank: 1
Mean validation score: 0.544694 (std: 0.002791)
Parameters: {'C': 0.0001, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 1
Mean validation score: 0.544694 (std: 0.002791)
Parameters: {'C': 0.0005, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 1
Mean validation score: 0.544694 (std: 0.002791)
Parameters: {'C': 0.001, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 1
Mean validation score: 0.544694 (std: 0.002791)
Parameters: {'C': 0.005, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 1
Mean validation score: 0.544694 (std: 0.002791)
Parameters: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2'}



In [30]:
logistic_r=LogisticRegression(**{'C': 0.001, 'class_weight': 'balanced', 'penalty': 'l2'})

In [24]:
logistic=LogisticRegression(**{'C': 0.0001, 'class_weight': 'balanced', 'penalty': 'l2'})

In [31]:
logistic_r.fit(x,y)

LogisticRegression(C=0.001, class_weight='balanced')

In [32]:
prediction=np.where(logistic.predict(test.drop(['Complaint ID'],1))==1,"Yes","No")

In [33]:
Final_prediction=pd.DataFrame(list(zip(test['Complaint ID'],list(prediction))),
                       columns=['Complaint ID','Consumer disputed?'])

In [34]:
Final_prediction.head(4)

Unnamed: 0,Complaint ID,Consumer disputed?
0,675956,Yes
1,1858795,Yes
2,32637,No
3,1731374,No


In [36]:
Final_prediction.to_csv('sample_taj_submission.csv',index=False)