In [20]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder

In [105]:
df = pd.read_csv('train.csv')

In [106]:
df.head()

Unnamed: 0,customer_id,cust_age,job_title,marital_status,education,has_default_loan,cust_balance,has_mortgage_loan,has_personal_loan,contact_channel,contact_day,contact_month,contact_duration,campaign,pdays,previous_contacts,previous_outcome,is_subscribed
0,11218,48.0,blue-collar,single,,no,71.0,yes,no,,8,may,101,2,-1,0,,False
1,30655,60.0,pensioner,married,secondary,no,781.0,yes,yes,social-network,12,aug,128,5,-1,0,,False
2,24190,34.0,blue-collar,married,secondary,no,157.0,yes,no,social-network,11,jul,254,2,-1,0,,False
3,26270,50.0,manager,married,university-degree,no,29.0,no,yes,,23,jul,104,1,-1,0,,False
4,20989,60.0,manager,married,university-degree,no,3301.0,no,no,,17,jun,2621,3,-1,0,,True


In [107]:
df.isna().sum()

customer_id              0
cust_age               307
job_title              204
marital_status           0
education             1325
has_default_loan         0
cust_balance           373
has_mortgage_loan        0
has_personal_loan        0
contact_channel       9151
contact_day              0
contact_month            0
contact_duration         0
campaign                 0
pdays                    0
previous_contacts        0
previous_outcome     25889
is_subscribed            0
dtype: int64

In [108]:
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, f1_score
from sklearn.impute import KNNImputer

In [109]:
df[['cust_age','cust_balance']] = df[['cust_age','cust_balance']].astype(str)

df['cust_age'] = df['cust_age'].map(lambda df: df.rstrip('/r').rstrip('/'))
df['cust_balance']= df['cust_balance'].map(lambda df: df.rstrip('/r').lstrip('/'))

df['cust_balance'] = df['cust_balance'].str.extract('(\d+)', expand=False)
df['cust_balance']= df['cust_balance'].astype(float)

df['contact_day'] = df['contact_day'].astype(str)

# encoder=LabelEncoder()
# df['is_subscribed']=encoder.fit_transform(df['is_subscribed'])

In [110]:
df.head(2)

Unnamed: 0,customer_id,cust_age,job_title,marital_status,education,has_default_loan,cust_balance,has_mortgage_loan,has_personal_loan,contact_channel,contact_day,contact_month,contact_duration,campaign,pdays,previous_contacts,previous_outcome,is_subscribed
0,11218,48.0,blue-collar,single,,no,71.0,yes,no,,8,may,101,2,-1,0,,False
1,30655,60.0,pensioner,married,secondary,no,781.0,yes,yes,social-network,12,aug,128,5,-1,0,,False


In [111]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31647 entries, 0 to 31646
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   customer_id        31647 non-null  int64  
 1   cust_age           31647 non-null  object 
 2   job_title          31443 non-null  object 
 3   marital_status     31647 non-null  object 
 4   education          30322 non-null  object 
 5   has_default_loan   31647 non-null  object 
 6   cust_balance       31272 non-null  float64
 7   has_mortgage_loan  31647 non-null  object 
 8   has_personal_loan  31647 non-null  object 
 9   contact_channel    22496 non-null  object 
 10  contact_day        31647 non-null  object 
 11  contact_month      31647 non-null  object 
 12  contact_duration   31647 non-null  int64  
 13  campaign           31647 non-null  int64  
 14  pdays              31647 non-null  int64  
 15  previous_contacts  31647 non-null  int64  
 16  previous_outcome   575

In [112]:
df.isna().sum()

customer_id              0
cust_age                 0
job_title              204
marital_status           0
education             1325
has_default_loan         0
cust_balance           375
has_mortgage_loan        0
has_personal_loan        0
contact_channel       9151
contact_day              0
contact_month            0
contact_duration         0
campaign                 0
pdays                    0
previous_contacts        0
previous_outcome     25889
is_subscribed            0
dtype: int64

In [113]:
df['previous_outcome'].value_counts()

failure    3408
other      1286
success    1064
Name: previous_outcome, dtype: int64

In [114]:
df['job_title'] = df['job_title'].fillna('working-from-home')
df['education'] = df['education'].fillna('no-ed')
df['cust_balance'] = df['cust_balance'].fillna(np.mean(df['cust_balance']))
df['contact_channel'] = df['contact_channel'].fillna('no_contact')
df['previous_outcome'] = df['previous_outcome'].fillna('none_outcome')

In [23]:
df.columns

Index(['customer_id', 'cust_age', 'job_title', 'marital_status', 'education',
       'has_default_loan', 'cust_balance', 'has_mortgage_loan',
       'has_personal_loan', 'contact_channel', 'contact_day', 'contact_month',
       'contact_duration', 'campaign', 'pdays', 'previous_contacts',
       'previous_outcome', 'is_subscribed'],
      dtype='object')

In [115]:
numeric = ['cust_balance', 'campaign', 'contact_duration', 'pdays', 'previous_contacts']
categorical = ['marital_status', 'education', 'has_default_loan','job_title', 
               'has_mortgage_loan', 'previous_outcome', 'has_personal_loan', 
               'contact_channel', 'contact_day', 'contact_month']

In [116]:
X = df[numeric+categorical]
y = df['is_subscribed']

In [117]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31647 entries, 0 to 31646
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cust_balance       31647 non-null  float64
 1   campaign           31647 non-null  int64  
 2   contact_duration   31647 non-null  int64  
 3   pdays              31647 non-null  int64  
 4   previous_contacts  31647 non-null  int64  
 5   marital_status     31647 non-null  object 
 6   education          31647 non-null  object 
 7   has_default_loan   31647 non-null  object 
 8   job_title          31647 non-null  object 
 9   has_mortgage_loan  31647 non-null  object 
 10  previous_outcome   31647 non-null  object 
 11  has_personal_loan  31647 non-null  object 
 12  contact_channel    31647 non-null  object 
 13  contact_day        31647 non-null  object 
 14  contact_month      31647 non-null  object 
dtypes: float64(1), int64(4), object(10)
memory usage: 3.6+ MB


In [119]:
pipe = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), numeric),
            ('categorical', Pipeline(steps=[
                ('ohe', OneHotEncoder())
            ]), categorical),
        ])),
        ('classifier', CatBoostClassifier(verbose=False))
])
pipe.fit(X, y)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('scale',
                                                                   StandardScaler())]),
                                                  ['cust_balance', 'campaign',
                                                   'contact_duration', 'pdays',
                                                   'previous_contacts']),
                                                 ('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder())]),
                                                  ['marital_status',
                                                   'education',
                                                   'has_default_loan',
                                                   'job_title',
              

In [77]:
## Catboost
sss = StratifiedKFold()
f1 = []
pipe = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), numeric),
            ('categorical', Pipeline(steps=[
                ('ohe', OneHotEncoder())
            ]), categorical),
        ])),
        ('classifier', CatBoostClassifier(verbose=False))
    ])
for train_idx, test_idx in sss.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    pipe.fit(X_train, y_train)
    y_predicted=pipe.predict(X_test)
    f_score = f1_score(y_test, y_predicted)
    print(f_score)
    f1.append(f_score)
    
print(np.mean(f1))

0.5522151898734178
0.5567010309278351
0.5445935280189423
0.5369649805447471
0.5329153605015674
0.544678017973302


In [78]:
print(np.mean(f1))

0.544678017973302


### Load and predict on test  

In [120]:
test_df = pd.read_csv('test.csv')

In [121]:
test_df.shape

(13564, 17)

In [122]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13564 entries, 0 to 13563
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   customer_id        13564 non-null  int64 
 1   cust_age           13422 non-null  object
 2   job_title          13480 non-null  object
 3   marital_status     13564 non-null  object
 4   education          13032 non-null  object
 5   has_default_loan   13564 non-null  object
 6   cust_balance       13403 non-null  object
 7   has_mortgage_loan  13564 non-null  object
 8   has_personal_loan  13564 non-null  object
 9   contact_channel    9695 non-null   object
 10  contact_day        13564 non-null  int64 
 11  contact_month      13564 non-null  object
 12  contact_duration   13564 non-null  int64 
 13  campaign           13564 non-null  int64 
 14  pdays              13564 non-null  int64 
 15  previous_contacts  13564 non-null  int64 
 16  previous_outcome   2494 non-null   objec

In [123]:
test_df[['cust_age','cust_balance']] = test_df[['cust_age','cust_balance']].astype(str)

test_df['cust_age'] = test_df['cust_age'].map(lambda test_df: test_df.rstrip('/r').rstrip('/'))
test_df['cust_balance']= test_df['cust_balance'].map(lambda test_df: test_df.rstrip('/r').lstrip('/'))

test_df['cust_balance'] = test_df['cust_balance'].str.extract('(\d+)', expand=False)
test_df['cust_balance']= test_df['cust_balance'].astype(float)

test_df['contact_day'] = test_df['contact_day'].astype(str)

# encoder=LabelEncoder()
# test_df['is_subscribed']=encoder.fit_transform(test_df['is_subscribed'])

In [124]:
test_df['job_title'] = test_df['job_title'].fillna('working-from-home')
test_df['education'] = test_df['education'].fillna('no-ed')
test_df['cust_balance'] = test_df['cust_balance'].fillna(np.mean(test_df['cust_balance']))
test_df['contact_channel'] = test_df['contact_channel'].fillna('no_contact')
test_df['previous_outcome'] = test_df['previous_outcome'].fillna('none_outcome')

In [125]:
testX = test_df[numeric+categorical]

In [153]:
result = pipe.predict(testX)
result

array(['False', 'False', 'True', ..., 'False', 'False', 'True'],
      dtype=object)

In [154]:
result = pd.Series(result).apply(lambda x: True if x=='True' else False)

In [155]:
submission = pd.read_csv('sample_submission.csv')

In [156]:
submission['is_subscribed'] = result

In [160]:
submission.to_csv('submissions/elshan/submission_elshan_01.', index=False)

In [161]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13564 entries, 0 to 13563
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   customer_id    13564 non-null  int64
 1   is_subscribed  13564 non-null  bool 
dtypes: bool(1), int64(1)
memory usage: 119.3 KB


In [159]:
submission

Unnamed: 0,customer_id,is_subscribed
0,22154,False
1,31186,False
2,42151,True
3,14944,False
4,13657,False
...,...,...
13559,43181,False
13560,37261,False
13561,39647,False
13562,11882,False
