In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file below the Kaggle input mount,
# then echo them one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/marketing-strategy-personalised-offer/sample.csv
/kaggle/input/marketing-strategy-personalised-offer/train_data.csv
/kaggle/input/marketing-strategy-personalised-offer/test_data.csv


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the labelled training file and the unlabelled test file in one pass.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/marketing-strategy-personalised-offer/train_data.csv',
        '../input/marketing-strategy-personalised-offer/test_data.csv',
    )
)

In [4]:
# The last column is the target; everything before it is a feature.
target_position = train_data.shape[1] - 1
X = train_data.iloc[:, :target_position]
# Selecting with a list keeps y as a one-column DataFrame rather than a Series.
y = train_data.iloc[:, [target_position]]

In [5]:
from sklearn.preprocessing import LabelEncoder

# Fit on the two known labels so the mapping is fixed: 'No' -> 0, 'Yes' -> 1.
le = LabelEncoder().fit(['No', 'Yes'])
encoded_target = le.transform(y)
# Keep the original target column name on the encoded frame.
y = pd.DataFrame(encoded_target, columns=y.columns)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
# Removes features whose variance is zero (the VarianceThreshold default).
vt = VarianceThreshold()
# handle_unknown='ignore': a category that was not present when the encoder was
# fitted is encoded as an all-zero row instead of raising, so a fitted pipeline
# can be applied to held-out data. Output on the fitting data is unchanged.
ohe = OneHotEncoder(handle_unknown='ignore')
# Ordered categories; any value outside the list is encoded as NaN so that the
# imputer in `p0` can fill it afterwards.
oe = OrdinalEncoder(categories=[['never','less1','1~3','4~8','gt8']], handle_unknown='use_encoded_value', unknown_value=np.nan)
# Ordered brackets, lowest to highest. No handle_unknown here, so a value outside the list raises.
oe1 = OrdinalEncoder(categories=[['Less than ₹12500','₹12500 - ₹24999','₹25000 - ₹37499','₹37500 - ₹49999','₹50000 - ₹62499','₹62500 - ₹74999','₹75000 - ₹87499','₹87500 - ₹99999','₹100000 or More']])
oe2 = OrdinalEncoder(categories=[['below21','21','26','31','36','41','46','50plus']])
ss = StandardScaler()
si = SimpleImputer(strategy='most_frequent')
pt = 'passthrough'
# Ordinal-encode first, then replace the NaNs left by the encoder with the most frequent code.
p0 = Pipeline([('oe', oe), ('si', si)])
# Column positions refer to the raw feature frame. The same transformer objects
# are listed several times under different step names.
ct = ColumnTransformer(
    [
        ('ss1', ss, [0]),
        ('oe1', oe1, [1]),
        ('p01', p0, [2]),
        ('pt1', pt, [3]),
        ('p02', p0, [4]),
        ('ohe1', ohe, [5, 6]),
        ('oe2', oe2, [7]),
        ('pt2', pt, [8, 9, 10]),
        ('p03', p0, [11]),
        ('ohe2', ohe, [12, 13]),
        ('pt3', pt, [14, 15]),
        ('ohe3', ohe, [16, 17]),
        ('pt4', pt, [18]),
        ('p04', p0, [19]),
        ('ohe4', ohe, [20]),
        ('pt5', pt, [21, 22, 23]),
        ('ss2', ss, [24]),
        ('p05', p0, [25]),
        ('ss3', ss, [26]),
        ('ohe5', ohe, [27, 28]),
    ],
    remainder='passthrough',
)
# Full preprocessing: column-wise encoding/scaling followed by the variance filter.
pl = Pipeline([('ct', ct), ('vt', vt)])

In [7]:
def preprocess(data, fit=True):
    """Recode 'offer expiration' to hours and run the frame through the pipeline `pl`.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw feature frame containing an 'offer expiration' column. The frame
        is not modified; the recoding is applied to a copy.
    fit : bool, default True
        When True (the original behaviour) `pl.fit_transform` is used, so the
        pipeline is (re)fitted on `data`. Pass False to apply an already
        fitted pipeline with `pl.transform`, e.g. for held-out or test data.

    Returns
    -------
    pandas.DataFrame
        The pipeline output wrapped in a DataFrame with default integer
        column labels.
    """
    # Work on a copy and assign the recoded column back explicitly. The previous
    # `data[col].replace(..., inplace=True)` mutated the caller's frame and does
    # nothing at all when pandas copy-on-write is active.
    data = data.copy()
    data['offer expiration'] = data['offer expiration'].replace({'10hours': 10, '2days': 48})
    transformed = pl.fit_transform(data) if fit else pl.transform(data)
    return pd.DataFrame(transformed)

In [8]:
# Fit the preprocessing pipeline on the training features only.
X = preprocess(X)
# Apply the pipeline fitted above to the test set instead of refitting it there:
# refitting would derive the scaler statistics, imputer modes, one-hot layout
# and variance mask from the test data, so its columns would no longer be
# guaranteed to line up with the features the models are trained on.
# The 'offer expiration' recoding mirrors what preprocess() does.
test_data = pd.DataFrame(
    pl.transform(
        test_data.replace({'offer expiration': {'10hours': 10, '2days': 48}})
    )
)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.1, random_state=124)
X_train, X_val, y_train, y_val = split_parts

In [10]:
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

In [11]:
def val_score(model):
    """Fit `model` on the training split and print its F1 score on the validation split.

    Reads the notebook-level `X_train`, `y_train`, `X_val` and `y_val`.
    The estimator is fitted in place; nothing is returned.

    Parameters
    ----------
    model : estimator
        Any object exposing `fit(X, y)` and `predict(X)`.
    """
    # The targets are flattened to 1-D before use. scikit-learn expects shape
    # (n_samples,) and otherwise emits a DataConversionWarning for a
    # column-vector y, which the global warnings filter would hide.
    model.fit(X_train, np.ravel(y_train))
    y_pred = model.predict(X_val)
    print('f1_score : ', f1_score(np.ravel(y_val), y_pred), ' [ Validation Score ]')

In [12]:
def cv(model,parameters):
    """Grid-search `parameters` for `model` with 10-fold CV and print the best result.

    Uses the notebook-level `X` and `y`, scores candidates with F1 and runs
    on all available cores. Nothing is returned.

    Parameters
    ----------
    model : estimator
        Estimator to tune.
    parameters : dict or list of dict
        Parameter grid passed straight to GridSearchCV.
    """
    gscv = GridSearchCV(model, parameters, scoring='f1', n_jobs=-1, cv=10)
    # Flatten the target to 1-D: scikit-learn expects shape (n_samples,) and
    # otherwise warns about a column-vector y on every fit.
    gscv.fit(X, np.ravel(y))
    print('GridSearchCV')
    print('-> best_estimator_ :', gscv.best_estimator_)
    print('-> best_score_ :', gscv.best_score_)

In [13]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent class.
dc = DummyClassifier(
    strategy='most_frequent',
)
val_score(dc)
# cv(dc,{'strategy':['most_frequent','prior','stratified','uniform']})

f1_score :  0.7309072270630447  [ Validation Score ]


In [14]:
from sklearn.linear_model import RidgeClassifier

# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this constructor call only works on older releases.
rc = RidgeClassifier(
    alpha=3.6,
    tol=1e-1,
    normalize=True,
    random_state=124,
)
val_score(rc)
# cv(rc,{'alpha':[*np.arange(0,11,0.1)],'fit_intercept':[True,False],'normalize':[True,False],'class_weight':['balanced',None],'solver':['auto','svd','cholesky','lsqr','sparse_cg','sag','saga','lbfgs'],'positive':[True,False]})

f1_score :  0.7349524815205914  [ Validation Score ]


In [15]:
from sklearn.linear_model import SGDClassifier

# Linear model trained with SGD (default loss), seeded for repeatability.
sgdc = SGDClassifier(
    alpha=3,
    random_state=124,
    n_jobs=-1,
)
val_score(sgdc)
# cv(sgdc,{'loss':['hinge','log_loss','log','modified_huber','squared_hinge','perceptron','squared_error','huber','epsilon_insensitive','squared_epsilon_insensitive'],'penalty':['l1','l2','elasticnet']})

f1_score :  0.7302564102564102  [ Validation Score ]


In [16]:
from sklearn.linear_model import Perceptron

# Perceptron with an elastic-net penalty (60% L1), seeded for repeatability.
p = Perceptron(
    alpha=1e-3,
    tol=1e-1,
    penalty='elasticnet',
    l1_ratio=0.6,
    random_state=124,
    n_jobs=-1,
    warm_start=True,
)
val_score(p)
# cv(p,{'penalty':['l1','l2','elasticnet',None],'l1_ratio':[*np.arange(0,1.1,0.1)],'fit_intercept':[True,False],'class_weight':['balanced',None]})

f1_score :  0.6872451951077461  [ Validation Score ]


In [17]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression fitted with the 'saga' solver.
lr = LogisticRegression(
    tol=1e-1,
    penalty='l1',
    solver='saga',
    random_state=124,
    n_jobs=-1,
    warm_start=True,
)
val_score(lr)
# cv(lr,{'penalty':['l1','l2','elasticnet',None],'dual':[True,False],'fit_intercept':[True,False],'class_weight':['balanced',None],'solver':['newton-cg','lbfgs','liblinear','sag','saga']})

f1_score :  0.6971569839307789  [ Validation Score ]


In [18]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes; features are thresholded at 0.8 before fitting.
bnb = BernoulliNB(
    alpha=0.4,
    binarize=0.8,
)
val_score(bnb)
# cv(bnb,{'alpha':[*np.arange(0,1.1,0.1)],'binarize':[*np.arange(0,1.1,0.1)],'fit_prior':[True,False]})

f1_score :  0.6776099803020355  [ Validation Score ]


In [19]:
from sklearn.svm import SVC

# NOTE(review): a polynomial kernel of degree 0 looks degenerate -- the recorded
# validation F1 is identical to the DummyClassifier baseline. Confirm this is intended.
svc = SVC(
    C=0.1,
    tol=1e-1,
    kernel='poly',
    degree=0,
    gamma='scale',
    random_state=124,
)
val_score(svc)
# cv(svc,{'C':[*np.arange(0,1.1,0.1)],'kernel':['linear','poly','rbf','sigmoid'],'degree':[0,1,2,3],'gamma':['scale','auto']})

f1_score :  0.7309072270630447  [ Validation Score ]


In [20]:
from sklearn.neighbors import KNeighborsClassifier

# Distance-weighted vote over 750 neighbours using the Manhattan metric (p=1).
knc = KNeighborsClassifier(
    n_neighbors=750,
    weights='distance',
    p=1,
    n_jobs=-1,
)
val_score(knc)
# cv(knc,{'n_neighbors':[*range(746,756)],'weights':['uniform','distance'],'p':[0,1,2]})

f1_score :  0.735262593783494  [ Validation Score ]


In [21]:
from sklearn.tree import DecisionTreeClassifier
# max_features='sqrt' is exactly what 'auto' meant for classifiers; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises.
dtc = DecisionTreeClassifier(max_features='sqrt',ccp_alpha=0.1,random_state=124)
val_score(dtc)
# cv(dtc,{'criterion':['gini','entropy','log_loss'],'splitter':['best','random'],'max_features':['auto','sqrt','log2'],'class_weight':[None,'balanced'],'ccp_alpha':[*np.arange(0,1.1,0.1)]})

f1_score :  0.7309072270630447  [ Validation Score ]


In [22]:
from sklearn.ensemble import RandomForestClassifier
# Seeded like the other models in this notebook. Without a random_state the
# validation score and the final submission change on every run.
rfc = RandomForestClassifier(random_state=124)
val_score(rfc)
# cv(rfc,{'ccp_alpha':[*np.arange(0,1.1,0.1)],'criterion':['gini','entropy','log_loss'],'max_features':[None,'sqrt','log2'],'class_weight':[None,'balanced','balanced_subsample']})

f1_score :  0.7073631214600378  [ Validation Score ]


In [23]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Passive-aggressive linear classifier fitted without an intercept term.
pac = PassiveAggressiveClassifier(
    C=1,
    fit_intercept=False,
    random_state=124,
    n_jobs=-1,
    warm_start=True,
)
val_score(pac)
# cv(pac,{'C':[*range(0,11,1)],'fit_intercept':[True,False],'class_weight':['balanced',None],'warm_start':[True,False]})

f1_score :  0.7188841201716738  [ Validation Score ]


In [24]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with logistic activations, trained by SGD with an
# inverse-scaling learning-rate schedule.
mlpc = MLPClassifier(
    activation='logistic',
    learning_rate='invscaling',
    solver='sgd',
    random_state=124,
)
val_score(mlpc)
# cv(mlpc,{'activation':['identity','logistic','tanh','relu'],'solver':['lbfgs','sgd','adam'],'learning_rate':['constant','invscaling','adaptive']})

f1_score :  0.7309072270630447  [ Validation Score ]


In [25]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with exponential loss and cost-complexity pruning of the trees.
gbc = GradientBoostingClassifier(
    ccp_alpha=0.2,
    loss='exponential',
    random_state=124,
    warm_start=True,
)
val_score(gbc)
# cv(gbc,{'loss':['log_loss','deviance','exponential'],'criterion':['friedman_mse','squared_error','mse'],'warm_start':[True,False]})

f1_score :  0.7309072270630447  [ Validation Score ]


In [26]:
# from sklearn.ensemble import BaggingClassifier
# rc_bc = BaggingClassifier(rc,random_state=124,n_jobs=-1,warm_start=True)
# val_score(rc_bc)
# sgdc_bc = BaggingClassifier(sgdc,random_state=124,n_jobs=-1,warm_start=True)
# val_score(sgdc_bc)
# svc_bc = BaggingClassifier(svc,random_state=124,n_jobs=-1,warm_start=True)
# val_score(svc_bc)
# knc_bc = BaggingClassifier(knc,random_state=124,n_jobs=-1,warm_start=True)
# val_score(knc_bc)
# dtc_bc = BaggingClassifier(dtc,random_state=124,n_jobs=-1,warm_start=True)
# val_score(dtc_bc)
# rfc_bc = BaggingClassifier(rfc,random_state=124,n_jobs=-1,warm_start=True)
# val_score(rfc_bc)
# mlpc_bc = BaggingClassifier(mlpc,random_state=124,n_jobs=-1,warm_start=True)
# val_score(mlpc_bc)
# gbc_bc = BaggingClassifier(gbc,random_state=124,n_jobs=-1,warm_start=True)
# val_score(gbc_bc)

In [27]:
# Refit the forest on every labelled row before predicting. The target is
# flattened to 1-D because scikit-learn expects shape (n_samples,) and
# otherwise warns about a column-vector y.
rfc.fit(X, np.ravel(y))

RandomForestClassifier()

In [28]:
# Decode the predicted class codes back to their original labels.
predicted_labels = le.inverse_transform(rfc.predict(test_data))
submission = pd.DataFrame(predicted_labels, columns=['Offer Accepted'])
# Name the index 'id' and turn it into a regular column, so each row carries its position.
submission = submission.rename_axis('id').reset_index()
submission.to_csv('Submission.csv', index=False)