In [1]:
## Importing required libraries
import pandas as pd #for data preprocessing

#Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

#Linear Algebra
import numpy as np
 
#Import Datetime module
from datetime import datetime

from sklearn.model_selection import KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV, train_test_split #For splitting

#Evaluation Metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

#To ignore unnecessary warnings
import warnings

from sklearn.preprocessing import LabelEncoder,OneHotEncoder # for encoding categorical variables

In [2]:
#Local folder that holds the competition CSV files.
#Change DATA_DIR to point at wherever the datasets live on your machine.
DATA_DIR = "C:/Users/pc/Desktop/DSN/ai-bootcamp-2021"

#Read the labelled training data and the unlabelled competition test data
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [3]:
#Preview the first 20 rows of the training frame
train.head(20)

Unnamed: 0,ID,Year_of_Birth,Education_Level,Marital_Status,Disposable_Income,No_of_Kids_in_home,No_of_Teen_in_home,Date_Customer,Recency,Discounted_Purchases,...,Amount_on_SweetProducts,Amount_on_GoldProds,WebVisitsMonth,Cmp3Accepted,Cmp4Accepted,Cmp5Accepted,Cmp1Accepted,Cmp2Accepted,Any_Complain,Response
0,ID_4A9AR6FQ,1955,Graduation,Divorced,77504.4,1,1,22-06-2014,56,2,...,6,20,3,0,0,0,0,0,0,0
1,ID_X28T3VEK,1958,Graduation,Together,56784.0,0,1,01-08-2013,17,6,...,15,19,6,0,0,0,0,0,0,0
2,ID_AWXARH57,1962,Graduation,Single,103714.8,0,0,21-11-2013,17,0,...,18,37,2,0,0,0,1,0,0,1
3,ID_FQVZHE81,1979,2n Cycle,Single,46311.6,1,0,13-09-2013,49,3,...,24,24,8,0,0,0,0,0,0,0
4,ID_QVLWGPRN,1959,Graduation,Married,87486.0,0,0,21-01-2014,59,1,...,165,203,1,0,0,0,1,0,0,0
5,ID_3S3HRGH6,1960,Master,Married,41090.4,0,1,23-03-2014,32,1,...,0,1,5,0,0,0,0,0,0,0
6,ID_KJQ9B2LE,1953,Graduation,Single,86673.6,0,0,27-11-2012,94,1,...,14,56,3,0,0,0,0,0,0,0
7,ID_6O0NBZXD,1974,Graduation,Together,88711.2,0,0,02-03-2013,61,1,...,91,15,3,0,0,0,0,0,0,0
8,ID_M3QFT8WO,1966,Graduation,Together,35606.4,1,1,12-03-2013,13,1,...,4,8,6,0,0,0,0,0,0,0
9,ID_745JV5PY,1975,Graduation,Married,41305.2,1,0,01-07-2013,88,1,...,2,9,7,0,0,0,0,0,0,0


In [4]:
#Start the submission frame as an empty table with the two expected columns
submission_columns = ['ID', 'Response']
sub_file = pd.DataFrame(columns=submission_columns)
sub_file.head()

Unnamed: 0,ID,Response


In [5]:
#Copy the test IDs into the submission frame before ID is removed from test
sub_file['ID'] = test['ID']
sub_file.head()

Unnamed: 0,ID,Response
0,ID_ZPMABNVX,
1,ID_WFE91NAA,
2,ID_JV11RBRK,
3,ID_6B7SVKY9,
4,ID_GOVUZ545,


In [6]:
#Remove the ID column from both the train and test frames (in place)
for frame in (train, test):
    frame.drop(columns=['ID'], inplace=True)

In [7]:
#Discard the WebPurchases and CatalogPurchases columns from both frames (in place)
purchase_cols_to_drop = ['WebPurchases', 'CatalogPurchases']
train.drop(purchase_cols_to_drop, axis=1, inplace=True)
test.drop(purchase_cols_to_drop, axis=1, inplace=True)

In [8]:
# Split the column names by dtype.
# categ_cols == object/category columns. Date_Customer is stored as text,
# so it is INCLUDED here alongside Education_Level and Marital_Status.
categ_cols = train.select_dtypes(include=['object', 'category']).columns

# num_cols == every remaining column, in original order
# (this list still contains the Response target).
is_categorical = train.columns.isin(categ_cols)
num_cols = train.columns[~is_categorical].tolist()

print(f'The Categorical Columns are: {categ_cols}')
print(f'The Numerical Columns are: {num_cols}')

The Categorical Columns are: Index(['Education_Level', 'Marital_Status', 'Date_Customer'], dtype='object')
The Numerical Columns are: ['Year_of_Birth', 'Disposable_Income', 'No_of_Kids_in_home', 'No_of_Teen_in_home', 'Recency', 'Discounted_Purchases', 'WebPurchases', 'StorePurchases', 'Amount_on_Wines', 'Amount_on_Fruits', 'Amount_on_MeatProducts', 'Amount_on_FishProducts', 'Amount_on_SweetProducts', 'Amount_on_GoldProds', 'WebVisitsMonth', 'Cmp3Accepted', 'Cmp4Accepted', 'Cmp5Accepted', 'Cmp1Accepted', 'Cmp2Accepted', 'Any_Complain', 'Response']


In [9]:
# Label-encode the categorical columns.
# The encoder is fitted once per column on the train and test values together,
# so a given category receives the same integer code in both frames. Fitting
# separately on each frame derives the codes from that frame's own sorted set of
# unique values, which silently mis-aligns train and test codes whenever the two
# sets differ.
# NOTE(review): Date_Customer is encoded here as an arbitrary string label;
# parsing it as a date would probably give a more meaningful feature - confirm.
le = LabelEncoder()
for col in categ_cols:
  le.fit(pd.concat([train[col], test[col]], ignore_index=True))
  train[col] = le.transform(train[col])
  test[col] = le.transform(test[col])
train.head()

Unnamed: 0,Year_of_Birth,Education_Level,Marital_Status,Disposable_Income,No_of_Kids_in_home,No_of_Teen_in_home,Date_Customer,Recency,Discounted_Purchases,WebPurchases,...,Amount_on_SweetProducts,Amount_on_GoldProds,WebVisitsMonth,Cmp3Accepted,Cmp4Accepted,Cmp5Accepted,Cmp1Accepted,Cmp2Accepted,Any_Complain,Response
0,1955,2,2,77504.4,1,1,430,56,2,1,...,6,20,3,0,0,0,0,0,0,0
1,1958,2,5,56784.0,0,1,12,17,6,5,...,15,19,6,0,0,0,0,0,0,0
2,1962,2,4,103714.8,0,0,418,17,0,7,...,18,37,2,0,0,0,1,0,0,1
3,1979,0,4,46311.6,1,0,258,49,3,4,...,24,24,8,0,0,0,0,0,0,0
4,1959,2,3,87486.0,0,0,402,59,1,3,...,165,203,1,0,0,0,1,0,0,0


In [10]:
#Separate the Response target from the remaining feature columns
y = train['Response']
X = train.drop(columns=['Response'])

In [11]:
#Hold out 20% of the labelled data for validation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
#import lightgbm as lgb #install this library using pip install lightgbm if you don't have it
from catboost import CatBoostClassifier #Install this Library using pip install catboost, before importing if you don't have it 
from xgboost import XGBClassifier

In [13]:
#Creating model
#NOTE: the variable keeps the name `cat_model` because the later prediction
#cell references it, but the estimator is an XGBoost classifier, not CatBoost.
cat_model = XGBClassifier()

#Training the XGBoost model on the train split
cat_model.fit(X_train, y_train)

#Predicting on the held-out validation split
y_pred_cat = cat_model.predict(X_test)

#Evaluating model using f1_score
print("XGBoost F1 score on validation set is : ", f1_score(y_test, y_pred_cat))



Catboost F1 score on validation set is :  0.4571428571428571


In [16]:
#Now predict on the competition test frame loaded from test.csv
#(this is `test`, not the X_test validation split)
cat_preds = cat_model.predict(test)

In [17]:
#Fill the Response column of the submission frame with the model predictions
sub_file['Response'] = cat_preds
sub_file.head()

Unnamed: 0,ID,Response
0,ID_ZPMABNVX,0
1,ID_WFE91NAA,0
2,ID_JV11RBRK,0
3,ID_6B7SVKY9,0
4,ID_GOVUZ545,0


In [18]:
#Write the submission frame to disk as CSV, leaving out the row index
submission_path = 'C:/Users/pc/Desktop/DSN/ai-bootcamp-2021/submissionC.csv'
sub_file.to_csv(submission_path, index=False)