In [10]:
# Data manipulation
from sklearn.preprocessing import LabelEncoder
from scipy.stats import loguniform
import numpy as np
import pandas as pd
import os

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()
%matplotlib inline

# Consistently sized plots: default figure size plus tick/title font sizes,
# applied in a single update of matplotlib's global rcParams.
from pylab import rcParams
rcParams.update({
    'figure.figsize': (12, 5),
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.titlesize': 14,
})
# Dark grid background with a grey, dotted grid.
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})

# Suppress all warnings globally (note: this also hides deprecation warnings from pandas/seaborn)
import warnings
warnings.filterwarnings('ignore')

# Machine learning
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from lightgbm import LGBMModel,LGBMClassifier
import catboost as cb
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier

# Model Evaluations
import sklearn as sk
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score

#no constraint on the data columns to be displayed in jupyter notebook
pd.options.display.max_columns = None

# Handle Date & Time
from datetime import datetime

# Handle zip files
import zipfile

# Print module versions for reproducibility (one line per library, in a fixed order)
for _label, _module in (
    ('CatBoost', cb),
    ('NumPy', np),
    ('Pandas', pd),
    ('seaborn', sns),
    ('sklearn', sk),
    ('xgboost', xgb),
    ('lightgbm', lgb),
):
    print('{} version {}'.format(_label, _module.__version__))

# User Defined Functions

# a) BoxPlot:- To check Outliers
def Create_BoxPlotS(DataFrame, Features, Rows, Columns):
    """Draw one box plot per feature on a grid of subplots and show the figure.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Source data; each feature is read as ``DataFrame[Feature]``.
    Features : iterable
        Column labels to plot, one subplot each, in iteration order.
    Rows, Columns : int
        Grid shape passed to ``fig.add_subplot``; the grid must have at least
        as many cells as there are features.

    Returns
    -------
    None
    """
    fig = plt.figure(figsize=(20, 20))
    for i, Feature in enumerate(Features):
        ax = fig.add_subplot(Rows, Columns, i + 1)
        # Pass the values by keyword. A bare positional Series is bound to `x`
        # (with a FutureWarning) on seaborn 0.11 but to `data` on newer
        # releases, which changes the orientation of the plot.
        sns.boxplot(x=DataFrame[Feature], ax=ax)
    fig.tight_layout()
    plt.show()
    
# b) DistPlot:- To check Normality
def Create_DistPlots(DataFrame, Features, Rows, Columns):
    """Draw a density histogram with a KDE curve for each feature and show the figure.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Source data; each feature is read as ``DataFrame[feature]``.
    Features : iterable
        Column labels to plot, one subplot each, in iteration order.
    Rows, Columns : int
        Grid shape passed to ``fig.add_subplot``; the grid must have at least
        as many cells as there are features.

    Returns
    -------
    None
    """
    fig = plt.figure(figsize=(20, 20))
    for i, feature in enumerate(Features):
        ax = fig.add_subplot(Rows, Columns, i + 1)
        # `sns.distplot` is deprecated; `histplot` with a density-normalised
        # histogram and a KDE overlay is its documented replacement.
        sns.histplot(x=DataFrame[feature], kde=True, stat="density",
                     kde_kws=dict(cut=3), ax=ax)
    fig.tight_layout()
    plt.show()

# c) Univariate Analysis of Categorical Variables
def Univariate_Cat_Features(DataFrame, Features, Rows, Columns):
    """Draw one count plot per feature on a grid of subplots and show the figure.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Source data; each feature is read as ``DataFrame[feature]``.
    Features : iterable
        Column labels to plot, one subplot each, in iteration order.
    Rows, Columns : int
        Grid shape passed to ``fig.add_subplot``; the grid must have at least
        as many cells as there are features.

    Returns
    -------
    None
    """
    fig = plt.figure(figsize=(20, 20))
    for i, feature in enumerate(Features):
        ax = fig.add_subplot(Rows, Columns, i + 1)
        # Pass the values by keyword so the call means the same thing on every
        # seaborn release (a bare positional argument is `x` on 0.11 but
        # `data` on newer versions).
        sns.countplot(x=DataFrame[feature], ax=ax)
    fig.tight_layout()
    plt.show()
    
# d) Scatter Plot:- Bivariate Analysis between Target Variable & Independent Variables
# Note:- Pass Target_Variable as a string (the name of the target column)
def Create_ScatterPlots(DataFrame, Features, Target_Variable, Rows, Columns):
    """Scatter each feature (x axis) against the target (y axis) and show the figure.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Source data holding both the features and the target column.
    Features : iterable
        Column labels to plot on the x axis, one subplot each.
    Target_Variable : column label
        Label of the column plotted on the y axis of every subplot.
    Rows, Columns : int
        Grid shape passed to ``fig.add_subplot``; the grid must have at least
        as many cells as there are features.

    Returns
    -------
    None
    """
    fig = plt.figure(figsize=(20, 20))
    for i, feature in enumerate(Features):
        ax = fig.add_subplot(Rows, Columns, i + 1)
        # x/y must be keywords: newer seaborn accepts only `data` positionally,
        # so two positional vectors raise a TypeError there.
        sns.scatterplot(x=DataFrame[feature], y=DataFrame[Target_Variable], ax=ax)
    fig.tight_layout()
    plt.show()

# e) Joint Plot:- Bivariate Analysis between Target Variable & Independent Variables
# Note:- Pass Target_Variable as a string (the name of the target column)
def Create_JointPlots(DataFrame, Features, Target_Variable):
    """Draw a regression joint plot of the target against each feature.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Source data holding both the features and the target column.
    Features : iterable
        Column labels plotted on the x axis, one joint plot each.
    Target_Variable : column label
        Label of the column plotted on the y axis; it is looked up in
        ``DataFrame`` through the ``data=`` argument.

    Returns
    -------
    None
    """
    # `sns.jointplot` is figure-level: every call builds its own figure.
    # No shared figure is created up front, because nothing would be drawn on
    # it and `plt.show()` would render it as a blank canvas.
    for feature in Features:
        sns.jointplot(y=Target_Variable, x=DataFrame[feature], data=DataFrame, kind='reg')
    plt.show()

# f) Check Feature wise Outliers   
def Check_Outliers(DataFrame, Column_List):
    """Print an IQR-based outlier summary for each listed column.

    For every column the quartiles are taken with ``Series.quantile`` and the
    limits are set 1.5 * IQR below the first and above the third quartile.
    Values strictly outside those limits are counted as outliers. The count,
    both limits and the IQR are printed; nothing is returned.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Data to inspect.
    Column_List : iterable of str
        Names of the columns to summarise.
    """
    for column in Column_List:
        values = DataFrame[column]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile      # inter-quartile range
        margin = 1.5 * spread
        lower_fence = first_quartile - margin
        upper_fence = third_quartile + margin

        # Strict comparisons: values equal to a limit (and NaN) are not outliers.
        outside = values.lt(lower_fence) | values.gt(upper_fence)
        outlier_count = int(outside.sum())

        quoted = '"' + column + '"'
        print('\nNumber of outliers in ' + quoted + ' :' + str(outlier_count))
        print('\nLower Limit in ' + quoted + ' :', lower_fence)
        print('\nUpper Limit in ' + quoted + ' :', upper_fence)
        print('\nInter Quartile Range of ' + quoted + ' :', spread, "\n")

# g) Label Encoding   
def Label_Encoder(DataFrame):
    """Replace every object-dtype column of ``DataFrame`` with integer labels, in place.

    Each object column is fitted and transformed independently with
    scikit-learn's ``LabelEncoder``. The codes are therefore only meaningful
    within the frame that was passed in: encoding two frames with separate
    calls does not give matching codes for the same category.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to encode; it is modified in place.

    Returns
    -------
    None
    """
    for col in DataFrame.columns:
        if DataFrame[col].dtype == 'object':
            # Assign the whole column rather than `DataFrame.loc[:, col] = ...`:
            # on pandas >= 2.0 the `.loc` form writes into the existing object
            # column in place, so the integer codes would keep object dtype.
            DataFrame[col] = LabelEncoder().fit_transform(DataFrame[col])
            
            
# Load the competition files (paths are absolute and machine-specific).
train = pd.read_csv (r'C:\Users\1014070\Favorites\AB\TG\HCP-June23\Train1.csv',encoding='latin-1')
test = pd.read_csv (r'C:\Users\1014070\Favorites\AB\TG\HCP-June23\Test1.csv',encoding='latin-1')
#test1 = pd.read_csv (r'C:\Users\1014070\Favorites\AB\TG\HCP-June23\Test.csv',encoding='latin-1')
submission = pd.read_csv (r'C:\Users\1014070\Favorites\AB\TG\HCP-June23\Sample_Submission.csv',encoding='latin-1')


# Fill missing values with the most frequent value of the column, computed
# separately for each frame.
# `fillna(..., inplace=True)` returns None, so assigning its result back to the
# column would replace the whole column with None. The non-inplace form is
# used here so the filled Series is what gets assigned.
for _frame in (train, test):
    for _col in ('USERCITY', 'USERZIPCODE', 'USERAGENT'):
        _frame[_col] = _frame[_col].fillna(_frame[_col].mode()[0])

# Rows without a target label are treated as class 0.
train.loc[(train['IS_HCP'].isnull()),'IS_HCP'] = 0

# Encode text columns that exist in BOTH frames with one encoder fitted on the
# combined values, so a given category maps to the same integer in train and
# test. Encoding the frames separately would give each its own numbering and
# make the codes incomparable at prediction time.
_shared_text_cols = [
    _col for _col in train.columns
    if _col in test.columns
    and train[_col].dtype == 'object'
    and test[_col].dtype == 'object'
]
for _col in _shared_text_cols:
    _encoder = LabelEncoder().fit(pd.concat([train[_col], test[_col]], ignore_index=True))
    train[_col] = _encoder.transform(train[_col])
    test[_col] = _encoder.transform(test[_col])

# Any object columns still left (present in, or text in, only one frame) are
# encoded per frame as before.
Label_Encoder(train)
Label_Encoder(test)

CatBoost version 1.2
NumPy version 1.21.5
Pandas version 1.4.2
seaborn version 0.11.2
sklearn version 1.0.2
xgboost version 1.7.6
lightgbm version 3.3.5


In [11]:
#train = train.drop(['TAXONOMY'],axis=1) #S1


# Columns removed from both frames before modelling.
_common_drop_cols = [
    'DEVICETYPE', 'PLATFORM_ID', 'BIDREQUESTIP',
    'USERCITY', 'USERZIPCODE', 'USERAGENT',
    'PLATFORMTYPE', 'CHANNELTYPE',
    'URL', 'USERPLATFORMUID',
]

# TAXONOMY is additionally dropped from train; it is not in the test drop list.
train = train.drop(columns=_common_drop_cols + ['TAXONOMY'])

test = test.drop(columns=_common_drop_cols)
# ##############################  S2     #############################
# train = train.drop(['TAXONOMY','URL'],axis=1)

# test = test.drop(['URL'],axis=1)


##############################  S3    #############################

# train = train.drop(['TAXONOMY','URL','USERAGENT'],axis=1)

# test = test.drop(['URL','USERAGENT'],axis=1)

##############################  S4    #############################

# train = train.drop(['TAXONOMY','URL','USERPLATFORMUID'],axis=1)

# test = test.drop(['URL','USERPLATFORMUID'],axis=1)

##############################  S5    #############################

# train = train.drop(['DEVICETYPE','PLATFORM_ID','BIDREQUESTIP'
#                     ,'USERCITY','USERZIPCODE','USERAGENT'
#                     ,'PLATFORMTYPE','CHANNELTYPE','TAXONOMY'
#                     ,'URL','USERPLATFORMUID'],axis=1)

# test = test.drop(['DEVICETYPE','PLATFORM_ID','BIDREQUESTIP'
#                     ,'USERCITY','USERZIPCODE','USERAGENT'
#                     ,'PLATFORMTYPE','CHANNELTYPE'
#                     ,'URL','USERPLATFORMUID'],axis=1)

# THIS IS BASE REFERENCE
##############################  S6    #############################

# train = train.drop(['PLATFORM_ID','BIDREQUESTIP'
#                     ,'USERCITY','USERZIPCODE','USERAGENT'
#                     ,'PLATFORMTYPE','CHANNELTYPE','TAXONOMY'
#                     ,'URL','USERPLATFORMUID'],axis=1)

# test = test.drop(['PLATFORM_ID','BIDREQUESTIP'
#                     ,'USERCITY','USERZIPCODE','USERAGENT'
#                     ,'PLATFORMTYPE','CHANNELTYPE'
#                     ,'URL','USERPLATFORMUID'],axis=1)

#DEVICETYPE:- Not Giving much impact

##############################  S7    #############################

# train = train.drop(['BIDREQUESTIP'
#                     ,'USERCITY','USERZIPCODE','USERAGENT'
#                     ,'PLATFORMTYPE','CHANNELTYPE','TAXONOMY'
#                     ,'URL','USERPLATFORMUID'],axis=1)

# test = test.drop(['BIDREQUESTIP'
#                     ,'USERCITY','USERZIPCODE','USERAGENT'
#                     ,'PLATFORMTYPE','CHANNELTYPE'
#                     ,'URL','USERPLATFORMUID'],axis=1)

##############################  S8    #############################

# train = train.drop(['DEVICETYPE','PLATFORM_ID','BIDREQUESTIP'
#                     ,'USERZIPCODE','USERAGENT'
#                     ,'PLATFORMTYPE','CHANNELTYPE','TAXONOMY'
#                     ,'URL','USERPLATFORMUID'],axis=1)

# test = test.drop(['DEVICETYPE','PLATFORM_ID','BIDREQUESTIP'
#                     ,'USERZIPCODE','USERAGENT'
#                     ,'PLATFORMTYPE','CHANNELTYPE'
#                     ,'URL','USERPLATFORMUID'],axis=1)
# RMVD "USERCITY"

##############################  S9    #############################

# train = train.drop(['DEVICETYPE','PLATFORM_ID','BIDREQUESTIP'
#                     ,'USERCITY','USERAGENT'
#                     ,'PLATFORMTYPE','CHANNELTYPE','TAXONOMY'
#                     ,'URL','USERPLATFORMUID'],axis=1)

# test = test.drop(['DEVICETYPE','PLATFORM_ID','BIDREQUESTIP'
#                     ,'USERCITY','USERAGENT'
#                     ,'PLATFORMTYPE','CHANNELTYPE'
#                     ,'URL','USERPLATFORMUID'],axis=1)

#RMVD "USERZIPCODE"

In [12]:
# Separate the target column (IS_HCP) from the predictor columns.
y = train['IS_HCP']
X = train.drop(columns=['IS_HCP'])

In [13]:
# Ensemble member 1: CatBoost trained on an 89% random subset of the data.
# `random_state` is fixed so the split (and therefore the fitted model) is the
# same on every Restart & Run All; without it each run trains on a different
# subset. The held-out X_test / y_test are not used in the cells shown here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.11, shuffle=True, random_state=0
)  # 94.7336 (score noted by the original author; metric not shown)

model33 = CatBoostClassifier(
    random_state=0,
    depth=10,
    iterations=100,
    learning_rate=0.042134,
    logging_level='Silent',
)
model33.fit(X_train,y_train)


<catboost.core.CatBoostClassifier at 0x1ad44a94790>

In [14]:
# Ensemble member 2: same CatBoost settings, trained on an 88% random subset.
# `random_state` is fixed so the split (and therefore the fitted model) is the
# same on every Restart & Run All; without it each run trains on a different
# subset. Note that this overwrites X_train / X_test / y_train / y_test from
# any earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.12, shuffle=True, random_state=1
)  # 94.9897 (score noted by the original author; metric not shown)

model334 = CatBoostClassifier(
    random_state=0,
    depth=10,
    iterations=100,
    learning_rate=0.042134,
    logging_level='Silent',
)
model334.fit(X_train,y_train)


<catboost.core.CatBoostClassifier at 0x1ad4cab3040>

In [16]:
# Convert the prepared test frame to an array for prediction of the target variable
x = np.array(test)

# Class-probability predictions from each fitted model (one row per test sample)
y_pred1 = model33.predict_proba(x)
y_pred2 = model334.predict_proba(x)

# Weighted soft vote: blend the two probability matrices element-wise.
# Both members are CatBoost models; the weight names are kept as they were.
cb_weight=0.60
lb_weight=0.40
predictions = (cb_weight * y_pred1) + (lb_weight * y_pred2)

# The index of the highest blended probability is used as the predicted class
# (assumes the classes are labelled 0 and 1 - TODO confirm against IS_HCP).
preds_ensemble=np.argmax(predictions,axis=1)

y_pred = preds_ensemble #95.362

print("Result of Ensemble Technique used for Submission")

# Save in Dataframe
df1=pd.DataFrame(y_pred,columns=['IS_HCP']); print(type(df1))

# Replace the placeholder IS_HCP column of the sample submission with the predictions.
submission1 = submission.drop(['IS_HCP'], axis=1)
if len(submission1) != len(df1):
    # pd.concat(axis=1) aligns on the index and would silently pad the shorter
    # frame with NaN, producing an invalid submission file.
    raise ValueError(
        "Row count mismatch: sample submission has {} rows but {} predictions were made".format(
            len(submission1), len(df1)
        )
    )
final_pred = pd.concat([submission1,df1], axis=1)
final_pred.to_csv(r'C:\Users\1014070\Favorites\AB\TG\HCP-June23\Ensemb.csv',index=False)
print("Process Completed")

Result of Ensemble Technique used for Submission
<class 'pandas.core.frame.DataFrame'>
Process Completed
