In [1]:
import pandas as pd
import numpy as np
# Show up to 300 columns when a wide DataFrame is rendered.
pd.options.display.max_columns = 300

In [2]:
# Load the training data; add nrows=10000 to the read_csv call for a quick sample run.
train_csv = '../input/santander-customer-satisfaction/train.csv'
df = pd.read_csv(train_csv)

In [3]:
# Preview the first rows of the loaded training frame.
df.head()

**Dropping Constant Features**

In [4]:
from sklearn.feature_selection import VarianceThreshold
# Selector that keeps only columns whose variance exceeds 0.0, i.e. rejects constant columns.
constant_features = VarianceThreshold(threshold=0.0)

In [5]:
# Learn the per-column variances from the full frame.
constant_features.fit(df)
# get_support() is a boolean mask over the columns (True = kept), so its sum is
# the number of features that remain after dropping constant features.
constant_features.get_support().sum()

In [6]:
# get_support() marks the columns the selector keeps; inverting the mask yields
# the constant columns directly, without re-slicing df.columns and scanning it
# once per column.
retained_mask = constant_features.get_support()
constant_columns = df.columns[~retained_mask].tolist()

# Number of constant columns that will be dropped.
len(constant_columns)

In [7]:
# Drop the constant columns from the DataFrame. Rebinding the result (instead of
# inplace=True) keeps the step explicit and avoids mutating an object that other
# names may still reference.
df = df.drop(columns=constant_columns)

In [8]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [9]:
# Disabled leftover: would list the columns rejected by a selector named `quasi`.
# NOTE(review): no `quasi` object is defined in the visible notebook - confirm before re-enabling.
#features = [x for x in df.columns if x not in df.columns[quasi.get_support()]]

In [10]:
# Column count now that the constant features have been removed from df.
# (This cell previously referenced X_train / X_test / constant, none of which
# exist at this point in the notebook, so it failed on a fresh kernel.)
print('Number of variables after removing constant: ', df.shape[1])

# Fraction of observations showing each of the different values of one feature.
# Plain true division replaces np.float, which was removed from NumPy in 1.24.
# Kept as the last expression so the result is displayed as the cell output.
df['ind_var1_0'].value_counts() / len(df)

In [11]:
pip install feature-engine --quiet

In [12]:
from feature_engine.selection import DropCorrelatedFeatures,DropConstantFeatures,DropDuplicateFeatures

In [13]:
# With tol=0.998, a feature is treated as quasi-constant when a single value
# accounts for roughly 99.8% or more of its observations (about 0.2% of rows differ).

quasi_constant = DropConstantFeatures(tol=0.998)
# find quasi-constant features in the train set
quasi_constant.fit(df)

In [14]:
# Number of features the fitted selector has marked for removal.
len(quasi_constant.features_to_drop_)

In [15]:
# Report the column count on either side of the quasi-constant filter.
n_cols_before_quasi = df.shape[1]
print('Number of variables before removing quasi-constant: ', n_cols_before_quasi)

# transform() returns the frame without the features flagged during fit.
df = quasi_constant.transform(df)

n_cols_after_quasi = df.shape[1]
print('Number of variables after removing quasi-constant: ', n_cols_after_quasi)

**Removing the similar or duplicated features**

**Some useful methods of `DropDuplicateFeatures`**

`fit`: Find duplicated features.

`fit_transform`: Fit to data, then transform it.

`get_feature_names_out`: Get output feature names for transformation.

`transform`: Remove duplicated features.

In [16]:
# Create the duplicate-feature selector and let it scan the frame for identical columns.
drop_duplicates = DropDuplicateFeatures()
drop_duplicates.fit(df)

In [17]:
# Groups of mutually identical features found by the fit (shown as the cell output).
drop_duplicates.duplicated_feature_sets_

# The columns that transform() will remove are available as:
# drop_duplicates.features_to_drop_

In [18]:
# Sanity check on one reported pair: Series.equals returns True when the two
# columns hold identical values and False otherwise.
df['ind_var39'].equals(df['ind_var40'])

In [19]:
# Look at the first rows of the two columns side by side.
df[['ind_var39','ind_var40']].head()

In [20]:
# Drop the duplicate features and report the column count before and after.
n_cols_before_dupes = df.shape[1]
print('Number of variables before removing duplicates: ', n_cols_before_dupes)

# transform() returns the frame without the duplicated columns found during fit.
df = drop_duplicates.transform(df)

n_cols_after_dupes = df.shape[1]
print('Number of variables after removing duplicates: ', n_cols_after_dupes)

**Dropping Correlated Features**

In [21]:
# Selector for correlated features: Pearson correlation, threshold 0.85.
# variables=None leaves the choice of columns to the selector's default.
related = DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.85)

In [22]:
# Find the groups of correlated features in the DataFrame.
related.fit(df)

In [23]:
# Groups of correlated features identified by the fit.
related.correlated_feature_sets_

In [24]:
# Number of features the correlation filter has marked for removal.
len(related.features_to_drop_)

In [25]:
# Report the column count on either side of the correlation filter.
n_cols_before_corr = df.shape[1]
print('Number of variables before removing correlated: ', n_cols_before_corr)

# transform() returns the frame without the features flagged during fit.
df = related.transform(df)

n_cols_after_corr = df.shape[1]
print('Number of variables after removing correlated: ', n_cols_after_corr)

In [26]:
# Select the target by name: several column-dropping steps ran before this
# point, so relying on TARGET still being the last column is fragile.
y = df['TARGET']

# Feature matrix: everything except the target. The row identifier carries no
# signal and would otherwise be fed to the resampler and the models as a
# feature; errors='ignore' keeps the cell working if ID is already absent.
X = df.drop(columns='TARGET').drop(columns='ID', errors='ignore')

**Resampling Using SMOTE**

In [27]:
from imblearn.over_sampling import SMOTE
# SMOTE draws synthetic minority samples at random; fixing random_state makes
# the resampled data (and every score computed from it) reproducible.
sampling = SMOTE(random_state=41)

In [28]:
X_scaled, y_ = sampling.fit_resample(X,y)

In [29]:
#splitting
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y_, random_state=41, stratify = y_,shuffle=True)

**Model Building**

In [30]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import RocCurveDisplay, accuracy_score, classification_report
from xgboost import XGBClassifier

# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2. Keep the
# name importable on older versions without breaking this cell on newer ones.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    pass

In [31]:
# Candidate classifiers, keyed by the label used when reporting results.
# All three are stochastic; a fixed random_state (the same seed as the
# train/test split) makes the fitted models and their scores reproducible.
models = {'forest' : RandomForestClassifier(random_state=41),
          'boost' : GradientBoostingClassifier(random_state=41),
          'xgboost' : XGBClassifier(random_state=41)}

In [32]:
# Fit every candidate on the training data, then report hold-out metrics.
# The estimators are fitted in place, so models[...] holds trained models afterwards.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'Classification report for {model_name}')
    print(classification_report(y_test, y_pred))
    print('')
    # RocCurveDisplay.from_estimator replaces plot_roc_curve, which was removed
    # in scikit-learn 1.2. Each call draws on a new figure, one per model.
    roc_display = RocCurveDisplay.from_estimator(model, X_test, y_test, name=model_name)
    roc_display.ax_.set_title(f'ROC curve for {model_name}')

**Submission**

In [33]:
# Load the competition test set used for the submission.
test_csv = '../input/santander-customer-satisfaction/test.csv'
test = pd.read_csv(test_csv)

In [34]:
xgboost = XGBClassifier()

test_ = pipe.fit(test)

In [None]:
probs = xgboost.predict_proba(test_)

final_df = pd.DataFrame({"ID":test_id, "TARGET": probs[:,1]})
probs.to_csv("submission.csv", index=False)