In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv("Wine.csv")

In [None]:
df.head()

In [5]:
x = df.drop('Customer_Segment', axis=1)
y = df['Customer_Segment']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [None]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

In [None]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt.score(x_test, y_test)

In [None]:
#Naive Aggregartion
from sklearn.ensemble import VotingClassifier
# estimators are list of tuples (name of algo, object of algo)
estimator_list = [('Logistic Reg', LogisticRegression()),
              ('DT gini', DecisionTreeClassifier()),
              ('DT entropy', DecisionTreeClassifier(criterion='entropy'))]
# Hard voting
hard = VotingClassifier(estimators=estimator_list)
hard.fit(x_train, y_train)
print("Hard Voting = ", hard.score(x_test, y_test))
#Soft Voting
soft = VotingClassifier(estimators=estimator_list, voting="soft")
soft.fit(x_train, y_train)
print("Soft Voting = ", soft.score(x_test, y_test))

In [None]:
df.shape

In [12]:
#Stacking
from mlxtend.classifier import StackingClassifier
#list base model
classifier_list = [DecisionTreeClassifier(), 
                   DecisionTreeClassifier(), 
                   DecisionTreeClassifier(),
                   DecisionTreeClassifier()]
#meta classifier
meta_class = LogisticRegression()
stack = StackingClassifier(classifiers=classifier_list, meta_classifier=meta_class, use_probas=True)
stack.fit(x_train, y_train)
stack.score(x_test, y_test)

0.9259259259259259

In [14]:
#Bootstrap Aggregation
from sklearn.ensemble import BaggingClassifier
#Bagging
bagging = BaggingClassifier(LogisticRegression(), n_estimators=4, max_samples=40)
bagging.fit(x_train, y_train)
print("Bagging with Logistic Regression = ", bagging.score(x_test, y_test))
bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=4, max_samples=40)
bagging.fit(x_train, y_train)
print("Bagging with Decision Tree = ", bagging.score(x_test, y_test))
#Pasting
pasting = BaggingClassifier(LogisticRegression(), n_estimators=4, max_samples=40, bootstrap=False)
pasting.fit(x_train, y_train)
print("pasting with Logistic Regression = ", pasting.score(x_test, y_test))
pasting = BaggingClassifier(DecisionTreeClassifier(), n_estimators=4, max_samples=40, bootstrap=False)
pasting.fit(x_train, y_train)
print("pasting with DT = ", pasting.score(x_test, y_test))

Bagging with Logistic Regression =  0.9074074074074074
Bagging with Decision Tree =  0.8333333333333334
pasting with Logistic Regression =  0.9259259259259259
pasting with DT =  0.8888888888888888


In [15]:
#Ada Boosting
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(base_estimator=LogisticRegression())
ada.fit(x_train, y_train)
ada.score(x_test, y_test)

0.9444444444444444

In [16]:
#Gradient Tree Boosting
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)
gb.score(x_test, y_test)

0.9444444444444444

In [17]:
# pip install xgboost

In [18]:
from xgboost import XGBClassifier
xgb = XGBClassifier(base_estimator=LogisticRegression())
xgb.fit(x_train, y_train)
xgb.score(x_test, y_test)

0.9444444444444444

In [19]:
# pip install imbalanced-learn

In [20]:
data = pd.read_csv("paribas_data.csv")

In [21]:
data.shape

(12052, 133)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12052 entries, 0 to 12051
Columns: 133 entries, ID to v131
dtypes: float64(111), int64(3), object(19)
memory usage: 12.2+ MB


In [23]:
num_data = data.select_dtypes(['int64', 'float64'])
cat_data = data.select_dtypes(object)

In [26]:
cat_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12052 entries, 0 to 12051
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v3      11704 non-null  object
 1   v22     11990 non-null  object
 2   v24     12052 non-null  object
 3   v30     5698 non-null   object
 4   v31     11704 non-null  object
 5   v47     12052 non-null  object
 6   v52     12052 non-null  object
 7   v56     11340 non-null  object
 8   v66     12051 non-null  object
 9   v71     12051 non-null  object
 10  v74     12051 non-null  object
 11  v75     12051 non-null  object
 12  v79     12051 non-null  object
 13  v91     12051 non-null  object
 14  v107    12051 non-null  object
 15  v110    12051 non-null  object
 16  v112    12001 non-null  object
 17  v113    6199 non-null   object
 18  v125    12040 non-null  object
dtypes: object(19)
memory usage: 1.7+ MB


In [27]:
cat_data.drop(['v30', 'v113'], axis=1, inplace=True)

In [34]:
for col in cat_data:
  cat_data[col].fillna(cat_data[col].mode()[0], inplace=True)

In [35]:
cat_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12052 entries, 0 to 12051
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v3      12052 non-null  object
 1   v22     12052 non-null  object
 2   v24     12052 non-null  object
 3   v31     12052 non-null  object
 4   v47     12052 non-null  object
 5   v52     12052 non-null  object
 6   v56     12052 non-null  object
 7   v66     12052 non-null  object
 8   v71     12052 non-null  object
 9   v74     12052 non-null  object
 10  v75     12052 non-null  object
 11  v79     12052 non-null  object
 12  v91     12052 non-null  object
 13  v107    12052 non-null  object
 14  v110    12052 non-null  object
 15  v112    12052 non-null  object
 16  v125    12052 non-null  object
dtypes: object(17)
memory usage: 1.6+ MB


In [36]:
from sklearn.preprocessing import LabelEncoder
for col in cat_data:
  le = LabelEncoder()
  cat_data[col] = le.fit_transform(cat_data[[col]])
cat_data.head()

Unnamed: 0,v3,v22,v24,v31,v47,v52,v56,v66,v71,v74,v75,v79,v91,v107,v110,v112,v125
0,2,5077,2,0,2,6,59,2,2,1,3,4,0,4,1,14,21
1,2,2407,2,0,4,6,71,0,2,1,3,3,1,1,0,20,6
2,2,2205,4,0,2,5,12,0,0,1,1,4,6,2,1,18,5
3,2,487,3,1,2,7,33,0,2,1,3,1,1,1,1,9,63
4,2,2498,4,0,7,7,33,2,2,1,3,2,6,2,0,19,88


In [37]:
num_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12052 entries, 0 to 12051
Columns: 114 entries, ID to v131
dtypes: float64(111), int64(3)
memory usage: 10.5 MB


In [42]:
num_x = num_data.drop('target', axis=1)
matrix = num_x.corr()
final_features = set()
for i in range(len(matrix.columns)):
  for j in range(i):
    if abs(matrix.iloc[i, j]) > 0.8:
      colname = matrix.columns[i]
      final_features.add(colname)

In [43]:
len(final_features)

55

In [44]:
num_data.drop(labels=final_features, axis=True, inplace=True)

In [45]:
num_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12052 entries, 0 to 12051
Data columns (total 59 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      12052 non-null  int64  
 1   target  12052 non-null  int64  
 2   v1      6814 non-null   float64
 3   v2      6820 non-null   float64
 4   v4      6820 non-null   float64
 5   v5      6952 non-null   float64
 6   v6      6814 non-null   float64
 7   v7      6814 non-null   float64
 8   v8      6953 non-null   float64
 9   v9      6810 non-null   float64
 10  v10     12045 non-null  float64
 11  v11     6813 non-null   float64
 12  v13     6814 non-null   float64
 13  v14     12051 non-null  float64
 14  v15     6813 non-null   float64
 15  v16     6806 non-null   float64
 16  v17     6820 non-null   float64
 17  v18     6814 non-null   float64
 18  v19     6811 non-null   float64
 19  v20     6812 non-null   float64
 20  v23     6723 non-null   float64
 21  v26     6814 non-null   float64
 22

In [51]:
for col in  num_data:
  num_data[col].fillna(num_data[col].mean(), inplace=True)

In [52]:
new_data = pd.concat([cat_data, num_data], axis=1)

In [53]:
x = new_data.drop(['ID', 'target'], axis=1)
y = new_data['target']

In [54]:
from imblearn.ensemble import BalancedBaggingClassifier

In [55]:
bbg = BalancedBaggingClassifier(LogisticRegression(), bootstrap=True, n_estimators=50)

In [56]:
bbg.fit(x, y)

BalancedBaggingClassifier(base_estimator=LogisticRegression(C=1.0,
                                                            class_weight=None,
                                                            dual=False,
                                                            fit_intercept=True,
                                                            intercept_scaling=1,
                                                            l1_ratio=None,
                                                            max_iter=100,
                                                            multi_class='auto',
                                                            n_jobs=None,
                                                            penalty='l2',
                                                            random_state=None,
                                                            solver='lbfgs',
                                                            tol=0.0001,
                  

In [57]:
bbg.score(x, y)

0.6130932625290408