In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import keras
from pandas.plotting import scatter_matrix
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.under_sampling import EditedNearestNeighbours

from keras.models import Sequential
from keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier

from pympler import tracker
import pickle

In [2]:
import tensorflow as tf
print(tf.__version__)

2.16.1


### Loading data

In [3]:
df=pd.read_csv("train.csv")

In [3]:
df.head()

Unnamed: 0,ind_recommended,activation,customer_digital_activity_04,customer_spend_01,customer_industry_spend_01,customer_industry_spend_02,customer_industry_spend_03,customer_industry_spend_04,customer_industry_spend_05,customer_spend_02,...,merchant_spend_09,merchant_profile_03,customer_digital_activity_01,merchant_spend_10,customer_profile_03,customer_digital_activity_02,customer_profile_04,distance_05,customer,merchant
0,0,0,,107.215862,26.686594,74.0,3682.75,138.0,111.0,14.0,...,49466.0,65923.0,0.0,29.18,58.434969,32.5,86.0,15.856826,168972,152285
1,0,0,,35.552,50.928261,3.0,1171.35,23.0,17.0,2.0,...,3638.0,7801.0,0.419355,28.465,5.392089,7.0,125.0,6.998555,212404,39032
2,0,0,,31.623103,48.837872,19.0,2295.38,47.0,42.0,11.0,...,3912.0,12868.0,0.836364,421.5,33.780445,0.0,180.0,1.753009,225178,7439
3,0,0,,112.277391,,,,,,16.0,...,28919.0,23553.0,0.952381,50.0,37.340085,28.666667,134.0,9.000063,183948,485069
4,1,0,,448.427273,,,,,,5.0,...,1086.0,308.0,0.754386,69.509,77.794164,15.0,114.0,1.767939,210107,536004


In [5]:
size = df.shape
valid_counts = df.count()

column_of_interest = 'activation'

# Get the unique values and the number of unique values
unique_values = df[column_of_interest].unique()
value_counts = df[column_of_interest].value_counts()

print("Value counts for column '{}':\n{}".format(column_of_interest, value_counts))

Value counts for column 'activation':
activation
0    12159962
1       70016
Name: count, dtype: int64


### Preprocessing data


#### Drop rows
major issue of missing values in multiple entires. if eliminate all NA values, the data will have small size\
drop rows with more than half the features being NA 

In [None]:
drop_na  = df.dropna(thresh=len(df.columns)/2)
del df # save memories
print ('After dropping those with more than half the data dimension being na, the size is:', drop_na.shape)

After dropping those with more than half the data dimension being na, the size is: (7822511, 71)


#### Drop columns
customer_merchant_01, customer_merchant_02, customer_digital_activity_01, 02, 07, 08, 09, 18\
Assumption: they do not contribute significantly to the recommendation

In [None]:
y_train = drop_na.activation
columns_to_drop = []
for col in drop_na.columns:
    if drop_na[col].count() < size[0]*0.1:
        columns_to_drop.append(col)

data_train = drop_na.drop(columns_to_drop, axis=1)


del drop_na


In [9]:
X_train = data_train.drop(['ind_recommended', 'activation', 
                           #'customer', 'merchant'
                           ],axis=1)

del data_train

##### Serialize for generation of results (without imputation and undersampling)

#### Imbalanced UnderSampling: 
Choice of undersampling technique: Random Undersampling\
https://imbalanced-learn.org/stable/under_sampling.html

In [8]:
#   EditedN
##  enn = EditedNearestNeighbours()
#   X_resampled, y_resampled = enn.fit_resample(X_train, y_train)

##  renn = RepeatedEditedNearestNeighbours()
#   X_resampled, y_resampled = renn.fit_resample(X_train, y_train)
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

class_count = y_resampled.value_counts()
X_resampled.shape, class_count

((134120, 61),
 activation
 0    67060
 1    67060
 Name: count, dtype: int64)

In [9]:
knn_imputer = KNNImputer(n_neighbors=5)  # Set the number of neighbors as per your dataset
X_imputed = knn_imputer.fit_transform(X_resampled)

In [11]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y_resampled, test_size=0.2, random_state=42)

In [10]:
nan_count = np.isnan(X_imputed).sum()
print("Number of NaN values in the array: ", nan_count)

Number of NaN values in the array:  0


#### Serialize for training data use

In [18]:
header = ['customer_digital_activity_04',
          'customer_spend_01',
          'customer_industry_spend_01',
          'customer_industry_spend_02',
          'customer_industry_spend_03',
          'customer_industry_spend_04',
          'customer_industry_spend_05',
          'customer_spend_02',
          'customer_spend_03',
          'customer_spend_04', 
          'customer_spend_05',
          'customer_spend_06', 
          'customer_spend_07',
          'merchant_spend_01',
          'merchant_spend_02', 
          'merchant_spend_03', 
          'merchant_spend_04', 
          'merchant_spend_05', 
          'merchant_spend_06', 
          'merchant_spend_07',  
          'merchant_spend_08', 
          'merchant_profile_01',
          'customer_merchant_03',
          'customer_profile_01',
           'customer_profile_02',
           'customer_digital_activity_05',
           'customer_spend_13',
           'customer_digital_activity_06',
           'customer_spend_14',
           'customer_digital_activity_10',
           'customer_digital_activity_11',
           'customer_digital_activity_12',
           'customer_digital_activity_13',
           'customer_digital_activity_14',
           'customer_digital_activity_15',
           'customer_spend_15',
           'customer_digital_activity_16',
           'customer_spend_16',
           'customer_spend_17',
           'customer_digital_activity_17',
           'customer_digital_activity_03',
           'merchant_spend_11',
           'customer_digital_activity_19',
           'distance_01',
           'customer_digital_activity_20',
           'distance_02',
           'distance_03',
           'customer_spend_18',
           'customer_spend_19',
           'customer_digital_activity_21',
           'customer_digital_activity_22',
           'distance_04',
           'merchant_profile_02',
           'merchant_spend_09',
           'merchant_profile_03',
           'merchant_spend_10',
           'customer_profile_03',
           'customer_profile_04',
           'distance_05'
]
X_train = pd.DataFrame(X_train, columns = header)
X_test = pd.DataFrame(X_test, columns = header)

#customer_merchant_01, customer_merchant_02, 
#customer_digital_activity_01, 02, 07, 08, 09, 18

In [67]:
# serialize the train and test data ready for continued use training models 
data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
with open('amex_training.pickle', 'wb') as file:
    pickle.dump(data, file)

In [3]:
# load the data (run this everytime the kernel is restarted)
with open('amex_training.pickle', 'rb') as file:
    data = pickle.load(file)
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']
   

In [None]:
# Assuming X_resampled is the undersampled feature matrix

# Define the parameter grid for number of neighbors
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}  

# Instantiate the KNNImputer
knn_imputer = KNNImputer()

# Perform grid search to find the optimal number of neighbors
grid_search = GridSearchCV(knn_imputer, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(X_resampled, y_resampled)

# Best number of neighbors
best_neighbors = grid_search.best_params_['n_neighbors']
print(f"Optimal number of neighbors: {best_neighbors}")

In [19]:
## Standardscaler for distance-based algorithm
def scale_features(X_train, X_test):
    sc = StandardScaler()
    X_train_ss = sc.fit_transform(X_train)
    X_test_ss = sc.transform(X_test)
    return X_train_ss, X_test_ss
X_train_ss, X_test_ss = scale_features(X_train, X_test)

X_train_ss = pd.DataFrame(X_train_ss, columns = header)
X_test_ss = pd.DataFrame(X_test_ss, columns = header)

### Binary Classifier

#### Random Forest

In [73]:
## RandomForest Classifier
RF = RandomForestClassifier(random_state=42)
RF.fit(X_train, y_train)
y_pred_test_rf = cross_val_predict(RF, X_test, y_test, cv=5, method='predict')
roc_auc_score(y_test, y_pred_test_rf), confusion_matrix(y_test, y_pred_test_rf)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


(0.8161630505467493,
 array([[10963,  2539],
        [ 2393, 10929]], dtype=int64))

#### SVC

In [15]:
# SVC: non standardized
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
y_pred_test_svm = cross_val_predict(svm, X_test, y_test, cv=5, method = "predict")
roc_auc_score(y_test, y_pred_test_svm), confusion_matrix(y_test, y_pred_test_svm)


(0.7036878454522221,
 array([[9597, 3905],
        [4042, 9280]], dtype=int64))

In [7]:
# SVC: distanced based so we use X_train_ss
svm = SVC(random_state=42)
svm.fit(X_train_ss, y_train)
y_pred_test_svm = cross_val_predict(svm, X_test_ss, y_test, cv=5, method = "predict")
roc_auc_score(y_test, y_pred_test_svm), confusion_matrix(y_test, y_pred_test_svm)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(0.8009846289654308,
 array([[10694,  2808],
        [ 2532, 10790]], dtype=int64))

#### KNN

In [None]:
# KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5) 
knn.fit(X_train, y_train)  
y_pred_test_knn = cross_val_predict(knn, X_test, y_test, cv=5, method='predict')
roc_auc_score(y_test, y_pred_test_knn), confusion_matrix(y_test, y_pred_test_knn)

(0.654216762295648,
 array([[9155, 4347],
        [4924, 8398]], dtype=int64))

In [22]:
# KNN Classifier: Standardized
knn = KNeighborsClassifier(n_neighbors=5) 
knn.fit(X_train_ss, y_train)  
y_pred_test_knn = cross_val_predict(knn, X_test_ss, y_test, cv=5, method='predict')
roc_auc_score(y_test, y_pred_test_knn), confusion_matrix(y_test, y_pred_test_knn)

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


(0.7315345265368616,
 array([[9751, 3751],
        [3452, 9870]], dtype=int64))

#### Ensemble: Voting

In [9]:
# Voting Classifier with all exisiting classifier - hard voting
voting_clf = VotingClassifier(
    estimators=[
        #('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=5))
    ]
)
voting_clf.fit(X_train_ss, y_train)

for name, clf in voting_clf.named_estimators_.items():
    print (name, "=", clf.score(X_test_ss, y_test))
voting_clf.score(X_test_ss, y_test)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


rf = 0.8215404115717269
svc = 0.8095735162541008
knn = 0.7458246346555324


0.8136743215031316

In [10]:
# Soft voting
voting_clf.voting = "soft"
voting_clf.named_estimators["svc"].probability = True
voting_clf.fit(X_train_ss, y_train)
voting_clf.score(X_test_ss, y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


0.81080375782881

#### RandomForest GridSearch

In [40]:
# Load the dataset and split into features and target variable
# Assuming X and y are the features and target variable

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# Instantiate the Random Forest Classifier
rf = RandomForestClassifier()

# Perform Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Retrieve the Best Parameters
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Evaluate the Model
y_pred = best_estimator.predict(X_test)
# Additional evaluation metrics and analysis

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [8]:
best_params, best_estimator

({'criterion': 'gini',
  'max_depth': None,
  'min_samples_split': 5,
  'n_estimators': 300},
 RandomForestClassifier(min_samples_split=5, n_estimators=300))

In [9]:
# serialize the best y, estimator, params ready for continued use training models 
params = {'y_pred': y_pred, 'best_estimator': best_estimator, 'best_params': best_params}
with open('amex_GridsearchRF.pickle', 'wb') as file:
    pickle.dump(params, file)

In [6]:
# RF: Best y, parameter, and estimator
roc_auc_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

(0.823046399171187,
 array([[10933,  2569],
        [ 2180, 11142]], dtype=int64))

#### SVC GridSearchCV

In [None]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
}

# Instantiate the Support Vector Classifier (SVC)
svc = SVC()

# Perform Grid Search
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_ss, y_train)

# Retrieve the Best Parameters
best_params_svc = grid_search.best_params_
best_estimator_svc = grid_search.best_estimator_

# Evaluate the Model
y_pred_svc = best_estimator_svc.predict(X_test_ss)
# Additional evaluation metrics and analysis
roc_auc_score(y_test, y_pred_svc), confusion_matrix(y_test, y_pred_svc), best_params_svc, best_estimator_svc


### Neural Network

#### MLP Neural Network 

##### Non-Standardized

In [12]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline

mlp_clf = MLPClassifier(hidden_layer_sizes=[5], max_iter=10_000,
                        random_state=42)
pipeline = make_pipeline(StandardScaler(), mlp_clf)
pipeline.fit(X_train, y_train)
accuracy = pipeline.score(X_test, y_test)
accuracy

  y = column_or_1d(y, warn=True)


0.8118475991649269

##### Standardized

In [21]:
mlp_clf = MLPClassifier(hidden_layer_sizes=[5], max_iter=10_000,
                        random_state=42)
pipeline = make_pipeline(StandardScaler(), mlp_clf)
pipeline.fit(X_train_ss, y_train)
accuracy = pipeline.score(X_test_ss, y_test)
accuracy

  y = column_or_1d(y, warn=True)


0.8118475991649269

#### Keras Tuner network

In [2]:
import keras_tuner as kt

def build_model(hp):
    n_hidden = hp.Int("n_hidden", min_value=0, max_value=30, default=2)
    n_neurons = hp.Int("n_neurons", min_value=16, max_value=800)
    learning_rate = hp.Float("learning_rate", min_value=1e-4, max_value=1e-2,
                             sampling="log")
    optimizer = hp.Choice("optimizer", values=["sgd", "adam"])
    if optimizer == "sgd":
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Flatten())
    for _ in range(n_hidden):
        model.add(tf.keras.layers.Dense(n_neurons, activation="relu"))
    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer=optimizer,
                  metrics=["accuracy"])
    return model

In [None]:
random_search_tuner = kt.RandomSearch(
    build_model, objective="val_accuracy", max_trials=5, overwrite=True,
    directory="Amex Hackathon", project_name="recommendation", seed=42)
random_search_tuner.search(X_train_ss, y_train, epochs=100,
                           validation_data=(X_test_ss, y_test))

In [None]:
top3_models = random_search_tuner.get_best_models(num_models=3)
best_model = top3_models[0]

In [None]:
top3_params = random_search_tuner.get_best_hyperparameters(num_trials=3)
top3_params[0].values

In [None]:
best_trial = random_search_tuner.oracle.get_best_trials(num_trials=1)[0]
best_trial.summary()

In [43]:
normalization_layer = tf.keras.layers.Normalization()
hidden_layer1 = tf.keras.layers.Dense(30, activation="relu")
hidden_layer2 = tf.keras.layers.Dense(30, activation="relu")
concat_layer = tf.keras.layers.Concatenate()
output_layer = tf.keras.layers.Dense(1, activation="sigmoid")

input_ = tf.keras.layers.Input(shape=X_train.shape[1:])
normalized = normalization_layer(input_)
hidden1 = hidden_layer1(normalized)
hidden2 = hidden_layer2(hidden1)
concat = concat_layer([normalized, hidden2])
output = output_layer(concat)

model_xxx = tf.keras.Model(inputs=[input_], outputs=[output])

In [44]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model_xxx.compile(loss="binary_crossentropy", optimizer=optimizer,
                  metrics=["accuracy"])
normalization_layer.adapt(X_train)
history = model_xxx.fit(X_train, y_train, epochs=20)
score = model_xxx.evaluate(X_test, y_test)


Epoch 1/20
[1m3353/3353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6779 - loss: 3.0317
Epoch 2/20
[1m3353/3353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7195 - loss: 1.5214
Epoch 3/20
[1m3353/3353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.7275 - loss: 0.9006
Epoch 4/20
[1m3353/3353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.7007 - loss: 1.0449
Epoch 5/20
[1m3353/3353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.6791 - loss: 1.0379
Epoch 6/20
[1m3353/3353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7388 - loss: 0.7347
Epoch 7/20
[1m3353/3353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7497 - loss: 0.6611
Epoch 8/20
[1m3353/3353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7661 - loss: 0.6374
Epoch 9/20
[1m3353/3353

### Testing

In [2]:
df_test = pd.read_csv("amex_eval_round1_2.csv")

In [3]:
columns_to_drop = ['customer_merchant_01', 'customer_merchant_02',
                   'customer_digital_activity_01', 
                   'customer_digital_activity_02', 
                   'customer_digital_activity_07', 
                   'customer_digital_activity_08',
                   'customer_digital_activity_09',
                   'customer_digital_activity_18'
                   ]

df_test = df_test.drop(columns_to_drop, axis=1)


In [4]:
customer_ID = df_test['customer']
merchant_ID = df_test['merchant']

In [22]:
data = {'Customer_ID': customer_ID, 'Merchant_ID': merchant_ID}
df = pd.DataFrame(data)

# Write the DataFrame to a new CSV file with headers
df.to_csv('output.csv', index=False)
del customer_ID, merchant_ID

In [6]:
X = df_test.drop(['customer', 'merchant'],axis=1)

del df_test

In [None]:
knn_imputer = KNNImputer(n_neighbors=5)  # Set the number of neighbors as per your dataset

X = knn_imputer.fit_transform(X)

nan_count = np.isnan(X).sum()
print("Number of NaN values in the array: ", nan_count)

In [8]:
# serialize the train and test data ready for continued use training models 
data = {'X': X}
with open('amex_submission.pickle', 'wb') as file:
    pickle.dump(data, file)

In [23]:
# load the data (run this everytime the kernel is restarted)
with open('amex_submission.pickle', 'rb') as file:
    data = pickle.load(file)
    X = data['X']

In [15]:
X.columns

Index(['distance_05', 'customer_digital_activity_04', 'customer_spend_01',
       'customer_industry_spend_01', 'customer_industry_spend_02',
       'customer_industry_spend_03', 'customer_industry_spend_04',
       'customer_industry_spend_05', 'customer_spend_02', 'customer_spend_03',
       'customer_spend_04', 'customer_spend_05', 'customer_spend_06',
       'customer_spend_07', 'merchant_spend_01', 'merchant_spend_02',
       'merchant_spend_03', 'merchant_spend_04', 'merchant_spend_05',
       'merchant_spend_06', 'merchant_spend_07', 'merchant_spend_08',
       'merchant_profile_01', 'customer_merchant_03', 'customer_profile_01',
       'customer_profile_02', 'customer_digital_activity_05',
       'customer_spend_13', 'customer_digital_activity_06',
       'customer_spend_14', 'customer_digital_activity_10',
       'customer_digital_activity_11', 'customer_digital_activity_12',
       'customer_digital_activity_13', 'customer_digital_activity_14',
       'customer_digital_activi

In [24]:
X.head

<bound method NDFrame.head of           distance_05  customer_digital_activity_04  customer_spend_01  \
0            1.621171                           NaN         112.334000   
1            2.441944                           NaN         112.334000   
2            2.438082                           NaN         112.334000   
3            2.072182                           NaN         112.334000   
4            2.380853                           NaN         302.792500   
...               ...                           ...                ...   
12604595    17.082029                           NaN         631.800000   
12604596    25.099151                           NaN          49.490000   
12604597     8.815975                           NaN         494.520000   
12604598    16.690912                           NaN          78.063171   
12604599    24.558746                           NaN          85.592927   

          customer_industry_spend_01  customer_industry_spend_02  \
0            

In [26]:
## The feature sequence of evaluation data differs from training set, we rearrange
column_order = X_train.columns.tolist()

# Reorder the columns of X_test to match the order of columns in X_train
X = X[column_order]


In [29]:
## RandomForest Classifier
best_params_rf = {'criterion': 'gini',
                    'max_depth': None,
                    'min_samples_split': 5,
                    'n_estimators': 300}

rf_test = RandomForestClassifier(**best_params_rf)
rf_test.fit(X_train, y_train)
y_pred_test_rf = rf_test.predict(X)


  return fit_method(estimator, *args, **kwargs)


In [30]:
y_pred_test_rf

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [32]:
y_pred_proba_test_rf = rf_test.predict_proba(X)

In [36]:
y_pred_proba_test_rf[:, 1:]

array([[0.54453175],
       [0.61021032],
       [0.55177778],
       ...,
       [0.55161111],
       [0.52939562],
       [0.49080592]])

In [39]:
## Generation of results
df_output = pd.read_csv('output.csv')
#header_output = ['customer', 'merchant', 'predicted_score']


predicted_scores = y_pred_proba_test_rf[:, 1:]
df_output['predicted_score'] = predicted_scores

# Rename the existing columns
df_output.rename(columns={'Customer_ID': 'customer', 'Merchant_ID': 'merchant'}, inplace=True)

# Write the modified DataFrame to a new CSV file
df_output.to_csv('ZXXX.csv', index=False)
