## Preprocessing (pipeline)

In [None]:
import pandas as pd
import numpy as np
from sklearn import set_config
set_config(transform_output = "pandas")

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from category_encoders import WOEEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

#### Load the data

In [None]:
data_train = pd.read_csv('../datasets/train.csv')
data_test = pd.read_csv('../datasets/test.csv')

pd.set_option('display.max_columns', None)

In [None]:
total_samples = data_train.shape[0] + data_test.shape[0]
train_distribution_percentage = (data_train.shape[0] / total_samples) * 100
test_distribution_percentage = (data_test.shape[0] / total_samples) * 100

print(f"Training Set Distribution: {train_distribution_percentage:.2f}% ({data_train.shape[0]} rows)")
print(f"Testing Set Distribution: {test_distribution_percentage:.2f}% ({data_test.shape[0]} rows)")

### Pre-processing ideas
- missing values: drop or impute? Maybe just do median imputation because there’s so little
- imbalance in target (15% vs 85%): use stratified CV! Evaluate with proper metrics! Use ensemble of models! Data augmentation (e.g. undersampling or SMOTE) or using class weights? 
- gender: one-hot encoding (binary indicator 1/0)
- tariff: weights of evidence or one-hot encoding (ordinality or not?)
- handset: WOE
- Usage_Band: ordinal so take this into account but also WOE maybe --> woe instead
- tariff_OK, high dropped calls and No Usage might be very uninformative because extremely imbalanced – if we use: one-hot encoding for all (change tariff_OK values to OK vs High, so regrouping the High CAT 100, High CAT 50 and High Play 100)
- for numerical ones i'm not sure, maybe some form of outlier detection and potentially some WOE


### **missing for now: outlier detection**
### **also look into this encoder for categorical variables: from category_encoders.cat_boost import CatBoostEncoder**

Random Forests, being an ensemble of decision trees, are generally not sensitive to the scale of numeric features. The reason is that decision trees make splits based on feature values but do not rely on the absolute scale of those values. Therefore, in many cases, scaling is not a strict requirement when using Random Forests. --> no standardization for now so we keep interpretability

### Modeling
- we get (1) a labeled dataset (train.csv) and (2) an unlabeled dataset (test.csv)
- split train.csv into a train and test set
- that train set, u should split into train and validation sets (stratified CV split because imbalance)
- that test set has labels, so u can compare the predictions on X_test, y_test with the labels to evaluate performance of the different models **NOTE: to fit a model on the test set that is coming from train.csv, u need to pass the tuned values of the hyperparameters (tuned on the validation set)**
- choose the best performing model 
- then make predictions on test.csv (unlabeled) and export to a csv file which you upload to the website

 note: after finding the optimal parameters, put the values in the pipeline (paramters of RandomForestRegressor)

### Other ideas 
- change objective function? to account for top 20 evaluation metric?
- use proftree? proflogit?

#### Missing values

In [None]:
# For training data
missing_count = data_train.isnull().sum() 
missing_data = pd.DataFrame({'Column Name': missing_count.index, 'Missing Count': missing_count.values})
missing_data = missing_data.sort_values(by='Missing Count', ascending=False)
missing_data = missing_data[missing_data['Missing Count'] > 0]
missing_data

In [None]:
rows_with_missing_values_train = data_train[data_train.isnull().any(axis=1)]
print("Rows with Missing Values in training data:")
rows_with_missing_values_train

In [None]:
# For test data
missing_count = data_test.isnull().sum() 
missing_data = pd.DataFrame({'Column Name': missing_count.index, 'Missing Count': missing_count.values})
missing_data = missing_data.sort_values(by='Missing Count', ascending=False)
missing_data = missing_data[missing_data['Missing Count'] > 0]
missing_data

In [None]:
rows_with_missing_values_test = data_test[data_test.isnull().any(axis=1)]
print("\nRows with Missing Values in test data:")
rows_with_missing_values_test

**we will impute this since it's so little rows**

In [None]:
#data_train = data_train.dropna()
#data_test = data_test.dropna()

FOR NOW I AM DROPPING BC ELSE GOT ERRORS FOR MY PIPELINE!

In [None]:
target_column = 'target'

# Separate features and target variable
X_train = data_train.drop(target_column, axis=1)
y_train = data_train[target_column]

X_test = data_test#.drop(target_column, axis=1)
#y_test = data_test[target_column]

don't know if this should be done after splitting or not

In [None]:
def process_date_column(data, date_column):
    # Convert the date column to datetime format
    data[date_column] = pd.to_datetime(data[date_column], format='%d/%m/%y')

    # Find the earliest date
    earliest_date = data[date_column].min()

    # Convert the date column to days since the earliest date
    data[date_column] = (data[date_column] - earliest_date).dt.days

    return data

X_train = process_date_column(X_train, 'Connect_Date')
X_test = process_date_column(X_test, 'Connect_Date')

#### Check the correlation

#### Split data into train and validation set -- should i use train test split instead? im confused

target variable is binary and imbalanced (with the minority class having a frequency of 15%), so using a stratified splitting approach is recommended to ensure that both the training and validation sets have a similar distribution of the target variable.

In [None]:
stratified_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, valid_index in stratified_splitter.split(X_train, y_train):
    X_train_split, X_valid_split = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]
    # Now you can use X_train_split, y_train_split for training and X_valid_split, y_valid_split for validation

In [None]:
total_train_samples = X_train_split.shape[0] + X_valid_split.shape[0]
train_distribution_percentage = (X_train_split.shape[0]/ total_train_samples) * 100
validation_distribution_percentage = (X_valid_split.shape[0] / total_train_samples) * 100

print(f"Training Set Distribution: {train_distribution_percentage:.2f}% ({X_train_split.shape[0]} rows)")
print(f"Validation Set Distribution: {validation_distribution_percentage:.2f}% ({X_valid_split.shape[0]} rows)")

#### Pipeline

In [None]:
X_train_split['Tariff_OK'] = np.where(X_train_split['Tariff_OK'] == 'OK', 1, 0)
X_valid_split['Tariff_OK'] = np.where(X_valid_split['Tariff_OK'] == 'OK', 1, 0)
X_test['Tariff_OK'] = np.where(X_test['Tariff_OK'] == 'OK', 1, 0)

In [None]:
X_train_split.head()

In [None]:
# Assuming 'id' is the name of the column
# Convert 'id' column to sets to get unique values
X_train_ids = set(X_train_split['id'])
y_train_ids = set(y_train_split.index)

# Check for overlapping values
overlapping_ids = X_train_ids.intersection(y_train_ids)

if overlapping_ids:
    print("There are overlapping values for the 'id' column between X_train_split and y_train_split.")
    print("Overlapping IDs:", overlapping_ids)
else:
    print("There are no overlapping values for the 'id' column between X_train_split and y_train_split. This is how it should be.")

In [None]:
# Custom transformer to remove prefix from column names
class RemovePrefixTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, prefixes):
        self.prefixes = prefixes

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for prefix in self.prefixes:
            X.columns = [col.split(f'{prefix}__')[1] if f'{prefix}__' in col else col for col in X.columns]
        return X

If you want to apply both mode imputation and ordinal encoding to the 'Usage_Band' column, you can achieve this by creating a custom transformer using scikit-learn's FunctionTransformer. Here's how you can do it:

In [None]:
'''from sklearn.preprocessing import FunctionTransformer

# Define a function for mode imputation
def mode_imputation(data):
    mode_value = data.mode().iloc[0]  # Calculate the mode
    return data.fillna(mode_value)    # Fill missing values with the mode

# Define a custom transformer for mode imputation
mode_imputer = FunctionTransformer(mode_imputation)'''

In [None]:
print(X_train_split['Usage_Band'].unique())

you can handle them explicitly before preprocessing, for example, by replacing them with the most frequent category using fillna()

In [None]:
X_train_split = X_train_split.copy()
X_train_split['Usage_Band'] = X_train_split['Usage_Band'].fillna(X_train_split['Usage_Band'].mode()[0])
print(X_train_split['Usage_Band'].unique())

In [None]:
y_train_split = y_train_split.fillna(y_train_split.mode()[0])
y_valid_split = y_valid_split.fillna(y_train_split.mode()[0])

In [None]:
X_valid_split = X_valid_split.copy()

# Handle missing values in 'Usage_Band' for X_validation_split
X_valid_split['Usage_Band'] = X_valid_split['Usage_Band'].fillna(X_train_split['Usage_Band'].mode()[0])

# Handle missing values in 'Dropped_calls_ratio' by filling with the median
X_train_split['Dropped_calls_ratio'] = X_train_split['Dropped_calls_ratio'].fillna(X_train_split['Dropped_calls_ratio'].median())

# Handle missing values in 'call_cost_per_min' by filling with the median
X_train_split['call_cost_per_min'] = X_train_split['call_cost_per_min'].fillna(X_train_split['call_cost_per_min'].median())

In [None]:
X_test = X_test.copy()
X_test['Usage_Band'] = X_test['Usage_Band'].fillna(X_train_split['Usage_Band'].mode()[0])
print(X_test['Usage_Band'].unique())

# Handle missing values in 'Dropped_calls_ratio' by filling with the median
X_test['Dropped_calls_ratio'] = X_test['Dropped_calls_ratio'].fillna(X_train_split['Dropped_calls_ratio'].median())

# Handle missing values in 'call_cost_per_min' by filling with the median
X_test['call_cost_per_min'] = X_test['call_cost_per_min'].fillna(X_train_split['call_cost_per_min'].median())

In [None]:
X_train_split.head()

In [None]:
# Define columns to drop
columns_to_drop = ['id']  # Drop because it's not numerical, later on add it back to know which prediction corresponds to which individual

# Define columns for different encoding methods
one_hot_encode_columns = ['Gender', 'high Dropped calls', 'No Usage']
woe_encode_columns = ['tariff', 'Handset', 'Usage_Band'] #ipv ordinal endoding
ordinal_encode_columns = ['Usage_Band']
impute_num = ['Dropped_calls_ratio', 'call_cost_per_min']
impute_cat = ['Usage_Band']
#numeric_columns = X_train_split.select_dtypes(include=['int64', 'float64']).columns
#categorical_columns = X_train_split.select_dtypes(include=['object']).columns
#categorical_columns = [col for col in categorical_columns if col != 'id']

# Define the preprocessing steps for each column
preprocessor = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', columns_to_drop),
        ('impute_median', SimpleImputer(strategy='median'), impute_num),
        #('impute_mode', SimpleImputer(strategy='most_frequent'), impute_cat),
        ('one_hot_encode', OneHotEncoder(drop='first', sparse_output=False), one_hot_encode_columns),
        ('WOE_encode', WOEEncoder(), woe_encode_columns),
        #('ordinal_encode', OrdinalEncoder(categories=[['Low', 'MedLow', 'Med', 'MedHigh', 'High']]), ['Usage_Band']) #ordinal_encode_columns
    ],
    remainder='passthrough'  # Keep the remaining columns as they are
)

# Build the preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('remove_prefix', RemovePrefixTransformer(prefixes=['impute_median', 'one_hot_encode', 'WOE_encode', 'remainder']))  #'ordinal_encode' # Add this step to remove the prefix
])

# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42) #class_weight='balanced' - but gives same AUC on validation set

lgb_classifier = lgb.LGBMClassifier(is_unbalance=True)
#lgb_classifier = lgb.LGBMClassifier(scale_pos_weight=(1 - y_train_split.sum() / len(y_train_split)))

xgb_classifier= xgb.XGBClassifier(scale_pos_weight=(1 - y_train_split.sum() / len(y_train_split)))

# Build the full pipeline with preprocessing and model
rf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', rf_classifier)
])

lgb_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', lgb_classifier)
])

xgb_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', xgb_classifier)
])

repeat last 2 code lines for the other 2 models

In [None]:
# Define evaluation metrics
def profit_at_top_20(y_true, y_probabilities, top_k=20):
    # Extract probabilities for positive class
    churn_probabilities = y_probabilities[:, 1]

    # Sort customers by predicted probabilities in descending order
    sorted_indices = sorted(range(len(churn_probabilities)), key=lambda k: churn_probabilities[k], reverse=True)

    # Identify the top-20 customers
    top_20_indices = sorted_indices[:top_k]

    # Calculate profit at top-20
    profit = sum(y_true[i] * churn_probabilities[i] for i in top_20_indices)

    return profit

# Define custom scorer for use in GridSearchCV or RandomizedSearchCV
profit_at_top_20_scorer = make_scorer(profit_at_top_20, greater_is_better=True)

In [None]:
# Assuming X_train_split is your training data
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train_split, y_train_split)
X_train_preprocessed

In [None]:
unique_dtypes = X_train_preprocessed.dtypes.unique()

print("Unique data types:")
print(unique_dtypes)

only numerical so that's good

In [None]:
# Show rows with missing values in the 'Usage_Band' column
missing_values = X_train_preprocessed[X_train_preprocessed.isnull().any(axis=1)]
missing_values

used a separate custom classifier to keep the names as original variables

#### Random Forest

In [None]:
''' 
# Create GridSearchCV instance
grid_search = GridSearchCV(full_pipeline, param_grid, scoring={'profit_at_top_20': profit_at_top_20_scorer, 'auc': 'roc_auc'},
                           refit='profit_at_top_20', cv=5, verbose=2, n_jobs=-1)

# Fit the GridSearchCV on training data
grid_search.fit(X_train_split, y_train_split)
'''

In [None]:
# Build parameter grid for hyperparameter tuning
rf_param_grid = {
    'model__n_estimators': [150, 160, 170, 180, 190, 200],
    'model__max_depth': [1, 2, 3, 4, 5], # maximum number of levels allowed in each decision tree
    'model__min_samples_split': [2, 4, 6, 8, 10],
    'model__min_samples_leaf': [1, 2, 3, 4]
}

# Create GridSearchCV instance
rf_grid_search = GridSearchCV(rf_pipeline, rf_param_grid, scoring={'auc': 'roc_auc'}, refit='auc', cv=5, verbose=2, n_jobs=1)

# Fit the GridSearchCV on training data
rf_grid_search.fit(X_train_split, y_train_split)

In [None]:
# Get the best model from the grid search
best_model = rf_grid_search.best_estimator_

# Evaluate on the validation set
y_valid_probabilities = best_model.predict_proba(X_valid_split)
#profit_at_top_20_score = profit_at_top_20(y_valid_split, y_valid_probabilities)
auc_score = roc_auc_score(y_valid_split, y_valid_probabilities[:, 1])

#print(f'Profit at Top-20: {profit_at_top_20_score}')
print(f'AUC on Validation Set: {auc_score}')

# Access the best hyperparameters
best_hyperparameters_RF = rf_grid_search.best_params_
print(f'Best Hyperparameters: {best_hyperparameters_RF}')

# Evaluate on the test set
y_test_probabilities = best_model.predict_proba(X_test)
'''
auc_score_test = roc_auc_score(y_test, y_test_probabilities[:, 1])
print(f'AUC on Test Set: {auc_score_test}')
'''

In [None]:
# Access the LGBM model from the pipeline
best_rf_model = rf_grid_search.best_estimator_.named_steps['model']

# Get feature importances from the LGBM model
feature_importances = best_rf_model.feature_importances_

# Map feature names to their importance scores
feature_names = X_train.columns  # Replace with your actual feature names
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Sort features based on their importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print or visualize the feature importance
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {float('{:.2f}'.format(importance))}")

#### LGB

In [192]:
# Create parameter grid for LightGBM hyperparameter tuning
lgb_param_grid = {
    'model__n_estimators': [150, 160, 170, 180, 190, 200],
    'model__max_depth': [1, 2, 3, 4, 5],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
}

X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train_split, y_train_split)

lgb_classifier = lgb.LGBMClassifier(is_unbalance=True)

# Create GridSearchCV instance for LightGBM
lgb_grid_search = GridSearchCV(lgb_classifier, lgb_param_grid, scoring={'auc': 'roc_auc'}, refit='auc', verbose=0, cv=5, n_jobs=-1)

# Fit the GridSearchCV on training data for LightGBM
lgb_grid_search.fit(X_train_preprocessed, y_train_split)

[LightGBM] [Info] Number of positive: 596, number of negative: 3440
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7077
[LightGBM] [Info] Number of data points in the train set: 4036, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.147671 -> initscore=-1.752986
[LightGBM] [Info] Start training from score -1.752986


In [None]:
# Create parameter grid for LightGBM hyperparameter tuning
lgb_param_grid = {
    'model__n_estimators': [150, 160, 170, 180, 190, 200],
    'model__max_depth': [1, 2, 3, 4, 5],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
}

# Create GridSearchCV instance for LightGBM
lgb_grid_search = GridSearchCV(lgb_pipeline, lgb_param_grid, scoring={'auc': 'roc_auc'}, refit='auc', verbose=0, cv=5, n_jobs=1)

# Fit the GridSearchCV on training data for LightGBM
lgb_grid_search.fit(X_train_split, y_train_split)

Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31)

is_unbalance=True, learning_rate=0.05, max_depth=4,
               n_estimators=160) (verw missing values)

met nog ordinal : (is_unbalance=True, learning_rate=0.01, max_depth=1,
               n_estimators=150)

In [193]:
lgb_grid_search.best_estimator_ #has the optimal hyperparameters

In [194]:
print("Best Score:", lgb_grid_search.best_score_)
print("Best Parameters:", lgb_grid_search.best_params_)

Best Score: 0.9374601613901374
Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 1, 'model__n_estimators': 150}


hieronder overal X_train_split vervangen door X_train_preprocessed

In [196]:
# Get the best LightGBM model from the grid search
best_lgb_model = lgb_grid_search.best_estimator_

# Fit the best model on the training data
best_lgb_model.fit(X_train_preprocessed, y_train_split) #X_train_split

[LightGBM] [Info] Number of positive: 596, number of negative: 3440
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001715 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7077
[LightGBM] [Info] Number of data points in the train set: 4036, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.147671 -> initscore=-1.752986
[LightGBM] [Info] Start training from score -1.752986


In [199]:
# Apply preprocessing pipeline to the validation set IPV FIT_TRANSFORM GWN TRANSFORM BC INFO VAN TRAINING SET
X_valid_preprocessed = preprocessing_pipeline.transform(X_valid_split)
X_valid_split = X_valid_preprocessed


In [200]:
pred = best_lgb_model.predict(X_valid_split)
# Set the printing options to display all elements of the array
np.set_printoptions(threshold=np.inf)

# Print the entire array of predictions
print(pred)

[0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0
 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0
 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1
 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1
 0 0 0 0 1 0 0 0 0 1 0 0 

In [201]:
# Evaluate on the validation set for LightGBM
y_valid_probabilities_lgb = best_lgb_model.predict_proba(X_valid_split)
y_valid_probabilities_lgb



array([[9.95346907e-01, 4.65309338e-03],
       [9.79702249e-01, 2.02977507e-02],
       [7.78354031e-03, 9.92216460e-01],
       [5.68036832e-02, 9.43196317e-01],
       [9.85857462e-01, 1.41425375e-02],
       [8.27880349e-02, 9.17211965e-01],
       [9.89694557e-01, 1.03054430e-02],
       [9.42332366e-01, 5.76676335e-02],
       [9.96149423e-01, 3.85057741e-03],
       [9.96017706e-01, 3.98229398e-03],
       [8.14044576e-01, 1.85955424e-01],
       [9.77326258e-01, 2.26737417e-02],
       [9.96285152e-01, 3.71484811e-03],
       [2.14965343e-02, 9.78503466e-01],
       [9.74910896e-01, 2.50891045e-02],
       [9.91965416e-01, 8.03458384e-03],
       [9.96558043e-01, 3.44195707e-03],
       [2.09047838e-01, 7.90952162e-01],
       [9.95825760e-01, 4.17423995e-03],
       [7.02144870e-02, 9.29785513e-01],
       [9.70185069e-01, 2.98149306e-02],
       [9.99165222e-01, 8.34778396e-04],
       [9.96848039e-01, 3.15196071e-03],
       [9.92139009e-01, 7.86099090e-03],
       [9.983557

In [None]:
'''# DISABLE THIS CELL TO RUN THE CODE AS IS; USE THIS CELL TO CHECK WHAT THE SCORES ARE FOR THE OLD RESULTS I HAD WHEN I DROPPED NA's:
best_lgb_model.set_params(model__is_unbalance=True,
                          model__learning_rate=0.05,
                          model__max_depth=4,
                          model__n_estimators=160)

# Fit the best model on the training data
best_lgb_model.fit(X_train_split, y_train_split)
'''

In [202]:
# Evaluate on the validation set for LightGBM
y_valid_probabilities_lgb = best_lgb_model.predict_proba(X_valid_split)
auc_score_lgb = roc_auc_score(y_valid_split, y_valid_probabilities_lgb[:, 1])

print(f'AUC for LightGBM on Validation Set: {auc_score_lgb}')

AUC for LightGBM on Validation Set: 0.9501996234110212


In [203]:
# Access the best hyperparameters for LightGBM
best_hyperparameters_LGB = lgb_grid_search.best_params_
print(f'Best Hyperparameters for LightGBM: {best_hyperparameters_LGB}')

Best Hyperparameters for LightGBM: {'model__learning_rate': 0.01, 'model__max_depth': 1, 'model__n_estimators': 150}


In [205]:
# Apply preprocessing pipeline to the validation set
X_test_preprocessed = preprocessing_pipeline.transform(X_test)
X_test = X_test_preprocessed

In [206]:
# Evaluate on the test set for LightGBM
y_test_probabilities_lgb = best_lgb_model.predict_proba(X_test)
'''
auc_score_test_lgb = roc_auc_score(y_test, y_test_probabilities_lgb[:, 1])
print(f'AUC for LightGBM on Test Set: {auc_score_test_lgb}')
'''



"\nauc_score_test_lgb = roc_auc_score(y_test, y_test_probabilities_lgb[:, 1])\nprint(f'AUC for LightGBM on Test Set: {auc_score_test_lgb}')\n"

In [207]:
y_test_probabilities_lgb
y_test_probabilities_lgb = pd.DataFrame(y_test_probabilities_lgb, columns=['PROB_0', 'PROB_1'])
y_test_probabilities_lgb_with_id = pd.concat([data_test['id'], y_test_probabilities_lgb], axis=1)
y_test_probabilities_lgb_with_id

Unnamed: 0,id,PROB_0,PROB_1
0,K751808,0.999400,0.000600
1,K837351,0.916140,0.083860
2,K548114,0.934870,0.065130
3,K736156,0.992434,0.007566
4,K508080,0.995004,0.004996
...,...,...,...
1677,K588314,0.976708,0.023292
1678,K826807,0.910674,0.089326
1679,K982731,0.979874,0.020126
1680,K623037,0.976952,0.023048


In [208]:
result_LGB = y_test_probabilities_lgb_with_id.iloc[:, [0, 2]]
result_LGB.to_csv('result_LGB_3.csv', header=False, index=False)

In [209]:
result_LGB

Unnamed: 0,id,PROB_1
0,K751808,0.000600
1,K837351,0.083860
2,K548114,0.065130
3,K736156,0.007566
4,K508080,0.004996
...,...,...
1677,K588314,0.023292
1678,K826807,0.089326
1679,K982731,0.020126
1680,K623037,0.023048


In [None]:
# Access the LGBM model from the pipeline
best_lgb_model = lgb_grid_search.best_estimator_.named_steps['model']

# Get feature importances from the LGBM model
feature_importances = best_lgb_model.feature_importances_

# Map feature names to their importance scores
feature_names = X_train.columns  # Replace with your actual feature names
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Sort features based on their importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print or visualize the feature importance
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")

#### XGB

In [None]:
# Create parameter grid for XGBoost hyperparameter tuning
xgb_param_grid = {
    'model__n_estimators': [150, 160, 170, 180, 190, 200],
    'model__max_depth': [1, 2, 3, 4, 5],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
}

# Create GridSearchCV instance for XGBoost
xgb_grid_search = GridSearchCV(xgb_pipeline, xgb_param_grid, scoring={'auc': 'roc_auc'}, refit='auc', cv=5, verbose=2, n_jobs=1)

# Fit the GridSearchCV on training data for XGBoost
xgb_grid_search.fit(X_train_split, y_train_split)

In [None]:
cv=KFold(n_splits=10)
search=BayesSearchCV(model,search_spaces=random_grid,n_jobs=-1,cv=cv,n_iter=50, scoring='neg_mean_squared_error')

# Find optimal parameters
search.fit(X_train,y_train)
search.best_score_
search.best_estimator_
search.best_params_


model.fit(X_train,y_train)
pred = model.predict(X_val)
MSE = mean_squared_error(y_val, pred)
RMSE=np.sqrt(MSE)
RMSE
MAE=mean_absolute_error(y_val, pred)
MAE


pred_f = model.predict(df_test)
pred_df = df_test[['property_id']]
pred_df['pred_price'] = pred_f
pred_df.to_csv('pred_rf_pipe2.csv', header=False, index=False)

In [None]:
# Get the best XGBoost model from the grid search
best_xgb_model = xgb_grid_search.best_estimator_

# Evaluate on the validation set for XGBoost
y_valid_probabilities_xgb = best_xgb_model.predict_proba(X_valid_split)
auc_score_xgb = roc_auc_score(y_valid_split, y_valid_probabilities_xgb[:, 1])

print(f'AUC for XGBoost on Validation Set: {auc_score_xgb}')

# Access the best hyperparameters for XGBoost
best_hyperparameters_XGB = xgb_grid_search.best_params_
print(f'Best Hyperparameters for XGBoost: {best_hyperparameters_XGB}')

# Evaluate on the test set for XGBoost
y_test_probabilities_xgb = best_xgb_model.predict_proba(X_test)
'''
auc_score_test_xgb = roc_auc_score(y_test, y_test_probabilities_xgb[:, 1])
print(f'AUC for XGBoost on Test Set: {auc_score_test_xgb}')
'''

In [None]:
# Access the LGBM model from the pipeline
best_xgb_model = xgb_grid_search.best_estimator_.named_steps['model']

# Get feature importances from the LGBM model
feature_importances = best_xgb_model.feature_importances_

# Map feature names to their importance scores
feature_names = X_train.columns  # Replace with your actual feature names
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Sort features based on their importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {float('{:.2f}'.format(importance))}")

## Models

use these:
- RF
- lightgbm
- xgboost

imbalanced data so:
- use stratified CV to ensure that each fold maintains the class distribution
- evaluate with proper metrics (as given by the prof)
- use an ensemble of models (hence the above techniques - but also they are best techniques to work with tabular data)
- possibly do data augmentation with techniques like SMOTE to make the distribution more balanced

also this exists to deal with imbalance:

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(class_weight='balanced')

In [None]:
xgb_model = xgb.XGBClassifier(scale_pos_weight=(1 - y.sum() / len(y)))

In [None]:
# or lgb_model = lgb.LGBMClassifier(is_unbalance=True)
lgb_model = lgb.LGBMClassifier(scale_pos_weight=(1 - y.sum() / len(y)))