##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [None]:
%pip install pandas 
%pip install matplotlib
%pip install scikit-learn
%pip install imblearn
%pip install pyarrow
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [1]:
# Can have as many cells as you want for code
import os

import pandas as pd

# The initialised filepath MUST be a relative path to a folder named data that contains
# the parquet file (the template forbids changing this value).
filepath = "./data/catB_train.parquet"

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [2]:
###...code...###
# pd.show_versions()  # uncomment to print library versions when debugging the environment
# Load the training data from the relative path defined in the previous cell.
df = pd.read_parquet(filepath)

# A missing target is treated as the negative class (0).
# NOTE(review): a truncated comment here originally read "can only use 710 values to" --
# presumably the number of positive labels available; confirm.
df['f_purchase_lh'] = df['f_purchase_lh'].fillna(0)

# Preview goes last: a bare expression is only rendered when it is the final line of a cell.
df.head()

In [3]:
def _drop_constant_columns(frame):
    """Return ``frame`` restricted to the columns that hold more than one distinct value.

    NaN is counted as a distinct value (``dropna=False``), which matches the
    ``len(frame[col].unique()) > 1`` test this helper replaces.
    """
    return frame.loc[:, frame.nunique(dropna=False) > 1]


# Columns removed by name (identifier, country description, returned-mail flag,
# contact-consent flags and two date columns).
UNWANTED_COLUMNS = [
    "clntnum", "ctrycode_desc", "flg_is_returned_mail",
    "is_consent_to_mail", "is_consent_to_email", "is_consent_to_call", "is_consent_to_sms",
    "min_occ_date", "cltdob_fix",
]
# Any column whose name contains one of these substrings is removed as well.
unwanted_pattern = ['ape_', 'sumins_', 'prempaid_']

# 1. Drop columns that carry a single value (they cannot help a classifier).
df = _drop_constant_columns(df)

# 2. Drop rows where the flag is NaN.
df = df[df['flg_substandard'].notna()]

# 3. Drop the unwanted columns. errors="ignore" means a column that is already absent
#    (e.g. because this cell was run before) does not raise a KeyError.
df = df.drop(columns=UNWANTED_COLUMNS, errors="ignore")
unwanted_cols = [col for col in df.columns if any(pattern in col for pattern in unwanted_pattern)]
df = df.drop(columns=unwanted_cols)
print(df.shape)

# 4. The row filter above can leave further columns with a single value; drop those too.
df = _drop_constant_columns(df)

(16978, 141)


In [4]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample

TARGET_COL = 'f_purchase_lh'
SEED = 42

# --- Impute numeric features -------------------------------------------------------------
# The target is excluded so the label does not influence the neighbour search used to
# fill in feature values.
# NOTE(review): the imputer (and the encoder below) are still fitted on all rows before the
# train/test split, so X_test is not fully independent of the fit -- consider fitting on
# X_train only.
numeric_cols = df.select_dtypes(include=['number']).columns.drop(TARGET_COL, errors='ignore')
knn_imputer = KNNImputer(n_neighbors=5)
df[numeric_cols] = knn_imputer.fit_transform(df[numeric_cols])

# Numeric NaNs are now filled; any column that still contains NaN is dropped outright.
print(df.shape)
df = df.dropna(axis=1)
print(df.shape)

# --- One-hot encode the remaining categorical columns -----------------------------------
categorical_cols = df.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_category = pd.DataFrame(
    encoder.fit_transform(df[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    # Reuse df's index: rows were filtered in an earlier cell, so a fresh RangeIndex would
    # make the column-wise concat below align on the wrong labels and introduce NaN rows.
    index=df.index,
)

# Concatenate the encoded categories with the non-categorical columns (row-aligned).
df_encoded = pd.concat([df.drop(categorical_cols, axis=1), encoded_category], axis=1)
assert len(df_encoded) == len(df), "one-hot columns are not row-aligned with df"

# Separate majority (label=0) and minority (label=1) classes to report the imbalance.
majority_class = df_encoded[df_encoded[TARGET_COL] == 0]
minority_class = df_encoded[df_encoded[TARGET_COL] == 1]
print(len(majority_class))

# --- Features / target -------------------------------------------------------------------
X = df_encoded.drop(TARGET_COL, axis=1)
y = df_encoded[TARGET_COL]
assert not X.isna().any().any(), "unexpected NaN left in the feature matrix"

# --- Split, then rebalance the training portion only ------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Down-sample the majority class first (minority:majority = 0.5), then SMOTE the minority
# up to 0.75. Both steps are seeded so the resampled set is reproducible.
pipeline = Pipeline([
    ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=SEED)),
    ('over', SMOTE(sampling_strategy=0.75, random_state=SEED)),
])
X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)

(16978, 140)
(16978, 107)
16278


[WinError 2] The system cannot find the file specified
  File "c:\Users\user\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\user\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 501, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\user\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 969, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\user\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1438, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


In [5]:
# Install pycaret; the next cell imports its classification module.
%pip install pycaret

In [None]:
# Import only the pycaret names this notebook uses (instead of `import *`) so it is clear
# where setup / compare_models / create_model / predict_model come from.
from pycaret.classification import compare_models, create_model, predict_model, setup

# Initialise the pycaret experiment on the cleaned frame: 'f_purchase_lh' is the label,
# 80% of the rows are used for training, and session_id fixes the random seed.
s = setup(df, target='f_purchase_lh', session_id=123, train_size=0.8)

In [None]:
# Score pycaret's candidate classifiers on the experiment configured by setup() above.
# The return value is not stored; this cell is run only for the comparison it displays.
compare_models()

In [None]:
# Train the estimator registered under pycaret's 'lr' ID (logistic regression) and keep it
# as `lr`; testing_hidden_data() below predicts with this object.
# NOTE(review): presumably 'lr' was picked from the compare_models() ranking -- confirm.
lr = create_model('lr', verbose=False)

## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [14]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''
    result = [] 
    result = predict_model(lr, data = hidden_data)
    return result

##### Cell to check testing_hidden_data function

In [None]:
# This cell should output a list of predictions.
# Re-read the training file and strip the target column so the frame resembles unseen data,
# then print whatever testing_hidden_data returns for it.
test_df = pd.read_parquet(filepath).drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!