In [10]:
import pandas as pd

df = pd.read_csv('/content/mlfin.csv')
# Select columns to keep
keep_cols = df.filter(items=['duration_bins', 'duration_minutes'])
# Drop columns containing the text "duration"

drop_cols = df.filter(like='duration').columns
df = df.drop(drop_cols, axis=1)
df = pd.concat([df, keep_cols], axis=1)

#replacing unknown values with 'other' because unknown is a hot keyword in the label encoder library
df = df.replace('unknown', 'others')

Selecting top features via 3 different methods and then taking top common features out of their results

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def select_features_mutual_info(df, target_col, k):
    """
    This function takes a pandas dataframe, the name of the target column, and the desired number of features to select as input. 
    It performs mutual information-based feature selection on the input data to select the top k features, and returns the list of selected features.
    
    Parameters:
    -----------
    df : pandas DataFrame
        The input data, where each row is an observation and each column is a feature.
    target_col : str
        The name of the column that contains the target variable.
    k : int
        The desired number of features to select.
    
    Returns:
    --------
    selected_features : list
        The list of selected features.
    """
    
    # Separate the feature matrix X and the target variable y
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # Encode categorical variables using Label Encoding
    cat_cols = X.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))

    # Select the top k features using mutual information
    selector = SelectKBest(mutual_info_classif, k=k)
    selector.fit(X, y)

    # Get the names of the selected features
    selected_features = list(X.columns[selector.get_support()])

    return selected_features

In [12]:
from sklearn.ensemble import RandomForestClassifier

def select_features_rf(df, target_col, n, max_depth=None):
    """
    Selects the top n most important features using a Random Forest Classifier.

    Args:
    - df (pandas DataFrame): The input DataFrame.
    - target_col (str): The name of the target column.
    - n (int): The number of top features to select.
    - max_depth (int): The maximum depth of the decision trees in the Random Forest. Defaults to None.

    Returns:
    - selected_features (list): A list of the names of the top n features.
    """

    # Separate the feature matrix X and the target variable y
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # Encode categorical variables using Label Encoding
    cat_cols = X.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
      le = LabelEncoder()
      X[col] = le.fit_transform(X[col].astype(str))
    
    # Create a RandomForestClassifier object and fit it to the data
    clf = RandomForestClassifier(n_estimators=n, max_depth=max_depth)
    clf.fit(X, y)

    
    # Get the feature importances and sort them in descending order
    feature_importances = clf.feature_importances_
    indices = feature_importances.argsort()[::-1]
    
    # Extract the names of the top features based on the indices
    selected_features = list(X.columns[indices[:n]])
    
    return selected_features

In [13]:
from xgboost import XGBClassifier

def select_features_xgb(df, target_col, n, max_depth=3, learning_rate=0.1):
    """
    Selects the top n most important features using an XGBoost Classifier.

    Args:
    - df (pandas DataFrame): The input DataFrame.
    - target_col (str): The name of the target column.
    - n (int): The number of top features to select.
    - max_depth (int): The maximum depth of the decision trees in the XGBoost Classifier. Defaults to 3.
    - learning_rate (float): The learning rate of the XGBoost Classifier. Defaults to 0.1.

    Returns:
    - selected_features (list): A list of the names of the top n features.
    """
    # Separate the feature matrix X and the target variable y
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # Encode categorical variables using Label Encoding
    cat_cols = X.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
      le = LabelEncoder()
      X[col] = le.fit_transform(X[col].astype(str))    
    
    # Create an XGBClassifier object and fit it to the data
    clf = XGBClassifier(n_estimators=n, max_depth=max_depth, learning_rate=learning_rate)
    clf.fit(X, y)
    
    # Get the feature importances and sort them in descending order
    feature_importances = clf.feature_importances_
    indices = feature_importances.argsort()[::-1]
    
    # Extract the names of the top features based on the indices
    selected_features = list(X.columns[indices[:n]])
    
    return selected_features

In [14]:
sklearn_features = select_features_mutual_info(df,'term_deposit',60)
rf_features = select_features_rf(df,'term_deposit',60)
xgb_features = select_features_xgb(df,'term_deposit',60)
top_features = list(set(rf_features) & set(sklearn_features) & set(xgb_features))
top_features.append('term_deposit')
df_final = df[top_features]

In [15]:
df_final.columns

Index(['nr.employed', 'campaign cons.conf.idx', 'pdays emp.var.rate',
       'edu_dur', 'campaign cons.price.idx', 'pdays cons.price.idx',
       'age cons.conf.idx', 'emp.var.rate euribor3m', 'euribor3m', 'month',
       'cons.price.idx nr.employed', 'previous euribor3m',
       'pdays cons.conf.idx', 'pdays euribor3m', 'pdays nr.employed',
       'campaign euribor3m', 'emp.var.rate cons.price.idx', 'age euribor3m',
       'emp.var.rate cons.conf.idx', 'age emp.var.rate', 'campaign pdays',
       'euribor3m nr.employed', 'poutcome', 'cons.price.idx euribor3m',
       'duration_minutes', 'age pdays', 'campaign nr.employed',
       'cons.conf.idx euribor3m', 'pdays', 'evr_job', 'term_deposit'],
      dtype='object')

In [16]:
df_final.head()

Unnamed: 0,nr.employed,campaign cons.conf.idx,pdays emp.var.rate,edu_dur,campaign cons.price.idx,pdays cons.price.idx,age cons.conf.idx,emp.var.rate euribor3m,euribor3m,month,...,euribor3m nr.employed,poutcome,cons.price.idx euribor3m,duration_minutes,age pdays,campaign nr.employed,cons.conf.idx euribor3m,pdays,evr_job,term_deposit
0,5191.0,-36.4,1098.9,basic.4y0-5,93.994,93900.006,-2038.4,5.3427,4.857,may,...,25212.687,nonexistent,456.528858,4.35,55944.0,5191.0,-176.7948,999,Highhousemaid,0
1,5191.0,-36.4,1098.9,high.school0-5,93.994,93900.006,-2074.8,5.3427,4.857,may,...,25212.687,nonexistent,456.528858,2.483333,56943.0,5191.0,-176.7948,999,Highservices,0
2,5191.0,-36.4,1098.9,high.school0-5,93.994,93900.006,-1346.8,5.3427,4.857,may,...,25212.687,nonexistent,456.528858,3.766667,36963.0,5191.0,-176.7948,999,Highservices,0
3,5191.0,-36.4,1098.9,basic.6y0-5,93.994,93900.006,-1456.0,5.3427,4.857,may,...,25212.687,nonexistent,456.528858,2.516667,39960.0,5191.0,-176.7948,999,Highadmin.,0
4,5191.0,-36.4,1098.9,high.school5-10,93.994,93900.006,-2038.4,5.3427,4.857,may,...,25212.687,nonexistent,456.528858,5.116667,55944.0,5191.0,-176.7948,999,Highservices,0


In [None]:
# export the DataFrame as a CSV file
df_final.to_csv('ml_final.csv', index=False)