# In [1]: (notebook cell marker, kept as a comment so the file is valid Python)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import numpy as np
from scipy.sparse import csc_matrix
import joblib
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def _to_feature_frame(matrix, feature_names, indicator_cols, index):
    """Wrap a dense feature matrix in a DataFrame and cast indicator columns to int."""
    frame = pd.DataFrame(matrix, columns=feature_names, index=index)
    # The stacked matrix is float; one-hot indicators are stored as integer 0/1.
    return frame.astype({col: int for col in indicator_cols})


def preprocess_data(dfTrain, dfTest):
    """Turn the raw train/test frames into model-ready features and targets.

    The features are the numeric columns ``cc_num``, ``amt``, ``city_pop`` and
    ``unix_time`` (standardised) followed by a one-hot encoding of
    ``category`` (integer 0/1 columns named ``category_<value>``).

    All preprocessing statistics (scaler mean/variance and the set of known
    categories) are learned from ``dfTrain`` only and then applied unchanged
    to ``dfTest``. Test rows whose ``category`` was not seen during fitting
    get all-zero indicator columns (``handle_unknown='ignore'``).

    Args:
        dfTrain: Training DataFrame containing at least ``category``,
            ``cc_num``, ``amt``, ``city_pop``, ``unix_time`` and ``is_fraud``.
        dfTest: Test DataFrame with the same required columns.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The ``X`` frames are
        dense DataFrames sharing the same columns and carrying the row index
        of their input frame, so they stay aligned with the ``y`` Series.

    Raises:
        KeyError: If either input frame lacks one of the required columns.
    """
    categorical_cols = ['category']
    numerical_cols = ['cc_num', 'amt', 'city_pop', 'unix_time']
    target_col = 'is_fraud'
    feature_cols = categorical_cols + numerical_cols

    # Fail early with a message that names the offending frame and columns.
    for label, frame in (('dfTrain', dfTrain), ('dfTest', dfTest)):
        missing = [
            col for col in feature_cols + [target_col]
            if col not in frame.columns
        ]
        if missing:
            raise KeyError(f"{label} is missing required columns: {missing}")

    # Dense output is requested explicitly: relying on the transformer's
    # density heuristic can yield either a sparse matrix or an ndarray
    # depending on how many categories exist.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            (
                'cat',
                OneHotEncoder(handle_unknown='ignore', sparse_output=False),
                categorical_cols,
            ),
        ]
    )

    # Fit on the training split only; the test split is transformed with the
    # training statistics so both splits live in the same feature space and
    # nothing about the test distribution leaks into preprocessing.
    train_matrix = preprocessor.fit_transform(dfTrain[feature_cols])
    test_matrix = preprocessor.transform(dfTest[feature_cols])

    encoded_feature_names = list(
        preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
    )
    all_feature_names = numerical_cols + encoded_feature_names

    X_train = _to_feature_frame(
        train_matrix, all_feature_names, encoded_feature_names, dfTrain.index
    )
    X_test = _to_feature_frame(
        test_matrix, all_feature_names, encoded_feature_names, dfTest.index
    )

    # Copies keep the returned targets independent of the caller's frames.
    y_train = dfTrain[target_col].copy()
    y_test = dfTest[target_col].copy()
    return X_train, X_test, y_train, y_test
