In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import train_test_split, StratifiedKFold 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import pickle 

In [2]:
# Reduce Memory Usage
def reduce_memory_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in ['object', 'category']:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Mem. usage decreased to {end_mem:.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

In [3]:
# Load Data with Memory Optimization
def load_data():
    train_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
    train_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
    test_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')
    test_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')

    # Standardize column names
    test_identity.columns = test_identity.columns.str.replace('-', '_')
    test_transaction.columns = test_transaction.columns.str.replace('-', '_')
    
    train = train_transaction.merge(train_identity, how='left', on='TransactionID')
    test = test_transaction.merge(test_identity, how='left', on='TransactionID')

    # Free up memory
    del train_transaction, train_identity, test_transaction, test_identity
    gc.collect()

    # Apply memory optimization
    train = reduce_memory_usage(train)
    test = reduce_memory_usage(test)
    
    return train, test

train, test = load_data()


Mem. usage decreased to 645.97 Mb (67.0% reduction)
Mem. usage decreased to 561.50 Mb (66.5% reduction)


In [4]:
train.head()

  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [5]:
pd.options.display.max_columns = None  # Ensure all columns are shown
pd.options.display.max_rows = None  # Ensure all rows are shown

print(train.isna().sum())

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
card1                  0
card2               8933
card3               1565
card4               1577
card5               4259
card6               1571
addr1              65706
addr2              65706
dist1             352271
dist2             552913
P_emaildomain      94456
R_emaildomain     453249
C1                     0
C2                     0
C3                     0
C4                     0
C5                     0
C6                     0
C7                     0
C8                     0
C9                     0
C10                    0
C11                    0
C12                    0
C13                    0
C14                    0
D1                  1269
D2                280797
D3                262878
D4                168922
D5                309841
D6                517353
D7                551623
D8                515614
D9                515614


In [6]:
# Define Categorical Features
categorical_features = [
    'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
    'P_emaildomain', 'R_emaildomain',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
    'DeviceType', 'DeviceInfo'
]

# Add id_12 through id_38 to categorical features
id_features = [f'id_{i}' for i in range(12, 39)]
categorical_features.extend(id_features)

In [7]:
categorical_features

['ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'DeviceType',
 'DeviceInfo',
 'id_12',
 'id_13',
 'id_14',
 'id_15',
 'id_16',
 'id_17',
 'id_18',
 'id_19',
 'id_20',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_25',
 'id_26',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_32',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38']

In [8]:
train['id_12'].unique()

array([nan, 'NotFound', 'Found'], dtype=object)

In [9]:
# Select Numeric Features Only
numeric_features = [col for col in train.columns if col not in categorical_features + ['isFraud', 'TransactionID']]
X = train[numeric_features]
y = train['isFraud']
X_test = test[numeric_features]

In [10]:
# Handle Missing Values
X.fillna(-999, inplace=True)
X_test.fillna(-999, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(-999, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.fillna(-999, inplace=True)


In [11]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [13]:


model = LogisticRegression(penalty='elasticnet', solver='saga', C=1.0, max_iter=300, tol = 1e-5, l1_ratio = 0.5)


In [14]:
def train_and_evaluate_model(model, model_name):
    model.fit(X_train, y_train)
    val_pred = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, val_pred)
    print(f'{model_name} Validation AUC: {score:.4f}')
    return model, score

In [15]:
model , score = train_and_evaluate_model(model , 'logisticregression')

logisticregression Validation AUC: 0.7882




In [16]:
# Final Model Predictions
def generate_submission(model, X_test, filename):
    predictions = model.predict_proba(X_test)[:, 1]
    submission = pd.DataFrame({'TransactionID': test['TransactionID'], 'isFraud': predictions})
    submission.to_csv(filename, index=False)
    print(f"Submission saved to {filename}")

def save_model(model, filename):
    with open(filename,'wb') as f:
        pickle.dump(model,f)

In [17]:
generate_submission(model, X_test, "logistic.csv")



Submission saved to logistic.csv


In [18]:

save_model(model,'logistic.pkl')
