In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Load the datasets. The input directory is named once so the notebook can be
# pointed at another copy of the data by editing a single line.
DATA_DIR = '/Users/andreapiemontese/Desktop/2Round.nosync/parquet_input_files'
CATEGORICAL_COLUMNS = ['merchant', 'gender', 'age', 'industry']

# Handle missing values (if any) by carrying the previous row's value forward.
# DataFrame.ffill() is the supported spelling of fillna(method='ffill'), whose
# `method` argument is deprecated in recent pandas releases. It returns a new
# frame rather than mutating in place, so re-running this cell is idempotent.
train_df = pd.read_parquet(f'{DATA_DIR}/t_data_train.parquet').ffill()
test_df = pd.read_parquet(f'{DATA_DIR}/t_data_test.parquet').ffill()

# Encode categorical variables.
# Both frames are cast to the category levels observed in the TRAINING data
# before one-hot encoding. Encoding each frame independently with
# drop_first=True lets the dropped reference level differ between the two
# (whenever the test set lacks the training set's first level), which would
# silently encode a real test category as all zeros after alignment.
# Levels that appear only in the test set become missing and therefore encode
# as all zeros, which is what the column alignment below produced for them too.
train_features = train_df.drop(columns=['customer'])
test_features = test_df.drop(columns=['customer'])
for column in CATEGORICAL_COLUMNS:
    train_features[column] = train_features[column].astype('category')
    test_features[column] = test_features[column].astype(train_features[column].dtype)

train_df_encoded = pd.get_dummies(train_features, columns=CATEGORICAL_COLUMNS, drop_first=True)
test_df_encoded = pd.get_dummies(test_features, columns=CATEGORICAL_COLUMNS, drop_first=True)

# Align the train and test data to have the same columns (the training columns
# win; anything missing from the test frame is filled with 0).
train_df_encoded, test_df_encoded = train_df_encoded.align(test_df_encoded, join='left', axis=1, fill_value=0)

# Split features and target
X_train, y_train = train_df_encoded.drop(columns=['fraud']), train_df_encoded['fraud']
X_test, y_test = test_df_encoded.drop(columns=['fraud']), test_df_encoded['fraud']

# The scaler and the model rely on both matrices sharing one column layout.
assert X_train.columns.equals(X_test.columns), "train/test feature columns differ"

# Split the training data into training and validation sets (currently disabled)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Scale numerical features (optional). The scaler is fitted on the training
# matrix only and then re-used, unchanged, on the test matrix.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Display the preprocessed data
X_train_scaled[:5], y_train[:5]


(array([[-1.45434326e-01, -5.99839476e-03, -5.17708947e-02,
         -1.81734247e-02, -3.01107410e-02, -2.49947845e-02,
         -1.11752151e-02, -3.63356807e-02, -1.92487369e-02,
         -1.04779126e-01, -5.65091789e-02, -6.64312818e-02,
         -4.43612389e-02, -2.51979248e-03, -2.16810998e-02,
         -3.02161811e-02, -2.20684798e-02, -1.75811106e-02,
         -1.00654428e+00, -3.51785353e-02, -2.94342160e-02,
         -2.04233246e-02, -3.92017292e-02, -8.28353315e-02,
         -9.57895839e-02, -2.07831623e-02, -8.96833935e-03,
         -8.06310320e-02, -2.40000565e-02, -1.34924719e-02,
          1.37752903e+00, -6.94495010e-02, -2.25433637e-02,
         -3.12171460e-02, -7.76661832e-02, -1.05916852e-02,
         -4.02703450e-02, -4.02175843e-02, -2.46103203e-02,
         -3.90117310e-02, -3.18559658e-02, -5.18940506e-02,
         -4.79752068e-02, -1.01934424e-01, -1.43295721e-02,
         -2.15973584e-01, -2.29162006e-02, -1.06911511e-02,
         -3.20881275e-02, -5.48841688e-0

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# Small search space: five values of C (inverse regularisation strength)
# crossed with two solvers, i.e. 10 candidate configurations.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
}

# Base estimator whose hyper-parameters are tuned below.
logreg_model = LogisticRegression(max_iter=1000, random_state=42)

# 3-fold cross-validated grid search, scored by ROC-AUC and run on all cores.
grid_search = GridSearchCV(
    logreg_model,
    param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# Winning configuration and its mean cross-validated score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best ROC-AUC Score: {best_score}")



Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'C': 10, 'solver': 'lbfgs'}
Best ROC-AUC Score: 0.9974080571024654


In [4]:
# Inspect the unscaled, one-hot-encoded training feature matrix.
X_train

Unnamed: 0,amount,merchant_M117188757,merchant_M1198415165,merchant_M1294758098,merchant_M1313686961,merchant_M1352454843,merchant_M1353266412,merchant_M1400236507,merchant_M1416436880,merchant_M151143676,...,industry_home,industry_hotelservices,industry_hyper,industry_leisure,industry_otherservices,industry_sportsandtoys,industry_tech,industry_transportation,industry_travel,industry_wellnessandbeauty
0,21.97,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,15.28,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,11.54,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,66.05,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7.56,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472487,42.74,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
472488,25.45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
472489,0.99,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
472490,8.29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# GridSearchCV was created without a `refit` argument, so its default
# (refit=True) has already re-fitted the winning configuration on the whole of
# X_train_scaled / y_train. best_estimator_ is therefore ready to use; fitting
# it a second time on the same data only repeated that work.
best_logreg_model = grid_search.best_estimator_

# Predict on the test set: hard labels, plus the probability of the second
# class (column 1 of predict_proba), which is used as the ranking score.
y_test_pred = best_logreg_model.predict(X_test_scaled)
y_test_pred_proba = best_logreg_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate the model
print(classification_report(y_test, y_test_pred))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_test_pred_proba)}")

              precision    recall  f1-score   support

       False       1.00      1.00      1.00    116692
        True       0.88      0.73      0.80      1431

    accuracy                           1.00    118123
   macro avg       0.94      0.86      0.90    118123
weighted avg       1.00      1.00      1.00    118123

ROC-AUC Score: 0.9974435140924056


In [13]:
# Put the true labels, the hard predictions and the predicted probabilities
# side by side, then persist them (index included) as a CSV file.
prediction_columns = {
    'Y_Test': y_test,
    'Y_Test_Predicted': y_test_pred,
    'Y_Test_Pred_Proba': y_test_pred_proba,
}
salva = pd.DataFrame(prediction_columns)
salva.to_csv('/Users/andreapiemontese/Desktop/2Round.nosync/model_predictions_1.csv')

In [7]:
# Disabled: reload previously saved labels/predictions from disk instead of
# recomputing them. NOTE(review): these lines read model_predictions.csv,
# whereas the save cell in this notebook writes model_predictions_1.csv;
# confirm which file is intended before re-enabling.
#y_test=pd.read_csv("/Users/andreapiemontese/Desktop/2Round.nosync/model_predictions.csv")['Y_Test']
#y_test_pred=pd.read_csv("/Users/andreapiemontese/Desktop/2Round.nosync/model_predictions.csv")['Y_Test_Predicted']
#y_test_pred_proba=pd.read_csv("/Users/andreapiemontese/Desktop/2Round.nosync/model_predictions.csv")['Y_Test_Pred_Proba']

In [8]:
# Area under the ROC curve on the test set, from the predicted probabilities.
roc_auc_score(y_true=y_test, y_score=y_test_pred_proba)

0.9974435140924056

In [9]:
import plotly.figure_factory as ff
import plotly.graph_objects as go

# Confusion matrix of the hard test-set predictions against the true labels.
conf_matrix = confusion_matrix(y_test, y_test_pred)
labels = ['Non-Fraud', 'Fraud']
z, x, y = conf_matrix, labels, labels

# Layout shared by the notebook's figures: dark template, transparent background.
confusion_layout = dict(
    title_text='Confusion Matrix',
    xaxis=dict(title='Predicted Label'),
    yaxis=dict(title='True Label'),
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    template='plotly_dark',
)

# Annotated heatmap: every cell is labelled with its count.
fig_conf_matrix = ff.create_annotated_heatmap(
    z,
    x=x,
    y=y,
    annotation_text=conf_matrix,
    colorscale='teal',
)
# Kept as the last expression so the notebook renders the returned figure.
fig_conf_matrix.update_layout(**confusion_layout)


In [11]:
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, roc_auc_score

# Requires y_test and y_test_pred_proba from the evaluation cell.
# ROC operating points and the corresponding area under the curve.
fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
roc_auc = roc_auc_score(y_test, y_test_pred_proba)

# The model's ROC curve; the AUC is shown in the trace name.
roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    line=dict(color='teal', width=2),
    name='ROC curve (area = %0.2f)' % roc_auc,
)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference, hidden from the legend.
diagonal_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    line=dict(color='navy', width=2, dash='dash'),
    showlegend=False,
)

# Assemble the figure with both traces, then apply titles, ranges and theme.
fig = go.Figure(data=[roc_trace, diagonal_trace])
fig.update_layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate', range=[0.0, 1.0]),
    yaxis=dict(title='True Positive Rate', range=[0.0, 1.05]),
    template='plotly_dark',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()


In [14]:
import plotly.graph_objects as go
import pandas as pd

# Requires best_logreg_model and X_train from earlier cells.
# One row per feature holding its fitted coefficient, sorted from the most
# positive to the most negative value (signed, not by magnitude).
coefficients = best_logreg_model.coef_[0]
feature_importance = pd.DataFrame(
    coefficients,
    index=X_train.columns,
    columns=['importance'],
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print(feature_importance)
feature_importance.to_csv("/Users/andreapiemontese/Desktop/2Round.nosync/feature_importance_1.csv")

# Horizontal bar per feature, coloured by the coefficient value.
importance_values = feature_importance['importance']
importance_bars = go.Bar(
    x=importance_values,
    y=feature_importance.index,
    orientation='h',
    marker=dict(color=importance_values, colorscale='Teal'),
)

# Assemble the figure, then apply titles and the dark, transparent theme.
fig = go.Figure(data=[importance_bars])
fig.update_layout(
    title='Feature Importance based on Logistic Regression Coefficients',
    xaxis=dict(title='Importance'),
    yaxis=dict(title='Feature'),
    template='plotly_dark',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()


                         importance
merchant_M480139044        1.474977
amount                     1.468785
merchant_M151143676        1.437978
merchant_M209847108        0.789505
industry_hyper             0.740174
...                             ...
merchant_M85975013        -0.356729
industry_food             -0.356729
industry_health           -0.520498
merchant_M1946091778      -0.590706
industry_transportation   -0.708023

[72 rows x 1 columns]


In [15]:
# Read the train and test parquet files again for the class-balance checks below.
train_df = pd.read_parquet(
    path='/Users/andreapiemontese/Desktop/2Round.nosync/parquet_input_files/t_data_train.parquet'
)
test_df = pd.read_parquet(
    path='/Users/andreapiemontese/Desktop/2Round.nosync/parquet_input_files/t_data_test.parquet'
)


In [16]:
# (number of training rows, number of those rows flagged as fraud)
len(train_df), len(train_df.loc[train_df['fraud'].eq(True)])

(472492, 5769)

In [17]:
# Share of training rows flagged as fraud.
len(train_df.loc[train_df['fraud'].eq(True)]) / len(train_df)

0.01220973053512017

In [19]:
# (number of test rows, number of those rows flagged as fraud)
len(test_df), len(test_df.loc[test_df['fraud'].eq(True)])

(118123, 1431)

In [20]:
# Share of test rows flagged as fraud.
len(test_df.loc[test_df['fraud'].eq(True)]) / len(test_df)

0.012114490827357924