In [86]:
# Ignore warnings
import warnings
warnings.simplefilter("ignore")

In [87]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression


---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [88]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
orig_df = pd.read_csv(
    Path(r"C:\Users\ksarn\OneDrive\Desktop\Project 4\Fraudulent_E-Commerce_Transaction_Data.csv")
)

# Review the DataFrame
print(orig_df.nunique())
orig_df.head()

Transaction ID        1472952
Customer ID           1472952
Transaction Amount     108998
Transaction Date      1346684
Payment Method              4
Product Category            5
Quantity                    5
Customer Age               97
Customer Location       99135
Device Used                 3
IP Address            1472651
Shipping Address      1472948
Billing Address       1472949
Is Fraudulent               2
Account Age Days          365
Transaction Hour           24
dtype: int64


Unnamed: 0,Transaction ID,Customer ID,Transaction Amount,Transaction Date,Payment Method,Product Category,Quantity,Customer Age,Customer Location,Device Used,IP Address,Shipping Address,Billing Address,Is Fraudulent,Account Age Days,Transaction Hour
0,15d2e414-8735-46fc-9e02-80b472b2580f,d1b87f62-51b2-493b-ad6a-77e0fe13e785,58.09,2024-02-20 05:58:41,bank transfer,electronics,1,17,Amandaborough,tablet,212.195.49.198,Unit 8934 Box 0058\nDPO AA 05437,Unit 8934 Box 0058\nDPO AA 05437,0,30,5
1,0bfee1a0-6d5e-40da-a446-d04e73b1b177,37de64d5-e901-4a56-9ea0-af0c24c069cf,389.96,2024-02-25 08:09:45,debit card,electronics,2,40,East Timothy,desktop,208.106.249.121,"634 May Keys\nPort Cherylview, NV 75063","634 May Keys\nPort Cherylview, NV 75063",0,72,8
2,e588eef4-b754-468e-9d90-d0e0abfc1af0,1bac88d6-4b22-409a-a06b-425119c57225,134.19,2024-03-18 03:42:55,PayPal,home & garden,2,22,Davismouth,tablet,76.63.88.212,"16282 Dana Falls Suite 790\nRothhaven, IL 15564","16282 Dana Falls Suite 790\nRothhaven, IL 15564",0,63,3
3,4de46e52-60c3-49d9-be39-636681009789,2357c76e-9253-4ceb-b44e-ef4b71cb7d4d,226.17,2024-03-16 20:41:31,bank transfer,clothing,5,31,Lynnberg,desktop,207.208.171.73,"828 Strong Loaf Apt. 646\nNew Joshua, UT 84798","828 Strong Loaf Apt. 646\nNew Joshua, UT 84798",0,124,20
4,074a76de-fe2d-443e-a00c-f044cdb68e21,45071bc5-9588-43ea-8093-023caec8ea1c,121.53,2024-01-15 05:08:17,bank transfer,clothing,2,51,South Nicole,tablet,190.172.14.169,"29799 Jason Hills Apt. 439\nWest Richardtown, ...","29799 Jason Hills Apt. 439\nWest Richardtown, ...",0,158,5


In [115]:
# # Set up working dataframe
# working_df = pd.DataFrame(orig_df)

# # Drop columns containing non-unique information
# working_df = working_df.drop(columns={'Transaction ID', 'Customer ID', 'IP Address'})

# working_df = working_df['Transaction Date'] = pd.to_datetime(working_df['Transaction Date'])
# working_df['year'] = working_df['trxn_date'].dt.year
# working_df['month'] = working_df['trxn_date'].dt.month
# working_df['day'] = working_df['trxn_date'].dt.day
# # Rename remaining columns for eas of use                           
# # <<TBekah note to team--------------This step was for my own ease only. Column Names can absolutely be changed, as long as the below are updated to match.>>
# working_df = working_df.rename(columns={'Transaction Amount': 'trxn_amt', 
#                                         'Transaction Date': 'trxn_date', 
#                                         'Payment Method': 'pmt_method', 
#                                         'Product Category': 'product_cat', 
#                                         'Quantity': 'qty',
#                                         'Customer Age': 'cust_age',
#                                         'Customer Location': 'cust_loc',
#                                         'Device Used': 'cust_device',
#                                         'Shipping Address': 'ship_addr',
#                                         'Billing Address': 'bill_addr',
#                                         'Is Fraudulent': 'fraudulent',
#                                         'Account Age Days': 'acct_age_days',
#                                         'Transaction Hour': 'trxn_hour'
#                                         })


# working_df.head()



# Set up working dataframe
working_df = pd.DataFrame(orig_df)

# Drop columns containing non-unique information
working_df = working_df.drop(columns=['Transaction ID', 'Customer ID', 'IP Address'])

# Convert to datetime
working_df['Transaction Date'] = pd.to_datetime(working_df['Transaction Date'])


working_df['year'] = working_df['Transaction Date'].dt.year
working_df['month'] = working_df['Transaction Date'].dt.month
working_df['day'] = working_df['Transaction Date'].dt.day




# Rename 
working_df = working_df.rename(columns={
    'Transaction Amount': 'trxn_amt', 
    'Transaction Date': 'trxn_date', 
    'Payment Method': 'pmt_method', 
    'Product Category': 'product_cat', 
    'Quantity': 'qty',
    'Customer Age': 'cust_age',
    'Customer Location': 'cust_loc',
    'Device Used': 'cust_device',
    'Shipping Address': 'ship_addr',
    'Billing Address': 'bill_addr',
    'Is Fraudulent': 'fraudulent',
    'Account Age Days': 'acct_age_days',
    'Transaction Hour': 'trxn_hour'
})
working_df = working_df.drop(columns=['trxn_date', 'cust_loc', 'ship_addr', 'bill_addr'])
# Display the first few rows



working_df.head()


Unnamed: 0,trxn_amt,pmt_method,product_cat,qty,cust_age,cust_device,fraudulent,acct_age_days,trxn_hour,year,month,day
0,58.09,bank transfer,electronics,1,17,tablet,0,30,5,2024,2,20
1,389.96,debit card,electronics,2,40,desktop,0,72,8,2024,2,25
2,134.19,PayPal,home & garden,2,22,tablet,0,63,3,2024,3,18
3,226.17,bank transfer,clothing,5,31,desktop,0,124,20,2024,3,16
4,121.53,bank transfer,clothing,2,51,tablet,0,158,5,2024,1,15


In [117]:
from sklearn.preprocessing import LabelEncoder

encoded_df = pd.get_dummies(working_df, columns=['pmt_method', 'product_cat', 'cust_device'], drop_first=True)
label_encoder = LabelEncoder()
working_df['pmt_method'] = label_encoder.fit_transform(working_df['pmt_method'])
working_df['product_cat'] = label_encoder.fit_transform(working_df['product_cat'])
working_df['cust_device'] = label_encoder.fit_transform(working_df['cust_device'])
encoded_df

Unnamed: 0,trxn_amt,qty,cust_age,fraudulent,acct_age_days,trxn_hour,year,month,day,pmt_method_1,pmt_method_2,pmt_method_3,product_cat_1,product_cat_2,product_cat_3,product_cat_4,cust_device_1,cust_device_2
0,58.09,1,17,0,30,5,2024,2,20,True,False,False,True,False,False,False,False,True
1,389.96,2,40,0,72,8,2024,2,25,False,False,True,True,False,False,False,False,False
2,134.19,2,22,0,63,3,2024,3,18,False,False,False,False,False,True,False,False,True
3,226.17,5,31,0,124,20,2024,3,16,True,False,False,False,False,False,False,False,False
4,121.53,2,51,0,158,5,2024,1,15,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472947,208.48,2,29,0,149,23,2024,1,12,False,False,False,False,True,False,False,False,False
1472948,231.57,2,32,0,132,4,2024,3,27,False,True,False,False,False,False,False,False,True
1472949,101.80,4,36,0,98,23,2024,1,31,True,False,False,True,False,False,False,True,False
1472950,61.80,5,34,0,191,16,2024,1,12,True,False,False,True,False,False,False,False,False


In [118]:
# # Set up variables corresponding to columns that aren't float type
# pmt_method = working_df['pmt_method']
# product_cat = working_df['product_cat']
# cust_loc = working_df['cust_loc']
# cust_device = working_df['cust_device']

# # Set up dataframes containing unique values for each new table
# pmt_method_df = pd.DataFrame(pmt_method.unique())
# product_cat_df = pd.DataFrame(product_cat.unique())
# cust_loc_df = pd.DataFrame(cust_loc.unique())
# cust_device_df = pd.DataFrame(cust_device.unique())

# # <<Bekah note to team ----------Do something with addresses? Index by state? Country?>>

# # Create a list of new tables
#new_tables = [pmt_method_df, product_cat_df, cust_loc_df, cust_device_df]
#working_df = pd.get_dummies(working_df, columns=['pmt_method', 'product_cat', 'cust_device'], drop_first=True)
print(working_df.dtypes)

trxn_amt         float64
pmt_method         int64
product_cat        int64
qty                int64
cust_age           int64
cust_device        int64
fraudulent         int64
acct_age_days      int64
trxn_hour          int64
year               int32
month              int32
day                int32
dtype: object


In [119]:
# Loop through the new tables to create index column and update column headers
for tbl in new_tables:

    # Create an index column for the data, adding one for easier human understanding
    tbl['index'] = tbl.index + 1

    # Update '0' column header
    tbl.rename(columns={0: 'unique', '0': 'unique'}, inplace=True)

In [120]:
# Create a dictionary of the existing column with its new indeces
new_value_dict = {'pmt_methods': [pmt_method, pmt_method_df], 
                  'product_cats': [product_cat, product_cat_df], 
                  'cust_loc': [cust_loc, cust_loc_df], 
                  'cust_device': [cust_device, cust_device_df]
                  }

new_value_dict['pmt_methods'][1]

Unnamed: 0,unique,index
0,1,1
1,3,2
2,0,3
3,2,4


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [56]:
# <<Bekah note to team -------------------- All code below is a copy/paste from a different project - it will probably require small tweaks once the data above is ready.>>

In [95]:
# Separate the data into labels and features

# # Separate the y variable, the labels
# y = working_df['fraudulent']

# # Separate the X variable, the features
# X = working_df.drop(columns={'fraudulent', 'ship_addr', 'bill_addr'})
X = working_df.drop(columns=['fraudulent'])
y = working_df['fraudulent']

In [96]:
# Review the y variable Series
y.value_counts()

fraudulent
0    1399114
1      73838
Name: count, dtype: int64

In [97]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,trxn_amt,qty,cust_age,acct_age_days,trxn_hour,year,month,day,pmt_method_1,pmt_method_2,pmt_method_3,product_cat_1,product_cat_2,product_cat_3,product_cat_4,cust_device_1,cust_device_2
0,58.09,1,17,30,5,2024,2,20,True,False,False,True,False,False,False,False,True
1,389.96,2,40,72,8,2024,2,25,False,False,True,True,False,False,False,False,False
2,134.19,2,22,63,3,2024,3,18,False,False,False,False,False,True,False,False,True
3,226.17,5,31,124,20,2024,3,16,True,False,False,False,False,False,False,False,False
4,121.53,2,51,158,5,2024,1,15,True,False,False,False,False,False,False,False,True


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [98]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [122]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1, max_iter=1000)

# Fit the model using training data
classifier.fit(X_train, y_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [101]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
predictions


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [102]:
# Generate a confusion matrix for the model
confusion_matrix(predictions,y_test)

array([[349336,  16506],
       [   295,   2101]], dtype=int64)

In [103]:
# Print the classification report for the model
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98    365842
           1       0.11      0.88      0.20      2396

    accuracy                           0.95    368238
   macro avg       0.56      0.92      0.59    368238
weighted avg       0.99      0.95      0.97    368238



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** This model is currently skewed in favor of the client. Healthy loans were all found and recorded as healthy, and 99% of the returned positives should have been positive. The f1-score was 100%, meaning this metric is solid. When looking at high-risk loans, however, the f1 score was only 89%. The success rate of the model might be helped by adding a random forest model, but also might be skewed by the large disparity between healty vs high-risk support.

In [104]:
#Dependencies
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier



In [105]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [106]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=50, random_state=1)

# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())


In [108]:
# Make predictions using the testing data
predictions_rf = rf_model.predict(X_test)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions_rf)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Print the classification report for the model
print(classification_report(predictions_rf, y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.05      0.10    368238

    accuracy                           0.05    368238
   macro avg       0.50      0.03      0.05    368238
weighted avg       1.00      0.05      0.10    368238



---