In [2]:
import pandas as pd

# Load the datasets
train_data = pd.read_csv (r'C:\CodSoft_Projects\archive\fraudTrain.csv')
test_data = pd.read_csv(r'C:\CodSoft_Projects\archive\fraudTest.csv')

# Display the head of the training data
print("Training Data:")
print(train_data.head())

# Display the head of the test data
print("\nTest Data:")
print(test_data.head())


Training Data:
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Sui

In [3]:
# Basic information about the dataset
print("\nTraining Data Info:")
print(train_data.info())

# Check for duplicate entries
print("\nNumber of Duplicate Entries in Training Data:", train_data.duplicated().sum())

# Descriptive statistics for numerical features
print("\nTraining Data Description:")
print(train_data.describe())

# Descriptive statistics for categorical features
print("\nCategorical Data Description:")
print(train_data.describe(include=['O']))



Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 1

In [5]:
# Dropping the 'Unnamed: 0' column since it is just index
train_data.drop('Unnamed: 0', axis=1, inplace=True)
test_data.drop('Unnamed: 0', axis=1, inplace=True)


KeyError: "['Unnamed: 0'] not found in axis"

In [7]:
# Checking for missing values
print("\nMissing Values in Training Data:")
print(train_data.isnull().sum())





Missing Values in Training Data:
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [9]:
# Example: Handling outliers in the 'amt' column
# This is a simplistic approach, and you should tailor it to your specific needs
Q1 = train_data['amt'].quantile(0.25)
Q3 = train_data['amt'].quantile(0.75)
IQR = Q3 - Q1
lower_cap = Q1 - 1.5 * IQR
upper_cap = Q3 + 1.5 * IQR

train_data['amt'] = train_data['amt'].clip(lower_cap, upper_cap)

In [10]:
# Final check
print("\nFinal Training Data:")
print(train_data.head())

# Saving the cleaned data
train_data.to_csv('cleaned_fraud_train.csv', index=False)
test_data.to_csv('cleaned_fraud_test.csv', index=False)


Final Training Data:
  trans_date_trans_time            cc_num                            merchant  \
0   2019-01-01 00:00:18  2703186189652095          fraud_Rippin, Kub and Mann   
1   2019-01-01 00:00:44      630423337322     fraud_Heller, Gutmann and Zieme   
2   2019-01-01 00:00:51    38859492057661                fraud_Lind-Buckridge   
3   2019-01-01 00:01:16  3534093764340240  fraud_Kutch, Hermiston and Farrell   
4   2019-01-01 00:03:06   375534208663984                 fraud_Keeling-Crist   

        category      amt      first     last gender  \
0       misc_net    4.970   Jennifer    Banks      F   
1    grocery_pos  107.230  Stephanie     Gill      F   
2  entertainment  193.375     Edward  Sanchez      M   
3  gas_transport   45.000     Jeremy    White      M   
4       misc_pos   41.960      Tyler   Garcia      M   

                         street            city  ...      lat      long  \
0                561 Perry Cove  Moravian Falls  ...  36.0788  -81.1781   
1  4

In [11]:
import pandas as pd

# Loading the cleaned data
train_data = pd.read_csv('cleaned_fraud_train.csv')
test_data = pd.read_csv('cleaned_fraud_test.csv')

# Extracting day of the week, hour from 'trans_date_trans_time'
train_data['trans_date'] = pd.to_datetime(train_data['trans_date_trans_time'])
test_data['trans_date'] = pd.to_datetime(test_data['trans_date_trans_time'])

train_data['day_of_week'] = train_data['trans_date'].dt.dayofweek
test_data['day_of_week'] = test_data['trans_date'].dt.dayofweek

train_data['hour'] = train_data['trans_date'].dt.hour
test_data['hour'] = test_data['trans_date'].dt.hour

# Dropping the original 'trans_date_trans_time' column
train_data.drop('trans_date_trans_time', axis=1, inplace=True)
test_data.drop('trans_date_trans_time', axis=1, inplace=True)


In [13]:
# Selecting features
features = ['amt', 'category', 'gender', 'city_pop', 'merchant', 'lat', 'long', 'job', 'day_of_week', 'hour', 'is_fraud']
train_data = train_data[features]
test_data = test_data[features]

# Separating the target variable
y_train = train_data['is_fraud']
X_train = train_data.drop('is_fraud', axis=1)

y_test = test_data['is_fraud']
X_test = test_data.drop('is_fraud', axis=1)


In [14]:
# Data Transformation: Normalization, standardization etc. especially because it is a distance based algorithm.

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identifying numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'bool']).columns

# Creating transformers for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundling transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Applying the transformations
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [16]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn (from imblearn)
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/a3/9e/fbe60a768502af54563dcb59ca7856f5a8833b3ad5ada658922e1ab09b7f/imbalanced_learn-0.11.0-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
   ---------------------------------------- 0.0/235.6 kB ? eta -:--:--
   ----- ---------------------------------- 30.7/235.6 kB 1.3 MB/s eta 0:00:01
   ------------------- -------------------- 112.6/235.6 kB 1.3 MB/s eta 0:00:01
   --------------------------------- ------ 194.6/235.6 kB 1.5 MB/s eta 0:00:01
   ---------------------------------------- 235.6/235.6 kB 1.4 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.11.0 imblearn-0.0
Note: you may need


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
from imblearn.over_sampling import SMOTE

# Checking class distribution
print("Before SMOTE, counts of label '1': {}".format(sum(y_train == 1)))
print("Before SMOTE, counts of label '0': {} \n".format(sum(y_train == 0)))

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Checking class distribution after applying SMOTE
print("After SMOTE, counts of label '1': {}".format(sum(y_train == 1)))
print("After SMOTE, counts of label '0': {}".format(sum(y_train == 0)))

Before SMOTE, counts of label '1': 7506
Before SMOTE, counts of label '0': 1289169 

After SMOTE, counts of label '1': 1289169
After SMOTE, counts of label '0': 1289169


In [21]:
"""
This output indicates a significant class imbalance in the dataset: 
originally, there were 7,506 instances of the minority class (label '1', indicating fraud) 
compared to 1,289,169 instances of the majority class (label '0', indicating non-fraudulent transactions). 
After applying SMOTE (Synthetic Minority Over-sampling Technique), 
both classes are balanced with 1,289,169 instances each.

This balancing is crucial for many machine learning models, 
as they can be biased towards the majority class in imbalanced datasets, 
leading to poor classification performance on the minority class. 
By using SMOTE, we synthetically generated new samples in the minority class, 
which helps in improving the classifier's ability to detect fraudulent transactions.

"""

"\nThis output indicates a significant class imbalance in the dataset: \noriginally, there were 7,506 instances of the minority class (label '1', indicating fraud) \ncompared to 1,289,169 instances of the majority class (label '0', indicating non-fraudulent transactions). \nAfter applying SMOTE (Synthetic Minority Over-sampling Technique), \nboth classes are balanced with 1,289,169 instances each.\n\nThis balancing is crucial for many machine learning models, \nas they can be biased towards the majority class in imbalanced datasets, \nleading to poor classification performance on the minority class. \nBy using SMOTE, we synthetically generated new samples in the minority class, \nwhich helps in improving the classifier's ability to detect fraudulent transactions.\n\n"

In [22]:
from sklearn.model_selection import train_test_split


# Splitting the dataset into the Training set and Validation set
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print(f"Training set size: {X_train_final.shape[0]} samples")
print(f"Validation set size: {X_val.shape[0]} samples")


Training set size: 2062670 samples
Validation set size: 515668 samples


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Building the Logistic Regression model
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_final, y_train_final)

# Predicting the Validation set results
y_pred_val_logreg = logreg.predict(X_val)

# Evaluating the Logistic Regression model
print("Logistic Regression - Validation Set:")
print(classification_report(y_val, y_pred_val_logreg))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val_logreg))

Logistic Regression - Validation Set:
              precision    recall  f1-score   support

           0       0.81      0.81      0.81    257186
           1       0.82      0.82      0.82    258482

    accuracy                           0.82    515668
   macro avg       0.82      0.82      0.82    515668
weighted avg       0.82      0.82      0.82    515668

Confusion Matrix:
 [[209515  47671]
 [ 47703 210779]]


In [24]:
from sklearn.tree import DecisionTreeClassifier

# Building the Decision Tree model
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train_final, y_train_final)

# Predicting the Validation set results
y_pred_val_dtree = dtree.predict(X_val)

# Evaluating the Decision Tree model
print("Decision Tree - Validation Set:")
print(classification_report(y_val, y_pred_val_dtree))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val_dtree))

Decision Tree - Validation Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257186
           1       1.00      1.00      1.00    258482

    accuracy                           1.00    515668
   macro avg       1.00      1.00      1.00    515668
weighted avg       1.00      1.00      1.00    515668

Confusion Matrix:
 [[256046   1140]
 [   588 257894]]


In [26]:
# Choosing decision tree as the final model since it has a near perfect score on the validation set.

dtree.fit(X_train, y_train)  # Re-train on the full training set


In [28]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Your existing code for prediction and evaluation
y_pred_test = dtree.predict(X_test)

print("Decision Tree - Test Set Evaluation:")
print(classification_report(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

# Compute ROC-AUC
roc_auc = roc_auc_score(y_test, dtree.predict_proba(X_test)[:, 1])
print("ROC-AUC Score:", roc_auc)

Decision Tree - Test Set Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.25      0.34      0.29      2145

    accuracy                           0.99    555719
   macro avg       0.63      0.67      0.64    555719
weighted avg       0.99      0.99      0.99    555719

Confusion Matrix:
 [[551438   2136]
 [  1418    727]]
ROC-AUC Score: 0.6674572870711057


In [29]:
"""
The model is almost perfect in identifying non-fraudulent transactions (Class 0) 
with precision and recall both close to 1.00.

For fraudulent transactions (Class 1), the precision is only 0.25, and recall is 0.34. 
This means the model correctly identifies only 34% of fraudulent transactions, 
and when it predicts a transaction as fraudulent, it is correct only 25% of the time.

The model has a high overall accuracy of 0.99, 
but this metric is less informative due to the imbalanced nature of the dataset 
(where non-fraudulent transactions vastly outnumber fraudulent ones).

The model has a high number of false negatives (fraudulent transactions missed) 
and a moderate number of false positives 
(non-fraudulent transactions incorrectly labeled as fraudulent).

The ROC-AUC score of 0.667 indicates moderate ability of the model 
to distinguish between fraudulent and non-fraudulent transactions.

"""

'\nThe model is almost perfect in identifying non-fraudulent transactions (Class 0) \nwith precision and recall both close to 1.00.\n\nFor fraudulent transactions (Class 1), the precision is only 0.25, and recall is 0.34. \nThis means the model correctly identifies only 34% of fraudulent transactions, \nand when it predicts a transaction as fraudulent, it is correct only 25% of the time.\n\nThe model has a high overall accuracy of 0.99, \nbut this metric is less informative due to the imbalanced nature of the dataset \n(where non-fraudulent transactions vastly outnumber fraudulent ones).\n\nThe model has a high number of false negatives (fraudulent transactions missed) \nand a moderate number of false positives \n(non-fraudulent transactions incorrectly labeled as fraudulent).\n\nThe ROC-AUC score of 0.667 indicates moderate ability of the model \nto distinguish between fraudulent and non-fraudulent transactions.\n\n'

In [30]:
# Since, uncertainty about the types of objects needing serialization so.pkl is used to handle the wide variety.

import joblib

# Since 'dtree' is our trained Decision Tree model
joblib.dump(dtree, 'decision_tree_model.pkl')

['decision_tree_model.pkl']

In [33]:
def get_user_input():
    # Asking user input for each feature
    amt = float(input("Enter transaction amount: "))
    category = input("Enter transaction category (e.g., 'misc_net', 'grocery_pos', etc.): ")
    gender = input("Enter gender (M/F): ")
    city_pop = int(input("Enter city population: "))
    merchant = input("Enter merchant name: ")
    lat = float(input("Enter latitude of the transaction location: "))
    long = float(input("Enter longitude of the transaction location: "))
    job = input("Enter job title: ")
    dob = input("Enter date of birth (YYYY-MM-DD): ")
    merch_lat = float(input("Enter merchant's latitude: "))
    merch_long = float(input("Enter merchant's longitude: "))

    # Handling date and time input
    correct_format = False
    while not correct_format:
        trans_date_trans_time = input("Enter transaction date and time (YYYY-MM-DD HH:MM:SS): ")
        try:
            # Try to parse the date and time
            parsed_date = pd.to_datetime(trans_date_trans_time)
            correct_format = True
        except ValueError:
            # If parsing fails, inform the user and prompt again
            print("Date and time format is incorrect. Please use YYYY-MM-DD HH:MM:SS format.")

    day_of_week = parsed_date.day_name()
    hour = parsed_date.hour

    user_input = {
        'amt': amt,
        'category': category,
        'gender': gender,
        'city_pop': city_pop,
        'merchant': merchant,
        'lat': lat,
        'long': long,
        'job': job,
        'dob': dob,
        'merch_lat': merch_lat,
        'merch_long': merch_long,
        'day_of_week': day_of_week,
        'hour': hour
    }

    return user_input

def load_model_and_predict(input_data):
    # Load the saved model
    model = joblib.load('decision_tree_model.pkl')

    # Convert input_data to the format the model expects (e.g., DataFrame)
    input_df = pd.DataFrame([input_data])

    # Make a prediction
    prediction = model.predict(input_df)
    return prediction[0]  # Returning the first (and only) prediction


In [34]:
# Get user input
user_input = get_user_input()

# Make a prediction
prediction = load_model_and_predict(user_input)
if prediction == 1:
    print("The transaction is predicted to be fraudulent.")
else:
    print("The transaction is predicted to be legitimate.")



ValueError: could not convert string to float: 'grocery_pos'