In [None]:
import numpy as np 
import pandas as pd 
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Parse the transaction timestamp in BOTH splits so that any date-based feature
# built on the training data can be reproduced on the test data.
# NOTE(review): assumes fraudTest.csv carries the same 'trans_date_trans_time'
# column as fraudTrain.csv — confirm against the dataset.
DATE_COLUMNS = ['trans_date_trans_time']
train_data = pd.read_csv('/kaggle/input/fraud-detection/fraudTrain.csv', parse_dates=DATE_COLUMNS)
test_data = pd.read_csv('/kaggle/input/fraud-detection/fraudTest.csv', parse_dates=DATE_COLUMNS)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from time import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Preview the first five rows of the training data.
train_data.head(n=5)

In [None]:
# Data-quality check 1: rows that are exact duplicates of an earlier row.
# An empty frame printed here means there are no duplicated rows.
duplicate_mask = train_data.duplicated()
print(train_data[duplicate_mask])

# Data-quality check 2: number of missing values per column.
# A column of zeros printed here means no column contains nulls.
null_counts = train_data.isnull().sum()
print(null_counts)

In [None]:
# Show the dtype pandas assigned to each column.
column_dtypes = train_data.dtypes
print(column_dtypes)

In [None]:
# Share of each class of the target variable, drawn as a pie chart.
data = train_data['is_fraud'].value_counts()

fig, ax = plt.subplots()
ax.pie(data, labels=data.index, autopct='%1.1f%%')
ax.set_title("Value Distribution Of The Target Variable")
plt.show()


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Restrict to the rows labelled is_fraud == 0 and summarise their numeric columns.
non_fraud_dataSet = train_data.loc[train_data['is_fraud'].eq(0)]
print('Summary statistics of non-fraudulent transactions:')
non_fraud_dataSet.describe()

In [None]:
# Restrict to the rows labelled is_fraud == 1 and summarise their numeric columns.
fraud_dataSet = train_data.loc[train_data['is_fraud'].eq(1)]
print('Summary statistics of fraudulent transactions:')
fraud_dataSet.describe()

In [None]:
# Box plot of the transaction amount, to make the outliers visible.
ax = sns.boxplot(x=train_data['amt'])
ax.set_title('amt box plot')
ax.set_xlabel('Total Amount')
plt.show()


In [None]:
# Histogram of transaction amounts for the rows labelled is_fraud == 0.
non_fraud_dataSet = train_data.loc[train_data['is_fraud'].eq(0)]
data = non_fraud_dataSet['amt']

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data, bins=100)
ax.set_title('Frequency of Transaction Amounts Across Non-Fraudulent Transactions')
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of transaction amounts for the rows labelled is_fraud == 1.
fraud_dataSet = train_data.loc[train_data['is_fraud'].eq(1)]
data = fraud_dataSet['amt']

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data, bins=100)
ax.set_title('Frequency of Transaction Amounts Across Fraudulent Transactions')
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Share of each gender value, drawn as a pie chart.
data = train_data['gender'].value_counts()

# The figure must be created BEFORE drawing: calling plt.figure() after
# plt.pie() opens a second, empty figure and leaves the pie at the default size.
plt.figure(figsize=(10, 6))
plt.pie(data, labels=data.index, autopct='%1.1f%%')
plt.title("Value Distribution Of The Gender")
plt.show()


In [None]:
fraud_dataSet = train_data[train_data['is_fraud'] == 1]

# 'category' is a categorical column, so count the rows per category and draw
# one bar each. A 100-bin histogram over category labels produces thin slivers
# that do not line up with the tick labels.
data = fraud_dataSet['category'].value_counts()
plt.figure(figsize=(10, 6))
plt.bar(data.index, data.values)
plt.title('Frequency of Fraudulent Transactions Across Category Types')
plt.xlabel('Category Types')
plt.xticks(rotation=45)
plt.ylabel('Frequency')
plt.show()

Adding Hour Features

In [None]:
# Hour of day (0-23) extracted from the transaction timestamp.
train_data['hour'] = train_data['trans_date_trans_time'].dt.hour

In [None]:
# Binary flag: 1 when the hour is before 05:00 or after 21:59, otherwise 0.
is_night_hour = (train_data['hour'] < 5) | (train_data['hour'] > 21)
train_data['hourEnc'] = is_night_hour.astype('int64')

In [None]:
# Make sure the timestamp column is a real datetime column.
train_data['trans_date_trans_time'] = pd.to_datetime(train_data['trans_date_trans_time'])

# Time-based rolling windows require chronologically ordered timestamps.
# A stable sort keeps the original order of transactions sharing a timestamp.
train_data = train_data.sort_values(by='trans_date_trans_time', kind='stable')

# A series of ones indexed by transaction time: a rolling SUM over it is the
# number of transactions that fall inside the window.
_transaction_events = pd.Series(
    1.0, index=pd.DatetimeIndex(train_data['trans_date_trans_time'])
)

# Number of transactions (across all cards) in the last 1, 7 and 30 days.
# The window must be a time offset ('7D'), not an integer: rolling(window=7)
# counts the last 7 ROWS, which gives a constant and says nothing about days.
# Each window is (t - offset, t], so it always contains the current
# transaction and never yields NaN — no fillna step is needed.
for _column, _offset in (
    ('transactions_last_1d', '1D'),
    ('transactions_last_7d', '7D'),
    ('transactions_last_30d', '30D'),
):
    train_data[_column] = _transaction_events.rolling(_offset).sum().to_numpy()

In [None]:
# Safeguard: coerce train_data's timestamp column to datetime before the
# per-card helpers below use it (it is already parsed when the CSV is loaded).
train_data['trans_date_trans_time'] = pd.to_datetime(train_data.trans_date_trans_time)

def last1DayTransactionCount(group):
    """Count the transactions in the 1 day up to and including each row.

    The window is time-based, ``(t - 1 day, t]``, where ``t`` is the row's own
    timestamp. An integer ``rolling(window=1)`` would count rows, not days.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to count over (e.g. the transactions of one ``cc_num`` group),
        with a ``trans_date_trans_time`` column. Rows may be in any order;
        the index labels must be unique.

    Returns
    -------
    pandas.Series
        Float counts with the same index, in the same order, as ``group``.
    """
    # Offset-based rolling needs chronologically ordered timestamps.
    stamps = pd.to_datetime(group['trans_date_trans_time']).sort_values(kind='stable')
    # A rolling sum over a series of ones indexed by time is an event count.
    events = pd.Series(1.0, index=pd.DatetimeIndex(stamps))
    counts = events.rolling('1D').sum().to_numpy()
    # Re-attach the original row labels and restore the caller's row order.
    return pd.Series(counts, index=stamps.index, name=stamps.name).reindex(group.index)

def last7DaysTransactionCount(group):
    """Count the transactions in the 7 days up to and including each row.

    The window is time-based, ``(t - 7 days, t]``, where ``t`` is the row's own
    timestamp. An integer ``rolling(window=7)`` would count rows, not days.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to count over (e.g. the transactions of one ``cc_num`` group),
        with a ``trans_date_trans_time`` column. Rows may be in any order;
        the index labels must be unique.

    Returns
    -------
    pandas.Series
        Float counts with the same index, in the same order, as ``group``.
    """
    # Offset-based rolling needs chronologically ordered timestamps.
    stamps = pd.to_datetime(group['trans_date_trans_time']).sort_values(kind='stable')
    # A rolling sum over a series of ones indexed by time is an event count.
    events = pd.Series(1.0, index=pd.DatetimeIndex(stamps))
    counts = events.rolling('7D').sum().to_numpy()
    # Re-attach the original row labels and restore the caller's row order.
    return pd.Series(counts, index=stamps.index, name=stamps.name).reindex(group.index)

def last30DaysTransactionCount(group):
    """Count the transactions in the 30 days up to and including each row.

    The window is time-based, ``(t - 30 days, t]``, where ``t`` is the row's
    own timestamp. An integer ``rolling(window=30)`` would count rows, not days.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to count over (e.g. the transactions of one ``cc_num`` group),
        with a ``trans_date_trans_time`` column. Rows may be in any order;
        the index labels must be unique.

    Returns
    -------
    pandas.Series
        Float counts with the same index, in the same order, as ``group``.
    """
    # Offset-based rolling needs chronologically ordered timestamps.
    stamps = pd.to_datetime(group['trans_date_trans_time']).sort_values(kind='stable')
    # A rolling sum over a series of ones indexed by time is an event count.
    events = pd.Series(1.0, index=pd.DatetimeIndex(stamps))
    counts = events.rolling('30D').sum().to_numpy()
    # Re-attach the original row labels and restore the caller's row order.
    return pd.Series(counts, index=stamps.index, name=stamps.name).reindex(group.index)

# Group by 'cc_num' and apply each helper once, keeping ALL three results.
# Assigning every result to the same name would keep only the last one.
# group_keys=False leaves the original row labels as the index of each result,
# so the columns line up with train_data without any index resetting.
transactions_by_card = train_data.groupby('cc_num', group_keys=False)

df1 = pd.DataFrame({
    'transactions_last_1d': transactions_by_card.apply(last1DayTransactionCount),
    'transactions_last_7d': transactions_by_card.apply(last7DaysTransactionCount),
    'transactions_last_30d': transactions_by_card.apply(last30DaysTransactionCount),
})

In [None]:
# Safeguard: coerce train_data's timestamp column to datetime before the
# resampling helpers below use it (it is already parsed when the CSV is loaded).
train_data['trans_date_trans_time'] = pd.to_datetime(train_data.trans_date_trans_time)

def last1DayTransactionCount(group):
    """Return the number of rows of ``group`` in each consecutive 1-day bin.

    Bins are formed on the ``trans_date_trans_time`` column; bins without any
    row are reported with a count of 0.
    """
    by_time = group.set_index('trans_date_trans_time')
    return by_time.resample('1D').size()

def last7DaysTransactionCount(group):
    """Return the number of rows of ``group`` in each consecutive 7-day bin.

    Bins are formed on the ``trans_date_trans_time`` column; bins without any
    row are reported with a count of 0.
    """
    by_time = group.set_index('trans_date_trans_time')
    return by_time.resample('7D').size()

def last30DaysTransactionCount(group):
    """Return the number of rows of ``group`` in each consecutive 30-day bin.

    Bins are formed on the ``trans_date_trans_time`` column; bins without any
    row are reported with a count of 0.
    """
    by_time = group.set_index('trans_date_trans_time')
    return by_time.resample('30D').size()

def timeSinceLastTransaction(group):
    """Seconds elapsed between each row and the chronologically previous row.

    The timestamps are sorted inside the function, so the result does not
    depend on the caller having ordered the rows beforehand.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with a ``trans_date_trans_time`` column, in any order; the index
        labels must be unique.

    Returns
    -------
    pandas.Series
        Float seconds with the same index, in the same order, as ``group``.
        The earliest row has no predecessor and is NaN.
    """
    # Stable sort keeps the original order of rows sharing a timestamp.
    stamps = pd.to_datetime(group['trans_date_trans_time']).sort_values(kind='stable')
    gaps = stamps.diff().dt.total_seconds()
    # Restore the caller's row order.
    return gaps.reindex(group.index)

# Group by 'cc_num' and apply the resampling helpers, keeping each result under
# its own name. Assigning all three to one variable would keep only the last.
# Each result holds one count per card and per time bin.
# NOTE: the three helper names used here are also defined in an earlier cell;
# the definitions from this cell are the ones in effect at this point.
transactions_by_card = train_data.groupby('cc_num')
transactions_per_1d = transactions_by_card.apply(last1DayTransactionCount)
transactions_per_7d = transactions_by_card.apply(last7DaysTransactionCount)
transactions_per_30d = transactions_by_card.apply(last30DaysTransactionCount)

# The time since the previous transaction is a per-TRANSACTION value, so it
# cannot be stored inside the per-bin counts above. It gets its own frame,
# indexed by the original row labels (group_keys=False keeps those labels).
time_diff = train_data.groupby('cc_num', group_keys=False).apply(timeSinceLastTransaction)
df1 = pd.DataFrame({'time_diff': time_diff})

**Displaying the correlation between the features**

In [None]:
# Correlations are only defined for numeric columns, so keep just those.
numeric_data = train_data.select_dtypes(include=['number'])

# Pairwise correlation between every pair of numeric columns.
correlation_matrix = numeric_data.corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Rank the numeric columns by the absolute value of their correlation with the
# target, strongest first.
numeric_columns = train_data.select_dtypes(include='number')
correlation_with_target = numeric_columns.corr()['is_fraud']
fraud_correlation = correlation_with_target.abs().sort_values(ascending=False)

print(fraud_correlation)

In [None]:
# Load train and test datasets
train_ds = pd.read_csv('/kaggle/input/fraud-detection/fraudTrain.csv')
test_ds = pd.read_csv('/kaggle/input/fraud-detection/fraudTest.csv')

# Keep only the numeric columns for simplicity
numeric_columns_train = train_ds.select_dtypes(include='number')
numeric_columns_test = test_ds.select_dtypes(include='number')

# Columns that must not be used as features: the target itself and
# 'Unnamed: 0', which is presumably the row index saved into the CSV
# (TODO confirm against the file). errors='ignore' makes the drop a no-op
# when that column is absent.
non_feature_columns = ['is_fraud', 'Unnamed: 0']

X_train = numeric_columns_train.drop(columns=non_feature_columns, errors='ignore')
y_train = train_ds['is_fraud']

# Select the test features by the training column list so both matrices hold
# exactly the same columns in the same order. A missing column raises here
# instead of silently misaligning the features.
X_test = numeric_columns_test[X_train.columns]
y_test = test_ds['is_fraud']

# Preprocess the data: fit the scaler on the training split only, then reuse it
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build and train the Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions on the test set: hard labels plus the fraud-class probability
y_pred = model.predict(X_test_scaled)
y_score = model.predict_proba(X_test_scaled)[:, 1]

# Evaluate the model. When one class is rare, accuracy can be high for a model
# that never predicts it, so ROC AUC (computed from the probabilities) is
# reported next to accuracy.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{classification_rep}")
