In [4]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load the Dataset

In [None]:
# Load the data from the CSV file into a pandas DataFrame.
df = pd.read_csv('credit card.csv')
print("✅ Dataset loaded successfully!")

# 2. Initial Data Inspection

In [6]:
# Get a first look at the data's structure and content.
print("\n--- Initial Data Inspection ---")
# Display the first 5 rows of the dataset
print("\n📋 First 5 Rows (Head):")
df.head()


--- Initial Data Inspection ---

📋 First 5 Rows (Head):


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [7]:
# Get a concise summary of the DataFrame, including data types and non-null values.
print("\nℹ️ Data Info:")
df.info()


ℹ️ Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            3142 non-null   int64  
 1   type            3142 non-null   object 
 2   amount          3142 non-null   float64
 3   nameOrig        3142 non-null   object 
 4   oldbalanceOrg   3142 non-null   float64
 5   newbalanceOrig  3142 non-null   float64
 6   nameDest        3142 non-null   object 
 7   oldbalanceDest  3142 non-null   float64
 8   newbalanceDest  3142 non-null   float64
 9   isFraud         3142 non-null   int64  
 10  isFlaggedFraud  3142 non-null   int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 270.1+ KB


In [9]:
# Generate descriptive statistics for numerical columns.
print("\n📊 Descriptive Statistics:")
df.describe()


📊 Descriptive Statistics:


Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0
mean,18.181731,504175.3,938007.8,516634.2,603450.2,1038512.0,0.367919,0.0
std,27.917929,1345891.0,2066634.0,1657465.0,1973508.0,2815902.0,0.482316,0.0
min,1.0,8.73,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,6527.698,4255.75,0.0,0.0,0.0,0.0,0.0
50%,1.0,61169.01,50752.0,0.0,0.0,0.0,0.0,0.0
75%,33.0,324074.7,567932.3,26219.73,269784.7,638021.6,1.0,0.0
max,95.0,10000000.0,19900000.0,10200000.0,33000000.0,34600000.0,1.0,0.0


In [10]:
# Check for any missing values in the dataset.
print("\n❓ Missing Values:")
print(df.isnull().sum())


❓ Missing Values:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


# 3. Analyzing Fraudulent Transactions

In [11]:
# Understand the distribution of the target variable 'isFraud'.
print("\n--- Fraud Analysis ---")
# Count the occurrences of fraudulent vs. non-fraudulent transactions.
fraud_counts = df['isFraud'].value_counts()
print("\nFraudulent vs. Non-Fraudulent Counts:")
print(fraud_counts)


--- Fraud Analysis ---

Fraudulent vs. Non-Fraudulent Counts:
isFraud
0    1986
1    1156
Name: count, dtype: int64


In [12]:
# Visualize the distribution of fraudulent transactions.
plt.figure(figsize=(7, 6))
sns.countplot(x='isFraud', data=df, palette=['#3498db', '#e74c3c'])
plt.title('Distribution of Fraudulent vs. Non-Fraudulent Transactions', fontsize=16)
plt.xlabel('Is Fraud? (0: No, 1: Yes)', fontsize=12)
plt.ylabel('Number of Transactions', fontsize=12)
plt.xticks([0, 1], ['Non-Fraudulent', 'Fraudulent'])
plt.grid(axis='y', linestyle='--', alpha=0.7)
# plt.show() # Uncomment to display the plot
plt.savefig('fraud_distribution.png')
plt.clf()
print("\n📈 Saved plot: fraud_distribution.png")


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='isFraud', data=df, palette=['#3498db', '#e74c3c'])



📈 Saved plot: fraud_distribution.png


<Figure size 700x600 with 0 Axes>

# 4. Analyzing Transaction Types

In [13]:
# Explore the different types of transactions and their relation to fraud.
print("\n--- Transaction Type Analysis ---")
# Count the number of transactions for each type.
type_counts = df['type'].value_counts()
print("\nTransaction Type Counts:")
print(type_counts)


--- Transaction Type Analysis ---

Transaction Type Counts:
type
PAYMENT     998
CASH_OUT    926
TRANSFER    791
CASH_IN     311
DEBIT       116
Name: count, dtype: int64


In [14]:
# Visualize the distribution of transaction types.
plt.figure(figsize=(10, 6))
sns.countplot(x='type', data=df, order=type_counts.index, palette='viridis')
plt.title('Distribution of Transaction Types', fontsize=16)
plt.xlabel('Transaction Type', fontsize=12)
plt.ylabel('Number of Transactions', fontsize=12)
# plt.show() # Uncomment to display the plot
plt.savefig('transaction_types.png')
plt.clf()
print("📈 Saved plot: transaction_types.png")

📈 Saved plot: transaction_types.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='type', data=df, order=type_counts.index, palette='viridis')


<Figure size 1000x600 with 0 Axes>

In [15]:
# Visualize fraudulent transactions broken down by type.
plt.figure(figsize=(10, 6))
sns.countplot(x='type', hue='isFraud', data=df, order=type_counts.index, palette=['#3498db', '#e74c3c'])
plt.title('Fraudulent Transactions by Type', fontsize=16)
plt.xlabel('Transaction Type', fontsize=12)
plt.ylabel('Number of Transactions', fontsize=12)
plt.legend(title='Is Fraud?', labels=['Non-Fraudulent', 'Fraudulent'])
# plt.show() # Uncomment to display the plot
plt.savefig('fraud_by_type.png')
plt.clf()
print("📈 Saved plot: fraud_by_type.png")

📈 Saved plot: fraud_by_type.png


<Figure size 1000x600 with 0 Axes>

# 5. Correlation Analysis

In [17]:
# Investigate the relationships between numerical features.

print("\n--- Correlation Analysis ---")
# Calculate the correlation matrix for numerical columns.
correlation_matrix = df.corr(numeric_only=True)

# Plot a heatmap of the correlation matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix of Numerical Features', fontsize=16)
# plt.show() # Uncomment to display the plot
plt.savefig('correlation_matrix.png')
plt.clf()
print("📈 Saved plot: correlation_matrix.png")

# Display the correlation values with the 'isFraud' column, sorted.
print("\nCorrelation with 'isFraud':")
print(correlation_matrix['isFraud'].sort_values(ascending=False))


--- Correlation Analysis ---
📈 Saved plot: correlation_matrix.png

Correlation with 'isFraud':
isFraud           1.000000
step              0.806796
amount            0.383737
oldbalanceOrg     0.099037
newbalanceDest    0.009942
oldbalanceDest   -0.060313
newbalanceOrig   -0.222409
isFlaggedFraud         NaN
Name: isFraud, dtype: float64


<Figure size 1200x1000 with 0 Axes>