# Scam Detection Machine Learning Model

In [12]:
import pandas as pd
import numpy as np
import random
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler

### Generating a Toy Dataset for Scams

In [13]:
pip install faker

Note: you may need to restart the kernel to use updated packages.


In [23]:
import numpy as np
import random
from datetime import datetime, timedelta
import pandas as pd
from faker import Faker

fake = Faker()

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Define number of records
num_records = 10000

# Helper functions for generating data
def generate_transaction_amount(is_scam):
    if is_scam:
        return np.round(np.random.uniform(50, 5000), 2)  # Larger amounts for scams
    else:
        return np.round(np.random.uniform(10, 1000), 2)  # Smaller amounts for normal transactions

def generate_transaction_datetime():
    # Simulate more transactions during weekdays and less on weekends
    weekday_prob = 0.8
    if random.random() < weekday_prob:
        # Random weekday time
        day = np.random.randint(0, 5)  # Monday to Friday
        hour = np.random.randint(8, 18)  # Business hours
    else:
        # Random weekend time
        day = np.random.randint(5, 7)  # Saturday and Sunday
        hour = np.random.randint(8, 20)  # Wider range on weekends
    minute = np.random.randint(0, 60)
    second = np.random.randint(0, 60)
    return datetime(2023, 1, 1) + timedelta(days=day, hours=hour, 
                                            minutes=minute, seconds=second)

def generate_is_scam_prob():
    # Generate a probability for a transaction to be a scam
    return np.random.uniform(0, 1)

# Function to generate scam names based on certain patterns
def generate_scam_name():
    return fake.first_name() + " " + fake.last_name_nonbinary()

# Function to generate non-scam names based on common naming conventions
def generate_non_scam_name():
    return fake.first_name() + " " + fake.last_name()

# Function to generate names based on scam status
def generate_name(is_scam):
    if is_scam:
        return generate_scam_name()
    else:
        return generate_non_scam_name()

# Function to generate scam messages
def generate_scam_message():
    return fake.sentence(nb_words=6) + " Urgent action required!"

# Function to generate non-scam messages
def generate_non_scam_message():
    return "Thank you for your recent transaction. Your account balance is updated."

# Function to generate messages based on scam status
def generate_message(is_scam):
    if is_scam:
        return generate_scam_message()
    else:
        return generate_non_scam_message()

# Function to determine the scam possibility based on the probability
def determine_scam_possibility(prob):
    if 0.9 < prob < 1:
        return 'Highest'
    elif 0.5 < prob < 0.8:
        return 'Moderate'
    elif prob <= 0.5:
        return 'Low'
    else:
        return 'Lowest'

# Generate data
data = []
for _ in range(num_records):
    is_scam_prob = generate_is_scam_prob()
    transaction_amount = generate_transaction_amount(is_scam_prob > 0.5)
    transaction_datetime = generate_transaction_datetime()
    recipient_name = generate_name(is_scam_prob > 0.5)
    sender_name = generate_name(is_scam_prob > 0.5)
    message = generate_message(is_scam_prob > 0.5)
    scam_possibility = determine_scam_possibility(is_scam_prob)
    data.append({
        'transaction_amount': transaction_amount,
        'transaction_datetime': transaction_datetime,
        'is_scam_prob': is_scam_prob,
        'scam_possibility': scam_possibility,
        'recipient_name': recipient_name,
        'sender_name': sender_name,
        'message': message
    })

# Create DataFrame
df = pd.DataFrame(data)

# Save the toy dataset to CSV
toy_dataset_path = 'toy_dataset_with_probabilities.csv'
df.to_csv(toy_dataset_path, index=False)

print(f"Toy dataset saved to {toy_dataset_path}")

Toy dataset saved to toy_dataset_with_probabilities.csv


In [24]:
data = df

In [25]:
data.head()

Unnamed: 0,transaction_amount,transaction_datetime,is_scam_prob,scam_possibility,recipient_name,sender_name,message
0,951.21,2023-01-03 15:20:38,0.37454,Low,Michele Davenport,Victoria Bennett,Thank you for your recent transaction. Your ac...
1,108.98,2023-01-03 15:52:35,0.445833,Low,Courtney Alvarez,Kara Morrow,Thank you for your recent transaction. Your ac...
2,654.38,2023-01-05 09:23:43,0.142867,Low,Anthony Hardin,William Reed,Thank you for your recent transaction. Your ac...
3,53.85,2023-01-04 12:32:11,0.938553,Highest,Joanna Shepherd,Thomas West,Center meeting million machine. Urgent action ...
4,84.98,2023-01-01 08:26:58,0.611653,Moderate,Justin Guerra,Micheal Harris,Case meet improve know dog cost down. Urgent a...


In [17]:
import gensim
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Preprocess the text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing to the text columns
data['message_tokens'] = data['message'].apply(preprocess_text)
data['recipient_name_tokens'] = data['recipient_name'].apply(preprocess_text)
data['sender_name_tokens'] = data['sender_name'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# Train a Word2Vec model on the tokens
all_tokens = data['message_tokens'].tolist() + data['recipient_name_tokens'].tolist() + data['sender_name_tokens'].tolist()
word2vec_model = gensim.models.Word2Vec(all_tokens, vector_size=100, window=5, min_count=1, workers=4)

# Function to get the embedding for a text
def get_text_embedding(tokens, model):
    embedding = np.mean([model.wv[word] for word in tokens if word in model.wv], axis=0)
    if isinstance(embedding, np.ndarray):
        return embedding
    else:
        return np.zeros(model.vector_size)

# Get embeddings for the text columns
data['message_embedding'] = data['message_tokens'].apply(lambda x: get_text_embedding(x, word2vec_model).tolist())
data['recipient_name_embedding'] = data['recipient_name_tokens'].apply(lambda x: get_text_embedding(x, word2vec_model).tolist())
data['sender_name_embedding'] = data['sender_name_tokens'].apply(lambda x: get_text_embedding(x, word2vec_model).tolist())

# Drop the token columns
data = data.drop(columns=['message_tokens', 'recipient_name_tokens', 'sender_name_tokens'])


In [19]:
# Convert embedding columns from lists to arrays of floats
data['message_embedding'] = data['message_embedding'].apply(lambda x: np.array(x, dtype=np.float32))
data['recipient_name_embedding'] = data['recipient_name_embedding'].apply(lambda x: np.array(x, dtype=np.float32))
data['sender_name_embedding'] = data['sender_name_embedding'].apply(lambda x: np.array(x, dtype=np.float32))

# Expand arrays into separate columns
message_embedding_data = pd.DataFrame(data['message_embedding'].tolist(), index=data.index).add_prefix('message_embedding_')
recipient_name_embedding_data = pd.DataFrame(data['recipient_name_embedding'].tolist(), index=data.index).add_prefix('recipient_name_embedding_')
sender_name_embedding_data = pd.DataFrame(data['sender_name_embedding'].tolist(), index=data.index).add_prefix('sender_name_embedding_')

# Concatenate embeddings into the original dataframe
data = pd.concat([data, message_embedding_data, recipient_name_embedding_data, sender_name_embedding_data], axis=1)

# Drop the original embedding columns
data = data.drop(columns=['message_embedding', 'recipient_name_embedding', 'sender_name_embedding'])


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 307 entries, transaction_amount to sender_name_embedding_99
dtypes: datetime64[ns](1), float32(300), float64(2), object(4)
memory usage: 12.0+ MB


In [21]:
# Identify columns with object and datetime64[ns] data types
columns_to_drop = data.select_dtypes(include=['object', 'datetime64[ns]']).columns

# Drop the identified columns
data = data.drop(columns=columns_to_drop)

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 302 entries, transaction_amount to sender_name_embedding_99
dtypes: float32(300), float64(2)
memory usage: 11.6 MB


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Prepare features and target variable
X = data.drop(columns=['is_scam'])
y = data['is_scam']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


KeyError: "['is_scam'] not found in axis"