# Scam Detection Machine Learning Model

In [12]:
import pandas as pd
import numpy as np
import random
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler

### Generating a Toy Dataset for Scams

In [13]:
pip install faker

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Sample data generation
np.random.seed(42)

# Generating 1000 records
num_records = 1000
names = ['John', 'Harry', 'Alice', 'Bob', 'Charlie', 'David']
transactions = []

# Generate timestamps
start_date = datetime.now() - timedelta(days=365)
timestamps = [start_date + timedelta(seconds=int(x)) for x in np.random.randint(0, 365*24*3600, num_records)]

# Initialize a dictionary to keep track of transaction frequency per user
transaction_count = {name: 0 for name in names}

for i in range(num_records):
    sender = np.random.choice(names)
    recipient = np.random.choice(names)
    while recipient == sender:
        recipient = np.random.choice(names)
    
    amount = np.random.randint(100, 10000)
    transaction_frequency = transaction_count[sender] + 1  # Increment transaction frequency for sender
    is_scam = 0

    # More realistic scam flagging
    if sender in ['John', 'Harry']:
        if sender == 'John' and transaction_frequency == 7 and recipient in ['Harry', 'Alice']:
            is_scam = 1
        elif sender == 'Harry' and transaction_frequency == 7 and recipient in ['John', 'Bob']:
            is_scam = 1
    else:
        if transaction_frequency > 10 and amount > 5000:
            is_scam = 1

    transactions.append([timestamps[i], sender, recipient, amount, transaction_frequency, is_scam])
    
    # Update transaction count
    transaction_count[sender] += 1

# Create a DataFrame
columns = ['Timestamp', 'Sender', 'Recipient', 'Amount', 'Transaction_Frequency', 'Is_Scam']
df = pd.DataFrame(transactions, columns=columns)

# Feature engineering on timestamp
df['Day'] = df['Timestamp'].dt.day
df['Month'] = df['Timestamp'].dt.month
df['Year'] = df['Timestamp'].dt.year
df['Hour'] = df['Timestamp'].dt.hour
df['Minute'] = df['Timestamp'].dt.minute
df['Second'] = df['Timestamp'].dt.second

# Drop the Timestamp column
df = df.drop(columns=['Timestamp'])
df.head()

Unnamed: 0,Sender,Recipient,Amount,Transaction_Frequency,Is_Scam,Day,Month,Year,Hour,Minute,Second
0,Charlie,David,1560,1,0,25,2,2024,14,6,10
1,Charlie,Harry,4278,2,0,5,12,2023,8,10,44
2,John,Charlie,7345,1,0,27,2,2024,1,26,56
3,John,David,2148,2,0,7,4,2024,22,12,13
4,Alice,Charlie,816,1,0,1,2,2024,1,32,34


In [24]:
data = df

In [25]:
data.head()

Unnamed: 0,transaction_amount,transaction_datetime,is_scam_prob,scam_possibility,recipient_name,sender_name,message
0,951.21,2023-01-03 15:20:38,0.37454,Low,Michele Davenport,Victoria Bennett,Thank you for your recent transaction. Your ac...
1,108.98,2023-01-03 15:52:35,0.445833,Low,Courtney Alvarez,Kara Morrow,Thank you for your recent transaction. Your ac...
2,654.38,2023-01-05 09:23:43,0.142867,Low,Anthony Hardin,William Reed,Thank you for your recent transaction. Your ac...
3,53.85,2023-01-04 12:32:11,0.938553,Highest,Joanna Shepherd,Thomas West,Center meeting million machine. Urgent action ...
4,84.98,2023-01-01 08:26:58,0.611653,Moderate,Justin Guerra,Micheal Harris,Case meet improve know dog cost down. Urgent a...


In [17]:
import gensim
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Preprocess the text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing to the text columns
data['message_tokens'] = data['message'].apply(preprocess_text)
data['recipient_name_tokens'] = data['recipient_name'].apply(preprocess_text)
data['sender_name_tokens'] = data['sender_name'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# Train a Word2Vec model on the tokens
all_tokens = data['message_tokens'].tolist() + data['recipient_name_tokens'].tolist() + data['sender_name_tokens'].tolist()
word2vec_model = gensim.models.Word2Vec(all_tokens, vector_size=100, window=5, min_count=1, workers=4)

# Function to get the embedding for a text
def get_text_embedding(tokens, model):
    embedding = np.mean([model.wv[word] for word in tokens if word in model.wv], axis=0)
    if isinstance(embedding, np.ndarray):
        return embedding
    else:
        return np.zeros(model.vector_size)

# Get embeddings for the text columns
data['message_embedding'] = data['message_tokens'].apply(lambda x: get_text_embedding(x, word2vec_model).tolist())
data['recipient_name_embedding'] = data['recipient_name_tokens'].apply(lambda x: get_text_embedding(x, word2vec_model).tolist())
data['sender_name_embedding'] = data['sender_name_tokens'].apply(lambda x: get_text_embedding(x, word2vec_model).tolist())

# Drop the token columns
data = data.drop(columns=['message_tokens', 'recipient_name_tokens', 'sender_name_tokens'])


In [19]:
# Convert embedding columns from lists to arrays of floats
data['message_embedding'] = data['message_embedding'].apply(lambda x: np.array(x, dtype=np.float32))
data['recipient_name_embedding'] = data['recipient_name_embedding'].apply(lambda x: np.array(x, dtype=np.float32))
data['sender_name_embedding'] = data['sender_name_embedding'].apply(lambda x: np.array(x, dtype=np.float32))

# Expand arrays into separate columns
message_embedding_data = pd.DataFrame(data['message_embedding'].tolist(), index=data.index).add_prefix('message_embedding_')
recipient_name_embedding_data = pd.DataFrame(data['recipient_name_embedding'].tolist(), index=data.index).add_prefix('recipient_name_embedding_')
sender_name_embedding_data = pd.DataFrame(data['sender_name_embedding'].tolist(), index=data.index).add_prefix('sender_name_embedding_')

# Concatenate embeddings into the original dataframe
data = pd.concat([data, message_embedding_data, recipient_name_embedding_data, sender_name_embedding_data], axis=1)

# Drop the original embedding columns
data = data.drop(columns=['message_embedding', 'recipient_name_embedding', 'sender_name_embedding'])


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 307 entries, transaction_amount to sender_name_embedding_99
dtypes: datetime64[ns](1), float32(300), float64(2), object(4)
memory usage: 12.0+ MB


In [21]:
# Identify columns with object and datetime64[ns] data types
columns_to_drop = data.select_dtypes(include=['object', 'datetime64[ns]']).columns

# Drop the identified columns
data = data.drop(columns=columns_to_drop)

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 302 entries, transaction_amount to sender_name_embedding_99
dtypes: float32(300), float64(2)
memory usage: 11.6 MB


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Prepare features and target variable
X = data.drop(columns=['is_scam'])
y = data['is_scam']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


KeyError: "['is_scam'] not found in axis"