# In [1]:  (Jupyter notebook cell marker)
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import json


# Cleaning function to remove unwanted special symbols and delimiters
def clean_text(text):
    """Normalize one raw email text before TF-IDF vectorization.

    Runs of two or more underscores, hyphens, or whitespace characters are
    collapsed into a single space, every digit sequence is removed, and
    leading/trailing whitespace is stripped. A lone hyphen or underscore is
    left in place because the pattern only matches runs of length >= 2.

    Args:
        text: The raw text. Non-string values (``None`` or the float ``NaN``
            that pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string; ``''`` when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-string input, so a single missing cell
    # would abort a whole column-wise .apply(clean_text).
    if not isinstance(text, str):
        return ''
    # Collapse runs of underscores, hyphens, or whitespace into one space
    text = re.sub(r'[_\-\s]{2,}', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text.strip()

# Load the dataset into a pandas DataFrame
scam_df = pd.read_csv('phishing_email.csv')

# Clean all the text before applying TF-IDF.
# Missing cells are filled with an empty string *before* cleaning so that
# clean_text() only ever receives strings (re.sub raises TypeError on NaN).
scam_df['text_combined_cleaned'] = (
    scam_df['text_combined'].fillna('').apply(clean_text)
)

# Step 1: Data preprocessing
X = scam_df['text_combined_cleaned']  # Missing text was already replaced above
y = scam_df['label']

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Use TF-IDF to convert text to numerical features.
# The vectorizer is fitted on the training split only and then reused to
# transform the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 4: Train a logistic regression classification model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# Uncomment below to train a random forest classification model instead
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train_tfidf, y_train)

# Step 5: Evaluate the model
#y_pred = model.predict(X_test_tfidf)
#print(classification_report(y_test, y_pred))


def extract_important_keywords(model, tfidf_vectorizer, top_n):
    """Return the highest positively-weighted vocabulary terms of a linear model.

    Args:
        model: A fitted estimator. Only models exposing ``coef_`` (e.g.
            logistic regression) are handled; the coefficients are flattened
            to one dimension before ranking.
        tfidf_vectorizer: The fitted vectorizer whose
            ``get_feature_names_out()`` supplies the term for each
            coefficient index.
        top_n: Maximum number of keywords to return.

    Returns:
        A list of ``(keyword, coefficient)`` tuples, ordered from largest to
        smallest coefficient and restricted to strictly positive
        coefficients. An empty list is returned (after printing a notice)
        when the model has no ``coef_`` attribute.
    """
    # A feature_importances_ branch for tree-based models is currently
    # disabled, so such models take this unsupported path as well.
    if not hasattr(model, 'coef_'):
        print("Model type not supported for keyword extraction.")
        return []

    weights = model.coef_.flatten()

    # Keep only features with a strictly positive coefficient
    positive_idx = np.where(weights > 0)[0]

    # Rank the positive coefficients from largest to smallest and keep top_n
    descending_order = np.argsort(weights[positive_idx])[::-1]
    top_idx = positive_idx[descending_order[:top_n]]

    vocabulary = np.array(tfidf_vectorizer.get_feature_names_out())
    return list(zip(vocabulary[top_idx], weights[top_idx]))

# Rank the fitted model's positively-weighted terms and keep the 40 strongest,
# each paired with its coefficient
important_keywords_list = extract_important_keywords(
    model=model,
    tfidf_vectorizer=tfidf_vectorizer,
    top_n=40,
)

# Keywords grouped by the scam category they are assigned to. The order of
# categories and of words within each category fixes the insertion order of
# the flat lookup built below.
_words_by_category = {
    'Online Scam Phrases': (
        'josemonkeyorg', 'cnncom', 'http', 'click', 'remove', 'choose', 'site',
    ),
    'Financial Scam Phrases': (
        'investment', 'account', 'money', 'statements', 'payment', 'transfer',
        'approved', 'bank',
    ),
    'Healthcare Scam Phrases': (
        'viagra', 'pills', 'lose', 'health',
    ),
    'Counterfeit Product Phrases': (
        'guaranteed', 'replica', 'custom', 'huge', 'watches', 'quality',
        'rolex', 'cable',
    ),
    'Emotional Manipulation Phrases': (
        'love', 'professional', 'dear', 'sex', 'life',
    ),
}

# Flat word -> category lookup derived from the grouping above
word_category_mapping = {
    word: category
    for category, words in _words_by_category.items()
    for word in words
}

# Human-readable explanation of each scam category, keyed by category name.
# Each value is a single string; adjacent literals are concatenated by Python.
category_descriptions = {
    "Online Scam Phrases": (
        "Online Scam Phrases are commonly associated with scams that involve "
        "phishing websites, suspicious URLs, and prompts to click links. "
        "Scammers often trick users into visiting fraudulent sites by embedding "
        "links that appear legitimate but lead to malicious content."
    ),
    "Financial Scam Phrases": (
        "Financial Scam Phrases frequently appear in scams that target "
        "individuals by impersonating financial institutions or promoting fake "
        "investment opportunities. Scammers use these words to exploit trust "
        "and deceive victims into giving up money or sensitive banking "
        "information."
    ),
    "Healthcare Scam Phrases": (
        "Healthcare Scam Phrases are found in scams promoting counterfeit "
        "medications or fake health treatments. Common in email spam, these "
        "phrases lure victims with promises of weight loss, performance "
        "enhancement, or health benefits."
    ),
    "Counterfeit Product Phrases": (
        "Counterfeit Product Phrases are indicative of scams that involve "
        "counterfeit goods, particularly luxury items like watches and "
        "electronics. Scammers use enticing words such as 'replica' or "
        "'guaranteed' to promote fake products at attractive prices, often "
        "leading to low-quality or fraudulent purchases."
    ),
    "Emotional Manipulation Phrases": (
        "Emotional Manipulation Phrases are found in romance scams and "
        "personal appeals. Scammers often use emotionally charged language to "
        "create trust or a sense of urgency, manipulating victims into sending "
        "money or personal information under false pretenses."
    ),
}


# One entry per category: an initially empty word -> weight dict plus the
# category's description. Each entry gets its own fresh "words" dict.
scam_categories = {
    category_name: {"words": {}, "description": description}
    for category_name, description in category_descriptions.items()
}

# Place every extracted keyword into its category bucket together with its
# weight rounded to two decimals; keywords with no known category are skipped.
for word, weight in important_keywords_list:
    category = word_category_mapping.get(word)
    if category is None:
        continue
    scam_categories[category]["words"][word] = round(weight, 2)

# Write the categorized keywords out as indented JSON
with open('wordcould_content.json', 'w') as json_file:
    json.dump(scam_categories, json_file, indent=4)