### PySpark reading a file form a CSV

In [1]:
# import warnings
warnings.filterwarnings("ignore")

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("BigData") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()


24/03/18 10:04:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
### Basic - EDA

In [4]:
df_emails = spark.read.format("csv").option("header", "true").load("hdfs://localhost:8020/user1/emails.csv")
df_emails.show()


[Stage 0:>                                                          (0 + 1) / 1]                                                                                

+--------------------+--------------------+
|                file|             message|
+--------------------+--------------------+
|allen-p/_sent_mai...|Message-ID: <1878...|
|           Date: Mon| 14 May 2001 16:3...|
|From: phillip.all...|                null|
|To: tim.belden@en...|                null|
|           Subject: |                null|
|   Mime-Version: 1.0|                null|
|Content-Type: tex...|                null|
|Content-Transfer-...|                null|
|X-From: Phillip K...|                null|
|X-To: Tim Belden ...|                null|
|              X-cc: |                null|
|             X-bcc: |                null|
|X-Folder: \Philli...| Phillip K.\'Sent...|
|   X-Origin: Allen-P|                null|
|X-FileName: palle...|                null|
|Here is our forecast|                null|
|                   "|                null|
|allen-p/_sent_mai...|Message-ID: <1546...|
|           Date: Fri| 4 May 2001 13:51...|
|From: phillip.all...|          

In [5]:
# Describe provides summary statistics of numeric columns in a DataFrame
df_emails.describe().show()




+-------+--------------------+--------+
|summary|                file| message|
+-------+--------------------+--------+
|  count|             8299853| 2508249|
|   mean|                 NaN|Infinity|
| stddev|                 NaN|     NaN|
|    min|                  \t|      \t|
|    max|~~~~~~~~~~~~~~~~~...|       ||
+-------+--------------------+--------+



                                                                                

In [6]:
# Show the first few rows
df_emails.show(n=20)


+--------------------+--------------------+
|                file|             message|
+--------------------+--------------------+
|allen-p/_sent_mai...|Message-ID: <1878...|
|           Date: Mon| 14 May 2001 16:3...|
|From: phillip.all...|                null|
|To: tim.belden@en...|                null|
|           Subject: |                null|
|   Mime-Version: 1.0|                null|
|Content-Type: tex...|                null|
|Content-Transfer-...|                null|
|X-From: Phillip K...|                null|
|X-To: Tim Belden ...|                null|
|              X-cc: |                null|
|             X-bcc: |                null|
|X-Folder: \Philli...| Phillip K.\'Sent...|
|   X-Origin: Allen-P|                null|
|X-FileName: palle...|                null|
|Here is our forecast|                null|
|                   "|                null|
|allen-p/_sent_mai...|Message-ID: <1546...|
|           Date: Fri| 4 May 2001 13:51...|
|From: phillip.all...|          

In [7]:
from pyspark.sql.functions import regexp_extract

# Example regular expression pattern for a date in the format "E, dd MMM yyyy HH:mm:ss Z"
# Adjust this pattern to match the actual format found in your 'message' data
date_pattern = r'\bMon, \d{2} \w{3} \d{4} \d{2}:\d{2}:\d{2} -\d{4} \(PDT\)'

# Create a new column 'ExtractedDate' by extracting the date string from 'message'
df_emails = df_emails.withColumn("ExtractedDate", regexp_extract("message", date_pattern, 0))

# Show the result of extraction
df_emails.select("ExtractedDate").show(truncate=False, n=5)


+-------------+
|ExtractedDate|
+-------------+
|             |
|             |
|null         |
|null         |
|null         |
+-------------+
only showing top 5 rows



In [8]:
##Preprocessing data

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace

# Assuming SparkSession has already been created
# spark = SparkSession.builder.appName("EmailsAnalysis").getOrCreate()

# Make sure df_emails is properly defined here
# df_emails = spark.read.format("csv").option("header", "true").load("hdfs://localhost:8020/user1/emails.csv")

# Convert the text to lower case
df_emails = df_emails.withColumn("message_clean", lower(col("message")))

# Remove email metadata, non-letter characters, and extra spaces
df_emails = df_emails.withColumn("message_clean", regexp_replace("message_clean", "^(From:|To:|Subject:|Mime-Version:|Content-Type:|Content-Transfer-Encoding:|X-From:|X-To:|X-cc:|X-bcc:|X-Folder:|X-Origin:|X-FileName:).*", ""))
df_emails = df_emails.withColumn("message_clean", regexp_replace("message_clean", "[^a-zA-Z\\s]", ""))
df_emails = df_emails.withColumn("message_clean", regexp_replace("message_clean", "\s+", " "))

# Show the cleaned text
df_emails.select("message_clean").show(truncate=False, n=5)


+----------------------------+
|message_clean               |
+----------------------------+
|messageid javamailevansthyme|
| may pdt                    |
|null                        |
|null                        |
|null                        |
+----------------------------+
only showing top 5 rows



In [10]:
### Step 3: Feature Engineering and Vectorization

In [11]:
from pyspark.sql.functions import col, lower, regexp_replace, to_timestamp
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml import Pipeline

# Ensure df_emails is correctly defined and available at this point in your code

# Filter out rows where 'message_clean' is null or an empty string
df_emails_filtered = df_emails.filter(col("message_clean").isNotNull() & (col("message_clean") != ""))

# Define the stages of the pipeline
tokenizer = Tokenizer(inputCol="message_clean", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")

pipeline = Pipeline(stages=[tokenizer, remover, cv, idf])

# Apply the pipeline to the filtered DataFrame
model = pipeline.fit(df_emails_filtered)
result = model.transform(df_emails_filtered)

# Show the result
result.select("features").show(truncate=False)


24/03/18 10:05:20 WARN DAGScheduler: Broadcasting large task binary with size 1803.5 KiB
                                                                                

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                                                                                                                                                                                                                                                            

24/03/18 10:05:47 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB


##### Your model successfully transformed the textual data into numerical vectors that can be used for machine learning purposes, including neural network models for detecting suspicious messages. The warning about broadcasting a large task binary size is an indication of the data size being processed but is generally not a concern unless it leads to performance issues.

## Step 4: Designing the Neural Network which come  Before moving on to training a neural network model, we'll need to prepare your dataset further, including splitting it into training and test sets, and potentially normalizing the feature vectors 

In [12]:
# Split the data
(train_data, test_data) = result.randomSplit([0.8, 0.2], seed=42)


In [13]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

# Example UDF to label emails based on the presence of a "suspicious keyword"
def label_email(content):
    if content is not None and "cash" in content:
        return 1
    else:
        return 0

# Register the UDF
label_udf = udf(label_email, IntegerType())

# Assuming 'result' is your DataFrame and 'message_clean' is the column containing the cleaned email text
# Apply the UDF to create a new column 'label'
result = result.withColumn('label', label_udf(col('message_clean')))

# Now proceed with data preparation steps such as splitting the dataset into training and test sets
(train_data, test_data) = result.randomSplit([0.8, 0.2], seed=42)

# Ensure your model training code below is correctly referring to 'features' and 'label' columns
# For example:
from pyspark.ml.classification import LogisticRegression

# Initialize the classifier, assuming 'features' column contains vectorized features
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# Train the model on the training data
lrModel = lr.fit(train_data)

# Make predictions on the test data
predictions = lrModel.transform(test_data)

# Evaluate the model if necessary
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label")
auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})

print(f"Test AUC: {auc}")


24/03/18 10:05:48 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:06:50 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:07:51 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:07:51 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:07:52 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:07:52 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:07:53 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:07:53 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:07:53 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:07:54 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:07:54 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/18 10:07:55 WARN DAGScheduler: Broadcasting larg

Test AUC: 0.9982872399017843


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Placeholder: Load your data here
# For example, let's assume you've loaded and prepared your datasets into these variables
# X_train, X_test, y_train, y_test = load_and_preprocess_your_data()

# Example placeholder data - replace with your actual data
X_train = np.random.randint(0, 10000, (1000, 100))  # Random data for illustration
y_train = np.random.randint(0, 2, (1000, ))  # Random binary labels
X_test = np.random.randint(0, 10000, (200, 100))  # Random data for illustration
y_test = np.random.randint(0, 2, (200, ))  # Random binary labels

# Define your LSTM model architecture
model = Sequential([
    Embedding(input_dim=10000,  # Size of your vocabulary
              output_dim=128,  # Dimension of the dense embedding
              input_length=100),  # Length of input sequences
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Assuming binary classification
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=64, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

# Save the model for later use
model.save('path_to_my_model.h5')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 0.5199999809265137


In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop
import numpy as np

# Placeholder: Load your data here

# Example placeholder data - replace with your actual data
X_train = np.random.randint(0, 10000, (1000, 100))  # Random data for illustration
y_train = np.random.randint(0, 2, (1000, ))  # Random binary labels
X_test = np.random.randint(0, 10000, (200, 100))  # Random data for illustration
y_test = np.random.randint(0, 2, (200, ))  # Random binary labels

# Define your LSTM model architecture
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.2),  # Increased complexity and added dropout
    LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Using RMSprop optimizer and setting a learning rate
optimizer = RMSprop(lr=0.001)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Train the model with early stopping
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

# Save the model for later use
model.save('path_to_my_model.h5')




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.4650000035762787


In [21]:
from tensorflow.keras.models import load_model

model = load_model('path_to_my_model.h5')


In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import pandas as pd

# Assuming you have a loaded Keras Tokenizer and your model
# tokenizer = ...
model = load_model('path_to_my_model.h5')

# Convert Spark DataFrame to Pandas DataFrame for Keras compatibility (if starting from Spark)
df_emails_pd = df_emails.toPandas()

# Tokenize and pad the email texts (now using Keras Tokenizer)
sequences = tokenizer.texts_to_sequences(df_emails_pd['message_clean'])
data = pad_sequences(sequences, maxlen=max_length)  # Ensure `max_length` matches your training data's sequence length

# Predict with the model
predictions = model.predict(data)

# Interpret predictions and add to DataFrame
df_emails_pd['Classification'] = ['Suspicious email detected!' if pred > 0.5 else 'Normal conversation.' for pred in predictions.flatten()]

# If you need to work with Spark DataFrames afterward, you can convert back
df_emails_updated = spark.createDataFrame(df_emails_pd)


                                                                                

Py4JJavaError: An error occurred while calling o70.collectToPython.
: java.lang.OutOfMemoryError: GC overhead limit exceeded
	at org.apache.spark.sql.execution.SparkPlan$$anon$1._next(SparkPlan.scala:415)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.getNext(SparkPlan.scala:426)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.getNext(SparkPlan.scala:412)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.util.NextIterator.foreach(NextIterator.scala:21)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeCollect$1(SparkPlan.scala:449)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeCollect$1$adapted(SparkPlan.scala:448)
	at org.apache.spark.sql.execution.SparkPlan$$Lambda$4438/1657540594.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:4036)
	at org.apache.spark.sql.Dataset$$Lambda$4436/1774544996.apply(Unknown Source)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4206)
	at org.apache.spark.sql.Dataset$$Lambda$1997/1826484949.apply(Unknown Source)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4204)
	at org.apache.spark.sql.Dataset$$Lambda$1658/1223871856.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda$1672/529568201.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda$1659/664329173.apply(Unknown Source)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4204)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:4033)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)


In [None]:
# sc master - running locally
sc.master

In [None]:
# The inferred schema can be visualized using the printSchema() method
df.printSchema()

In [None]:
def parse_message(message):
    # Initialize a dictionary to hold the parsed data
    parsed_data = {
        "MessageID": "",
        "Date": "",
        "From": "",
        "To": "",
        "Subject": ""
    }
    
    # Split the message into lines for processing
    lines = message.split("\n")
    for line in lines:
        if line.startswith("Message-ID:"):
            parsed_data["MessageID"] = line[len("Message-ID:"):].strip()
        elif line.startswith("Date:"):
            parsed_data["Date"] = line[len("Date:"):].strip()
        elif line.startswith("From:"):
            parsed_data["From"] = line[len("From:"):].strip()
        elif line.startswith("To:"):
            parsed_data["To"] = line[len("To:"):].strip()
        elif line.startswith("Subject:"):
            parsed_data["Subject"] = line[len("Subject:"):].strip()
        # Add more conditions as needed for other fields
    
    return parsed_data


In [None]:
def show_first_10_messages(df):
    # Assuming 'df' is your DataFrame and it has a column named 'message'
    # This will show the first 10 rows of the 'message' column
    df.select("message").show(10, truncate=False)

# Call the function with your DataFrame
show_first_10_messages(df)


In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType

# Define your schema as before
schema = StructType([
    StructField("MessageID", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("From", StringType(), True),
    StructField("To", StringType(), True),
    StructField("Subject", StringType(), True),
    # Add other fields as necessary
])

def parse_message(message):
    # Initialize the dictionary with default empty strings
    parsed_data = {
        "MessageID": "",
        "Date": "",
        "From": "",
        "To": "",
        "Subject": ""
    }
    
    # Proceed only if message is not None and is a string
    if message and isinstance(message, str):
        lines = message.split('\n')
        for line in lines:
            if line.startswith("Message-ID:"):
                parsed_data["MessageID"] = line.split(":", 1)[1].strip()
            elif line.startswith("Date:"):
                parsed_data["Date"] = line.split(":", 1)[1].strip()
            elif line.startswith("From:"):
                parsed_data["From"] = line.split(":", 1)[1].strip()
            elif line.startswith("To:"):
                parsed_data["To"] = line.split(":", 1)[1].strip()
            elif line.startswith("Subject:"):
                parsed_data["Subject"] = line.split(":", 1)[1].strip()
            # Continue with other headers as needed
    
    return parsed_data

# Register the UDF with the modified parse_message function
parse_message_udf = udf(parse_message, schema)

# Apply the UDF to your DataFrame as before
df_parsed = df.withColumn("parsed_message", parse_message_udf(df["message"]))


In [None]:
def show_specific_message(df, index):
    """
    Displays a specific message from the DataFrame based on the provided index.

    Parameters:
    - df: The Spark DataFrame containing the messages.
    - index: The index (row number) of the message to display.
    """
    # Ensure the DataFrame has a column named 'message'
    if 'message' in df.columns:
        # Collect the row of interest into a list
        message_row = df.select("message").collect()[index]
        
        # Extract the message from the row and print it
        message_content = message_row["message"]
        print(f"Message at index {index}:\n{message_content}")
    else:
        print("The DataFrame does not contain a column named 'message'.")

# Example usage:
# Assuming 'df' is your DataFrame and you want to see the first message
show_specific_message(df, 0)


In [None]:
import pandas as pd
import csv
import re

# Load the dataset with the correct parameters for handling potential parsing issues
df_emails = pd.read_csv('emails.csv', quoting=csv.QUOTE_NONE, on_bad_lines='skip', escapechar="\\")

# Removing quotes from column names if they exist
df_emails.columns = df_emails.columns.str.replace('"', '')

# Handling Missing Values
df_emails.fillna('', inplace=True)

# Text Preprocessing Function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove email headers or unnecessary metadata (for demonstration, might need customization)
    text = re.sub(r'^[a-z]+:.*$', '', text)  # Remove lines that start with metadata-like patterns
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
    # Remove special characters (customize based on the dataset and needs)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.strip()

# Apply text preprocessing to the 'message' column
df_emails['message'] = df_emails['message'].apply(preprocess_text)

# Display the first few rows of the cleaned dataframe
df_emails.head(15)


In [None]:
import pandas as pd

# Mock example of a structured dataframe
data = {
    'body': ['This is the first email content.', 'Here is another email, potentially suspicious.', 'This email is safe and informative.']
}
emails_structured_df = pd.DataFrame(data)

# Assuming the vectorization function from the previous message, apply it here:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_texts(texts):
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        max_features=10000,
        min_df=1,
        max_df=0.9
    )
    X = vectorizer.fit_transform(texts)
    return X, vectorizer

# Apply vectorization to the 'body' column of emails_structured_df
texts = emails_structured_df['body'].tolist()
X, vectorizer = vectorize_texts(texts)

print("Vectorization Complete. Shape of X:", X.shape)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB

def evaluate_model(X, y):
    """
    Splits the data into training and testing sets, trains a model, and evaluates its performance.
    
    Parameters:
    X (sparse matrix): The feature matrix obtained from vectorizing the text data.
    y (array-like): The target labels indicating the class of each document.
    
    Returns:
    A dictionary containing the model's performance metrics: accuracy, precision, recall, and F1 score.
    """
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Initialize and train the model
    model = MultinomialNB()
    model.fit(X_train, y_train)
    
    # Make predictions on the testing set
    y_pred = model.predict(X_test)
    
    # Calculate performance metrics
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1_score': f1_score(y_test, y_pred, average='weighted')
    }
    
    return metrics

# Example usage
# Assume y is your array of labels for the dataset, with 1 indicating suspicious and 0 indicating not suspicious
# y = [1, 0, 1, ...]  # This should be the actual labels for your dataset
# metrics = evaluate_model(X, y)
# print(metrics)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def create_model(input_dim):
    # Create a Sequential model
    model = Sequential()
    # Add layers to the model
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid')) # Use 'sigmoid' for binary classification
    
    # Compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'Precision', 'Recall'])
    return model


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Mock data for demonstration purposes
# Replace these with your actual vectorized data (X) and labels (y)
X = np.random.rand(100, 20)  # Example feature matrix with 100 samples and 20 features
y = np.random.randint(2, size=100)  # Example binary labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
def create_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Sigmoid activation for binary classification
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',  # Use binary_crossentropy for binary classification
                  metrics=['accuracy', 'Precision', 'Recall'])
    return model


In [None]:
# Now that X_train is defined, we can proceed to use it
input_dim = X_train.shape[1]  # Number of features from the vectorized data
model = create_model(input_dim)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))


In [None]:
# Example new data
new_data = ["This is a new email conversation.", "Another suspicious email detected!", "Normal conversation."]

# Preprocess and vectorize new data
new_data_processed = [preprocess_text(text) for text in new_data]  # Using the same preprocess_text function from before
new_data_vectorized = vectorizer.transform(new_data_processed)  # Use the same vectorizer fitted on the training data


In [None]:
def prepare_and_predict(new_data, vectorizer, model):
    # Preprocess new data
    new_data_processed = [preprocess_text(text) for text in new_data]
    
    # Vectorize new data using the same vectorizer instance used for training
    new_data_vectorized = vectorizer.transform(new_data_processed)
    
    # IMPORTANT: Artificially ensure the shape matches the expected input of the model
    # This step is hypothetical and serves to illustrate the concept
    # In practice, ensure your data vectorization matches the training phase accurately
    if new_data_vectorized.shape[1] < 20:
        # Assuming the missing features can be set to 0 (this is a strong assumption and may not be valid)
        additional_zeros = np.zeros((new_data_vectorized.shape[0], 20 - new_data_vectorized.shape[1]))
        new_data_vectorized = np.hstack((new_data_vectorized.toarray(), additional_zeros))
    
    predictions = model.predict(new_data_vectorized)
    predicted_probabilities = predictions.flatten()
    return predicted_probabilities

# Example usage
new_data = ["This is a new email conversation.", "Another suspicious email detected!", "Normal conversation."]
predicted_probabilities = prepare_and_predict(new_data, vectorizer, model)
print(predicted_probabilities)


In [None]:
import matplotlib.pyplot as plt

def visualize_predictions(emails, probabilities):
    """
    Visualizes the predicted probabilities of emails being suspicious or fraudulent.

    Parameters:
    - emails: A list of email texts or subjects being analyzed.
    - probabilities: A list of probabilities corresponding to the likelihood of each email being suspicious.
    """
    # Ensure the lists have the same length
    assert len(emails) == len(probabilities), "Emails and probabilities lists must have the same length."

    # Creating the bar plot
    plt.figure(figsize=(10, 6))
    plt.barh(emails, probabilities, color='skyblue')
    plt.xlabel('Probability of Being Suspicious')
    plt.title('Predicted Probabilities of Emails Being Suspicious or Fraudulent')
    for index, value in enumerate(probabilities):
        plt.text(value, index, f"{value:.2f}")
    plt.xlim(0, 1)  # Assuming probabilities range from 0 to 1
    plt.show()

# Example usage:
emails = ["New email conversation", "Suspicious email detected", "Normal conversation"]
probabilities = [0.34725702, 0.08264993, 0.34725702]
visualize_predictions(emails, probabilities)


In [None]:
def visualize_fraud_predictions(email_texts, predictions):
    """
    Visualize the emails with their predicted fraud probabilities.

    Parameters:
    - email_texts: List of email text content.
    - predictions: List of predicted probabilities corresponding to the fraud likelihood of each email.

    The function doesn't return anything but prints each email with its fraud prediction.
    """
    for email, probability in zip(email_texts, predictions):
        print("Email Content:\n", email)
        print("Fraud Likelihood: {:.2%}".format(probability))
        print("-" * 100)

# Example usage
email_texts = [
    "This is a new email conversation.",
    "Another suspicious email detected!",
    "Normal conversation."
]
predicted_probabilities = [0.34725702, 0.08264993, 0.34725702]

visualize_fraud_predictions(email_texts, predicted_probabilities)


In [None]:
def filter_emails_by_similarity_and_likelihood(emails, similarity_threshold=0.5, likelihood_threshold=8.0):
    """
    Filters emails based on content similarity to a given phrase and a likelihood threshold.
    
    Parameters:
    - emails: List of dictionaries, where each dictionary contains 'content' and 'likelihood' keys.
    - similarity_threshold: A threshold for determining content similarity (not used in this simple example).
    - likelihood_threshold: The minimum likelihood score for an email to be considered suspicious.
    
    Returns:
    - A list of emails considered suspicious based on the likelihood threshold.
    """
    suspicious_phrase = "Another suspicious email detected!"
    filtered_emails = [email for email in emails if suspicious_phrase in email['content'] and email['likelihood'] >= likelihood_threshold]
    return filtered_emails

# Example usage:
emails = [
    {'content': "This is a normal email content.", 'likelihood': 2.0},
    {'content': "Another suspicious email detected! Please check it out.", 'likelihood': 8.26},
    {'content': "Another suspicious email detected! This seems like a scam.", 'likelihood': 9.5},
    {'content': "This is another normal conversation.", 'likelihood': 3.2}
]

# Filtering emails:
suspicious_emails = filter_emails_by_similarity_and_likelihood(emails, likelihood_threshold=8.0)

# Displaying the filtered, suspicious emails:
for email in suspicious_emails:
    print(f"Email Content: {email['content']}")
    print(f"Fraud Likelihood: {email['likelihood']}%")
    print("-"*80)
