### PySpark reading a file form a CSV

In [1]:
# import warnings
warnings.filterwarnings("ignore")

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("EmailSuspicionDetection").getOrCreate()


24/03/22 09:20:21 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:

# Assuming the file is in CSV format and has a header
df = spark.read.csv("hdfs:///user1/data_batch_00", header=True, inferSchema=True)
df.show()


                                                                                

+--------------------+--------------------+
|                file|             message|
+--------------------+--------------------+
|allen-p/_sent_mai...|Message-ID: <1878...|
|           Date: Mon| 14 May 2001 16:3...|
|From: phillip.all...|                null|
|To: tim.belden@en...|                null|
|           Subject: |                null|
|   Mime-Version: 1.0|                null|
|Content-Type: tex...|                null|
|Content-Transfer-...|                null|
|X-From: Phillip K...|                null|
|X-To: Tim Belden ...|                null|
|              X-cc: |                null|
|             X-bcc: |                null|
|X-Folder: \Philli...| Phillip K.\'Sent...|
|   X-Origin: Allen-P|                null|
|X-FileName: palle...|                null|
|Here is our forecast|                null|
|                   "|                null|
|allen-p/_sent_mai...|Message-ID: <1546...|
|           Date: Fri| 4 May 2001 13:51...|
|From: phillip.all...|          

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF

spark = SparkSession.builder.appName("EmailSuspicionDetection").getOrCreate()

# Assuming the file is in CSV format and has a header
df = spark.read.csv("hdfs:///user1/data_batch_00", header=True, inferSchema=True)

# Filtering out rows where the message column is null
df = df.filter(col("message").isNotNull())

# Tokenization
tokenizer = RegexTokenizer(inputCol="message", outputCol="words", pattern="\\W")
df_tokenized = tokenizer.transform(df)

# Remove stopwords
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df_cleaned = remover.transform(df_tokenized)

# Hashing term frequency
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(df_cleaned)

# Inverse document frequency
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)


                                                                                

In [19]:
# Convert the specific column of the DataFrame to a list
email_texts = df.select("message").rdd.flatMap(lambda x: x).collect()


                                                                                

In [21]:
tokenizer = Tokenizer(num_words=10000)  # This will keep the top 10,000 most frequent words
tokenizer.fit_on_texts(email_texts)

sequences = tokenizer.texts_to_sequences(email_texts)
X = pad_sequences(sequences, maxlen=100)


In [23]:
import numpy as np
from pyspark.sql import DataFrame

def prepare_labels(df: DataFrame, label_column_name: str) -> np.ndarray:
    """
    Extracts labels from a PySpark DataFrame and converts them into a NumPy array for model training.
    
    Parameters:
    - df: PySpark DataFrame containing the dataset.
    - label_column_name: The name of the column in df that contains the binary labels.
    
    Returns:
    - labels: A NumPy array of labels.
    """
    # Extract labels from the specified column and convert to a list
    labels_list = df.select(label_column_name).rdd.flatMap(lambda x: x).collect()
    
    # Convert the list of labels to a NumPy array
    labels = np.array(labels_list)
    
    return labels


In [26]:
from pyspark.sql import SparkSession

# Initialize a SparkSession
spark = SparkSession.builder \
    .appName("Email Classification") \
    .getOrCreate()

# Load your data from the file into a DataFrame
# Replace "/path/to/data_batch_00" with the actual HDFS path where your file is stored
email_df = spark.read.csv("hdfs:///user1/data_batch_00", header=True, inferSchema=True)


                                                                                

In [28]:
from pyspark.sql.functions import lit

# Add a dummy 'is_suspicious' column with all values set to 0
email_df = email_df.withColumn("is_suspicious", lit(0))

# Now, you can call the prepare_labels function as before
labels = prepare_labels(email_df, "is_suspicious")


                                                                                

In [30]:
import numpy as np
from pyspark.sql import DataFrame

def prepare_labels(df: DataFrame, label_column_name: str) -> np.ndarray:
    """
    Extracts labels from a PySpark DataFrame and converts them into a NumPy array for model training.
    
    Parameters:
    - df: PySpark DataFrame containing the dataset.
    - label_column_name: The name of the column in df that contains the binary labels.
    
    Returns:
    - labels: A NumPy array of labels.
    """
    # Extract labels from the specified column and convert to a list
    labels_list = df.select(label_column_name).rdd.flatMap(lambda x: x).collect()
    
    # Convert the list of labels to a NumPy array
    labels = np.array(labels_list)
    
    return labels

# Assuming your DataFrame `email_df` has a binary label column named "is_suspicious"
labels = prepare_labels(email_df, "is_suspicious")


                                                                                

In [32]:
print(len(X), len(labels))


773905 2649686


In [35]:
sequences = tokenizer.texts_to_sequences(email_texts)
X = pad_sequences(sequences, maxlen=100)


In [38]:
# Assuming df is your DataFrame and it's already been filtered/processed as needed

# Process df to create X (this should be your existing logic for tokenization and padding)
# [Place your logic here for creating X]

# Ensure labels are extracted from the same filtered/processed DataFrame
labels = prepare_labels(df, "your_label_column")


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `your_label_column` cannot be resolved. Did you mean one of the following? [`file`, `message`].;
'Project ['your_label_column]
+- Filter isnotnull(message#179)
   +- Relation [file#178,message#179] csv


In [36]:
from sklearn.model_selection import train_test_split

# Assuming X is your feature matrix and labels is the numpy array you prepared
X_train, X_val, y_train, y_val = train_test_split(X, labels, test_size=0.2, random_state=42)


ValueError: Found input variables with inconsistent numbers of samples: [773905, 2649686]

In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Example preprocessing steps
# Load your text data into `texts` and corresponding labels into `labels`
# For demonstration, let's assume `texts` is a list of email bodies and `labels` is a binary list indicating suspicious (1) or not (0)

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=100)

# Define the CNN model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Conv1D(filters=32, kernel_size=7, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(data, np.array(labels), test_size=0.2, random_state=42)

# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)


NameError: name 'texts' is not defined

In [13]:
from pyspark.ml.feature import HashingTF, IDF

# Hashing term frequency
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(df_cleaned)

# Inverse document frequency
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)


                                                                                

In [16]:
# Example of adding a binary label column, this is just a placeholder.
# You need to replace it with your logic for identifying suspicious emails.
from pyspark.sql.functions import expr

df_with_labels = df.withColumn("label", expr("CASE WHEN <your_condition_here> THEN 1 ELSE 0 END"))


ParseException: 
[PARSE_SYNTAX_ERROR] Syntax error at or near '1'.(line 1, pos 37)

== SQL ==
CASE WHEN <your_condition_here> THEN 1 ELSE 0 END
-------------------------------------^^^


In [15]:
from pyspark.sql.functions import col

# Assuming `is_suspicious` is your column indicating suspicious (1) or not (0)
# Rename this column to `label` for use in ML models
df_labeled = rescaledData.withColumnRenamed("is_suspicious", "label")

# Proceed with your existing train-test split and model training
splits = df_labeled.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

layers = [10000, 500, 100, 2]

trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
model = trainer.fit(train)

result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))


IllegalArgumentException: label does not exist. Available: file, message, words, filtered_words, rawFeatures, features

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("BigData") \
    .config("spark.driver.memory", "16g") \
    .getOrCreate()


24/03/22 08:06:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
### Basic - EDA

In [4]:
df_emails = spark.read.format("csv").option("header", "true").load("hdfs://localhost:8020/user1/emails.csv")
df_emails.show()


                                                                                

+--------------------+--------------------+
|                file|             message|
+--------------------+--------------------+
|allen-p/_sent_mai...|Message-ID: <1878...|
|           Date: Mon| 14 May 2001 16:3...|
|From: phillip.all...|                null|
|To: tim.belden@en...|                null|
|           Subject: |                null|
|   Mime-Version: 1.0|                null|
|Content-Type: tex...|                null|
|Content-Transfer-...|                null|
|X-From: Phillip K...|                null|
|X-To: Tim Belden ...|                null|
|              X-cc: |                null|
|             X-bcc: |                null|
|X-Folder: \Philli...| Phillip K.\'Sent...|
|   X-Origin: Allen-P|                null|
|X-FileName: palle...|                null|
|Here is our forecast|                null|
|                   "|                null|
|allen-p/_sent_mai...|Message-ID: <1546...|
|           Date: Fri| 4 May 2001 13:51...|
|From: phillip.all...|          

In [5]:
# Describe provides summary statistics of numeric columns in a DataFrame
df_emails.describe().show()




+-------+--------------------+--------+
|summary|                file| message|
+-------+--------------------+--------+
|  count|             8299853| 2508249|
|   mean|                 NaN|Infinity|
| stddev|                 NaN|     NaN|
|    min|                  \t|      \t|
|    max|~~~~~~~~~~~~~~~~~...|       ||
+-------+--------------------+--------+



                                                                                

In [6]:
# Show the first few rows
df_emails.show(n=20)


+--------------------+--------------------+
|                file|             message|
+--------------------+--------------------+
|allen-p/_sent_mai...|Message-ID: <1878...|
|           Date: Mon| 14 May 2001 16:3...|
|From: phillip.all...|                null|
|To: tim.belden@en...|                null|
|           Subject: |                null|
|   Mime-Version: 1.0|                null|
|Content-Type: tex...|                null|
|Content-Transfer-...|                null|
|X-From: Phillip K...|                null|
|X-To: Tim Belden ...|                null|
|              X-cc: |                null|
|             X-bcc: |                null|
|X-Folder: \Philli...| Phillip K.\'Sent...|
|   X-Origin: Allen-P|                null|
|X-FileName: palle...|                null|
|Here is our forecast|                null|
|                   "|                null|
|allen-p/_sent_mai...|Message-ID: <1546...|
|           Date: Fri| 4 May 2001 13:51...|
|From: phillip.all...|          

In [7]:
from pyspark.sql.functions import regexp_extract

# Example regular expression pattern for a date in the format "E, dd MMM yyyy HH:mm:ss Z"
# Adjust this pattern to match the actual format found in your 'message' data
date_pattern = r'\bMon, \d{2} \w{3} \d{4} \d{2}:\d{2}:\d{2} -\d{4} \(PDT\)'

# Create a new column 'ExtractedDate' by extracting the date string from 'message'
df_emails = df_emails.withColumn("ExtractedDate", regexp_extract("message", date_pattern, 0))

# Show the result of extraction
df_emails.select("ExtractedDate").show(truncate=False, n=5)


+-------------+
|ExtractedDate|
+-------------+
|             |
|             |
|null         |
|null         |
|null         |
+-------------+
only showing top 5 rows



In [8]:
##Preprocessing data

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace

# Assuming SparkSession has already been created
# spark = SparkSession.builder.appName("EmailsAnalysis").getOrCreate()

# Make sure df_emails is properly defined here
# df_emails = spark.read.format("csv").option("header", "true").load("hdfs://localhost:8020/user1/emails.csv")

# Convert the text to lower case
df_emails = df_emails.withColumn("message_clean", lower(col("message")))

# Remove email metadata, non-letter characters, and extra spaces
df_emails = df_emails.withColumn("message_clean", regexp_replace("message_clean", "^(From:|To:|Subject:|Mime-Version:|Content-Type:|Content-Transfer-Encoding:|X-From:|X-To:|X-cc:|X-bcc:|X-Folder:|X-Origin:|X-FileName:).*", ""))
df_emails = df_emails.withColumn("message_clean", regexp_replace("message_clean", "[^a-zA-Z\\s]", ""))
df_emails = df_emails.withColumn("message_clean", regexp_replace("message_clean", "\s+", " "))

# Show the cleaned text
df_emails.select("message_clean").show(truncate=False, n=5)


+----------------------------+
|message_clean               |
+----------------------------+
|messageid javamailevansthyme|
| may pdt                    |
|null                        |
|null                        |
|null                        |
+----------------------------+
only showing top 5 rows



In [10]:
### Step 3: Feature Engineering and Vectorization

In [11]:
from pyspark.sql.functions import col, lower, regexp_replace, to_timestamp
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml import Pipeline

# Ensure df_emails is correctly defined and available at this point in your code

# Filter out rows where 'message_clean' is null or an empty string
df_emails_filtered = df_emails.filter(col("message_clean").isNotNull() & (col("message_clean") != ""))

# Define the stages of the pipeline
tokenizer = Tokenizer(inputCol="message_clean", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")

pipeline = Pipeline(stages=[tokenizer, remover, cv, idf])

# Apply the pipeline to the filtered DataFrame
model = pipeline.fit(df_emails_filtered)
result = model.transform(df_emails_filtered)

# Show the result
result.select("features").show(truncate=False)


24/03/22 08:07:47 WARN DAGScheduler: Broadcasting large task binary with size 1803.5 KiB
                                                                                

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                                                                                                                                                                                                                                                            

24/03/22 08:08:18 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB


##### Your model successfully transformed the textual data into numerical vectors that can be used for machine learning purposes, including neural network models for detecting suspicious messages. The warning about broadcasting a large task binary size is an indication of the data size being processed but is generally not a concern unless it leads to performance issues.

## Step 4: Designing the Neural Network which come  Before moving on to training a neural network model, we'll need to prepare your dataset further, including splitting it into training and test sets, and potentially normalizing the feature vectors 

In [12]:
# Split the data
(train_data, test_data) = result.randomSplit([0.8, 0.2], seed=42)


In [13]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

# Example UDF to label emails based on the presence of a "suspicious keyword"
def label_email(content):
    if content is not None and "cash" in content:
        return 1
    else:
        return 0

# Register the UDF
label_udf = udf(label_email, IntegerType())

# Assuming 'result' is your DataFrame and 'message_clean' is the column containing the cleaned email text
# Apply the UDF to create a new column 'label'
result = result.withColumn('label', label_udf(col('message_clean')))

# Now proceed with data preparation steps such as splitting the dataset into training and test sets
(train_data, test_data) = result.randomSplit([0.8, 0.2], seed=42)

# Ensure your model training code below is correctly referring to 'features' and 'label' columns
# For example:
from pyspark.ml.classification import LogisticRegression

# Initialize the classifier, assuming 'features' column contains vectorized features
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# Train the model on the training data
lrModel = lr.fit(train_data)

# Make predictions on the test data
predictions = lrModel.transform(test_data)

# Evaluate the model if necessary
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label")
auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})

print(f"Test AUC: {auc}")

# Save the predictions to a CSV file
# Replace 'path/to/save/predictions.csv' with the actual path where you want to save the file
predictions.write.option("header", "true").csv('/mnt/data/predictions.csv')


24/03/22 08:08:19 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:09:28 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:10:36 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:10:36 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:10:37 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:10:38 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:10:39 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:10:39 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:10:40 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:10:40 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:10:41 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/03/22 08:10:42 WARN DAGScheduler: Broadcasting larg

Test AUC: 0.9982875492536734


AnalysisException: Column `words` has a data type of array<string>, which is not supported by CSV.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Placeholder: Load your data here
# For example, let's assume you've loaded and prepared your datasets into these variables
# X_train, X_test, y_train, y_test = load_and_preprocess_your_data()

# Example placeholder data - replace with your actual data
X_train = np.random.randint(0, 10000, (1000, 100))  # Random data for illustration
y_train = np.random.randint(0, 2, (1000, ))  # Random binary labels
X_test = np.random.randint(0, 10000, (200, 100))  # Random data for illustration
y_test = np.random.randint(0, 2, (200, ))  # Random binary labels

# Define your LSTM model architecture
model = Sequential([
    Embedding(input_dim=10000,  # Size of your vocabulary
              output_dim=128,  # Dimension of the dense embedding
              input_length=100),  # Length of input sequences
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Assuming binary classification
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=64, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

# Save the model for later use
model.save('path_to_my_model.h5')


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop
import numpy as np

# Placeholder: Load your data here

# Example placeholder data - replace with your actual data
X_train = np.random.randint(0, 10000, (1000, 100))  # Random data for illustration
y_train = np.random.randint(0, 2, (1000, ))  # Random binary labels
X_test = np.random.randint(0, 10000, (200, 100))  # Random data for illustration
y_test = np.random.randint(0, 2, (200, ))  # Random binary labels

# Define your LSTM model architecture
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.2),  # Increased complexity and added dropout
    LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Using RMSprop optimizer and setting a learning rate
optimizer = RMSprop(lr=0.001)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Train the model with early stopping
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

# Save the model for later use
model.save('path_to_my_model.h5')


In [None]:
from tensorflow.keras.models import load_model

model = load_model('path_to_my_model.h5')


In [None]:
# Fill None values with an empty string in the 'message_clean' column
df_emails_pd['message_clean'] = df_emails_pd['message_clean'].fillna('')

# Now proceed with fitting the tokenizer
tokenizer.fit_on_texts(df_emails_pd['message_clean'])

# Continue with the rest of your code...

# Assuming 'message' is the column with email content that you want to preprocess and predict
df_emails = df_emails.withColumnRenamed('message', 'message_clean')  # Only if renaming is necessary

# Convert Spark DataFrame to Pandas DataFrame for processing with Keras
df_emails_pd = df_emails.toPandas()

# Initialize and fit the tokenizer on the 'message_clean' column
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_emails_pd['message_clean'])

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(df_emails_pd['message_clean'])

# Pad sequences
data = pad_sequences(sequences, maxlen=maxlen)

# Predict using the model
predictions = model.predict(data)

# Add predictions back to the Pandas DataFrame (as a new column)
df_emails_pd['fraud_probability'] = predictions

# If you want to work with Spark DataFrame afterward, convert it back
df_emails_updated = spark.createDataFrame(df_emails_pd)

# Show some predictions
df_emails_updated.select('message_clean', 'fraud_probability').show()


In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

# Load your saved model
model = load_model('path_to_my_model.h5')

# Assuming df_emails is a Pandas DataFrame and has been preprocessed
# If df_emails is a Spark DataFrame, you'll need to convert it to Pandas DataFrame
# df_emails = spark_df.toPandas()  # Only if starting with a Spark DataFrame

# Tokenization and sequence padding parameters
# These should match the parameters used during training
max_words = 10000  # This should match the tokenizer setting during training
maxlen = 100  # This should match the sequence padding length during training

# Initialize and fit the tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_emails['message_clean'])

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(df_emails['message_clean'])

# Pad sequences
data = pad_sequences(sequences, maxlen=maxlen)

# Predict using the model
predictions = model.predict(data)

# Add predictions back to the DataFrame (as a new column)
df_emails['fraud_probability'] = predictions

# Visualize some of the predictions
print(df_emails[['message_clean', 'fraud_probability']].head())

# If you need to convert back to Spark DataFrame
# df_emails_spark = spark.createDataFrame(df_emails)


In [None]:
#Splitting the Data Frame Linux Terminal

In [None]:
# Split the CSV into 4 parts with a prefix 'data_chunk_'
# The -n l/4 option splits the file into 4 equal parts, by lines
#This will create files named data_chunk_00, data_chunk_01,

###   split -n l/10 -d your_large_file.csv data_chunk_


In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("EmailClassification") \
    .getOrCreate()

# Replace the path below with the path to your Hadoop-stored file
df_emails = spark.read.csv("hdfs:///user1/data_batch_00", inferSchema=True, header=True)


In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF

# Assuming the text column is named 'message'
# Tokenize the email content
tokenizer = RegexTokenizer(inputCol="message", outputCol="words", pattern="\\W")
tokenized_df = tokenizer.transform(df_emails)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_df = remover.transform(tokenized_df)

# Continue with further processing as needed


In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

# Assuming 'df_emails' is your initial DataFrame and 'message' is the column with email texts
# Ensure 'df_emails' has been defined correctly and contains the 'message' column

# Tokenize the email content
tokenizer = RegexTokenizer(inputCol="message", outputCol="words", pattern="\\W")
tokenized_df = tokenizer.transform(df_emails)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_df = remover.transform(tokenized_df)

# Check if the 'filtered_words' column exists
filtered_df.printSchema()

# If everything is correct up to here, then converting to a Pandas DataFrame should work
pandas_df = filtered_df.select("filtered_words").toPandas()


In [None]:
from pyspark.sql.functions import col
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

# Filter out rows where the message column is null
df_non_null = df_emails.filter(col("message").isNotNull())

# Tokenize the email content
tokenizer = RegexTokenizer(inputCol="message", outputCol="words", pattern="\\W")
tokenized_df = tokenizer.transform(df_non_null)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_df = remover.transform(tokenized_df)

# Continue with your processing...


In [None]:
# Convert to Pandas DataFrame
pandas_df = filtered_df.select("filtered_words").toPandas()

# Now, use Keras' text processing tools as shown previously to tokenize and pad sequences


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Placeholder for your data
texts = ["Your preprocessed email texts here..."]  # This should be a list of email texts
labels = [0, 1, 0, 1]  # Binary labels for each text

tokenizer = Tokenizer(num_words=10000)  # num_words is the size of your vocabulary
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=100)  # maxlen is the length of the longest sequence

# Assuming labels are already prepared
import numpy as np
labels = np.asarray(labels)


In [None]:
print(f"Data shape: {data.shape}")
print(f"Labels shape: {labels.shape}")


In [None]:
import numpy as np

# Example placeholder, replace these with actual preprocessed data and labels
texts = ["sample text 1", "sample text 2", "sample text 3", "sample text 4"]
labels = [1, 0, 1, 0]

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=100)  # Ensure this matches your sequence length

labels = np.array(labels)

print(f"Data shape: {data.shape}")
print(f"Labels shape: {labels.shape}")

# Now proceed with train_test_split
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam

# Define the model
model = Sequential([
    Embedding(input_dim=10000, output_dim=16, input_length=100),  # input_dim is the size of the vocabulary, output_dim is the dimension of the dense embedding
    GlobalAveragePooling1D(),  # This will average the embeddings of all words in the sequence
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')  # Assuming binary classification (0 or 1 labels)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Print the model summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=30, validation_data=(X_val, y_val), batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Define the LSTM model
model = Sequential([
    Embedding(input_dim=10000, output_dim=100, input_length=100),
    LSTM(64),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, validation_data=(X_val, y_val), batch_size=32)


In [None]:
def visualize_fraud_predictions(email_texts, predictions):
    """
    Visualize the emails with their predicted fraud probabilities.

    Parameters:
    - email_texts: List of email text content.
    - predictions: List of predicted probabilities corresponding to the fraud likelihood of each email.

    The function doesn't return anything but prints each email with its fraud prediction.
    """
    # Ensure the lists have the same length
    if len(email_texts) != len(predictions):
        print("Error: The length of email_texts and predictions must match.")
        return
    
    for email, probability in zip(email_texts, predictions):
        print("Email Content:\n", email)
        # Assuming the probability is given in a scale from 0 to 1
        print("Fraud Likelihood: {:.2%}".format(probability))
        print("-" * 100)

# Assuming the predictions list matches the number of email_texts provided
predicted_probabilities = [0.34725702, 0.08264993]

# Example usage with matching lengths for email_texts and predicted_probabilities
visualize_fraud_predictions(email_texts, predicted_probabilities)


In [None]:
def filter_emails_by_similarity_and_likelihood(emails, similarity_threshold=0.5, likelihood_threshold=8.0):
    """
    Filters emails based on content similarity to a given phrase and a likelihood threshold.
    
    Parameters:
    - emails: List of dictionaries, where each dictionary contains 'content' and 'likelihood' keys.
    - similarity_threshold: A threshold for determining content similarity (not used in this simple example).
    - likelihood_threshold: The minimum likelihood score for an email to be considered suspicious.
    
    Returns:
    - A list of emails considered suspicious based on the likelihood threshold.
    """
    suspicious_phrase = "Another suspicious email detected!"
    filtered_emails = [email for email in emails if suspicious_phrase in email['content'] and email['likelihood'] >= likelihood_threshold]
    return filtered_emails

# Example usage:
emails = [
    {'content': "This is a normal email content.", 'likelihood': 2.0},
    {'content': "Another suspicious email detected! Please check it out.", 'likelihood': 8.26},
    {'content': "Another suspicious email detected! This seems like a scam.", 'likelihood': 9.5},
    {'content': "This is another normal conversation.", 'likelihood': 3.2}
]

# Filtering emails:
suspicious_emails = filter_emails_by_similarity_and_likelihood(emails, likelihood_threshold=8.0)

# Displaying the filtered, suspicious emails:
for email in suspicious_emails:
    print(f"Email Content: {email['content']}")
    print(f"Fraud Likelihood: {email['likelihood']}%")
    print("-"*80)
