# E-Discovery with Neural Network
## BigData Processing
### Hadoop / PySpark 

In [1]:
# Commands to be include in the Linux Terminal in order to start Hadoop and PySpark

#start-dfs.sh
#start-yarn.sh
#pyspark

### PySpark reading a file form a CSV

In [2]:
# Initiating required modules
from pyspark.sql import SparkSession  # Essential for Spark operations
from pyspark.sql.functions import udf  # User Defined Functions
from pyspark.sql.types import StringType  # Data type for string operations
import re  # Regular expressions library

# Handling warnings
import warnings  # Importing warnings library
warnings.filterwarnings("ignore")  # Suppress all warnings


In [3]:
# Setting up the Spark Session
spark = SparkSession.builder.appName("EmailDataPrep").getOrCreate()

# Reading the data
df = spark.read.csv("hdfs:///user1/data_batch_00", header=True, inferSchema=True)


24/03/25 00:25:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

In [4]:
### Exploratory Data Analysis (EDA) / Data Inspection

In [5]:
# Preview the initial rows of the DataFrame
df.show(n=10)

# Output the structure and data types of the DataFrame
df.printSchema()


+--------------------+--------------------+
|                file|             message|
+--------------------+--------------------+
|allen-p/_sent_mai...|Message-ID: <1878...|
|           Date: Mon| 14 May 2001 16:3...|
|From: phillip.all...|                null|
|To: tim.belden@en...|                null|
|           Subject: |                null|
|   Mime-Version: 1.0|                null|
|Content-Type: tex...|                null|
|Content-Transfer-...|                null|
|X-From: Phillip K...|                null|
|X-To: Tim Belden ...|                null|
+--------------------+--------------------+
only showing top 10 rows

root
 |-- file: string (nullable = true)
 |-- message: string (nullable = true)



In [6]:
# Obtain and display descriptive statistics for columns with numerical data
df.describe().show()




+-------+--------------------+--------+
|summary|                file| message|
+-------+--------------------+--------+
|  count|             2648165|  773905|
|   mean|                 NaN|Infinity|
| stddev|                 NaN|     NaN|
|    min|                  \t|      \t|
|    max|~~~~~~~~~~~~~~~~~...|       ||
+-------+--------------------+--------+



                                                                                

### Data Cleaning / Text Preprocessing

In [7]:
# Addressing Missing Information
# Replace any missing values in the dataset with an empty string.
df = df.fillna('')

# Defining a Function for Text Cleanup
def clean_text(text):
    # Return an empty string if the input text is missing.
    if text is None:
        return ''
    # Change all characters in the text to lowercase for uniformity.
    text = text.lower()
    # Eliminate lines that resemble email headers or unneeded metadata.
    text = re.sub(r'^[a-z]+:.*$', '', text)  # Finds and removes lines starting with certain patterns.
    text = re.sub(r'\s+', ' ', text)  # Collapses multiple spaces into a single space.
    # Remove characters that are not letters, numbers, or spaces.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Trim leading or trailing spaces from the text.
    return text.strip()

# Making the Text Cleanup Function Available
clean_text_udf = udf(clean_text, StringType())

# Cleaning Text in the 'message' Column of the Dataset
df = df.withColumn('message', clean_text_udf(df['message']))

# Displaying the Processed Data for Review
df.show(truncate=False)


+----------------------------------------------+-------------------------------------------------+
|file                                          |message                                          |
+----------------------------------------------+-------------------------------------------------+
|allen-p/_sent_mail/1.                         |messageid 187829811075855378110javamailevansthyme|
|Date: Mon                                     |14 may 2001 163900 0700 pdt                      |
|From: phillip.allen@enron.com                 |                                                 |
|To: tim.belden@enron.com                      |                                                 |
|Subject:                                      |                                                 |
|Mime-Version: 1.0                             |                                                 |
|Content-Type: text/plain; charset=us-ascii    |                                                 |
|Content-T

[Stage 6:>                                                          (0 + 1) / 1]                                                                                

### Feature Engineering and Vectorization

In [8]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline

def add_text_features(dataframe):
    """
    Enhances a DataFrame by analyzing text in the 'message' column and adding a 'features' column.
    This 'features' column represents text as numerical data useful for machine learning.
    
    Parameter:
    - dataframe: A DataFrame that includes a 'message' column with text.
    
    Output:
    - A DataFrame that now includes a 'features' column with text analyzed into TF-IDF vectors.
    """
    # First, break text into individual words.
    tokenizer = Tokenizer(inputCol="message", outputCol="words")
    
    # Second, filter out common, less meaningful words (like 'the', 'a').
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    
    # Third, count how frequently each word appears and represent this as numerical data.
    hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20)
    
    # Fourth, assess the importance of a word based on how frequently it appears across all messages.
    idf = IDF(inputCol="raw_features", outputCol="features")
    
    # Chain these steps into a process to be applied.
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])
    
    # Execute the process on the provided data.
    model = pipeline.fit(dataframe)
    
    # Apply the transformations to the DataFrame.
    result = model.transform(dataframe)
    
    # The modified DataFrame is returned, now with a 'features' column.
    return result

# Example of applying this function to a DataFrame 'df' that contains a 'message' column.
enhanced_df = add_text_features(df)

# Display the original messages and their corresponding numerical representations.
enhanced_df.select('message', 'features').show(truncate=False)


                                                                                

+-------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
|message                                          |features                                                                                                          |
+-------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
|messageid 187829811075855378110javamailevansthyme|(20,[12,14],[0.24635006473684198,3.4364679300759433])                                                             |
|14 may 2001 163900 0700 pdt                      |(20,[0,1,5,12],[6.9287616242614884,6.099049147035107,3.3109247162144797,0.24635006473684198])                     |
|                                                 |(20,[12],[0.24635006473684198])                                                                                   

### Designing the Neural Network

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

# Set the number of input features, matching the TF-IDF vector size, which is 20 in this case.
input_dim = 20  # Matches TF-IDF vector size.

# Create a neural network model.
model = Sequential()
# Add a dense layer with 64 neurons, using ReLU activation function, suitable for the input layer.
model.add(Dense(64, input_dim=input_dim, activation='relu'))  # Adds first hidden layer.
# Include a dropout layer to reduce overfitting by randomly ignoring some neurons during training.
model.add(Dropout(0.5))  # Helps in preventing overfitting.
# Add another dense layer, this time with 32 neurons, also with ReLU activation function.
model.add(Dense(32, activation='relu'))  # Adds second hidden layer.
# Finalize with an output layer having a single neuron, using sigmoid activation function for binary classification.
model.add(Dense(1, activation='sigmoid'))  # Adds output layer for binary classification.

# Setting up the model with binary crossentropy loss function, Adam optimizer for efficient learning, and tracking accuracy.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Display a summary to review the model's architecture, including layer types and number of parameters.
model.summary()


2024-03-25 00:26:04.971846: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-25 00:26:05.246299: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-25 00:26:05.246412: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-25 00:26:05.301027: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-25 00:26:05.414995: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-25 00:26:05.416870: I tensorflow/core/platform/cpu_feature_guard.cc:1

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                1344      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 3457 (13.50 KB)
Trainable params: 3457 (13.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Design and Training - ANN Model

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential, save_model
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

def build_neural_network(input_features):
    """
    Prepares a neural network model with a basic configuration.
    """
    model = Sequential([
        Dense(64, input_dim=input_features, activation='relu'),  # Layer with 64 neurons
        Dropout(0.5),  # Helps reduce overfitting
        Dense(32, activation='relu'),  # Another layer for deeper understanding
        Dense(1, activation='sigmoid')  # Final layer for binary classification
    ])
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_and_evaluate_model(features, labels, input_feature_count, epochs=10, batch_size=32, validation_split=0.2):
    """
    Trains the neural network and evaluates it on a validation set.
    """
    # Splitting the dataset for training and validation
    X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=validation_split, random_state=42)
    
    # Getting the model ready
    model = build_neural_network(input_feature_count)
    
    # Training starts
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(X_val, y_val), verbose=1)
    
    print("Training Completed.")
    
    # Saving the model for later use
    model.save("model_saved.h5")
    print("Model saved as model_saved.h5.")
    
    return model, history

# Sample data for testing
demo_features = np.random.rand(1000, 20)  # Random features for 1000 samples
demo_labels = np.random.randint(2, size=(1000, ))  # Random binary labels

# Training the model with the demo data
input_features = 20
model, history = train_and_evaluate_model(demo_features, demo_labels, input_features)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Completed.
Model saved as model_saved.h5.


In [24]:
from keras.models import load_model
import numpy as np

def get_fraud_chance(model_file, data):
    """
    This function takes a path to a trained machine learning model and an array of data,
    and it returns the model's estimates of the likelihood that each data point is fraudulent.
    
    Arguments:
    - model_file: The location of the trained model on the disk.
    - data: Array of data points that the model will evaluate. This array should be formatted just like the data used to train the model.
    
    Outcome:
    - An array with each data point's estimated fraud likelihood according to the model.
    """
    # Opening the model from the given file
    trained_model = load_model(model_file)
    
    # Making predictions with the model on the provided data
    fraud_chances = trained_model.predict(data)
    
    return fraud_chances

# Example: Predicting with a hypothetical model
# Setting up variables for demonstration. The real values depend on your specific model and data.
sample_count = 100  # Suppose you have 100 pieces of data
expected_shape = (20,)  # Suppose each piece of data should have 20 features

# Creating random data for the demonstration. In real use, you'd have actual data here.
data_samples = np.random.rand(sample_count, *expected_shape)

# Calling the function with the model's file path and the sample data
predicted_fraud_chances = get_fraud_chance('model_saved.h5', data_samples)

# Showing the model's fraud likelihood estimates for the sample data
print(predicted_fraud_chances)


[[0.5222649 ]
 [0.48793185]
 [0.59242105]
 [0.5677292 ]
 [0.44738236]
 [0.53219515]
 [0.608353  ]
 [0.5523491 ]
 [0.5050088 ]
 [0.55619997]
 [0.48314068]
 [0.47807273]
 [0.4988079 ]
 [0.39045498]
 [0.49380124]
 [0.53723353]
 [0.544633  ]
 [0.5857475 ]
 [0.51469785]
 [0.44569087]
 [0.41720706]
 [0.58551663]
 [0.55448085]
 [0.4868994 ]
 [0.5363467 ]
 [0.49372327]
 [0.5461453 ]
 [0.5274704 ]
 [0.5307675 ]
 [0.48310813]
 [0.49978566]
 [0.48093283]
 [0.46870962]
 [0.44529694]
 [0.45683783]
 [0.5003139 ]
 [0.44207197]
 [0.46674752]
 [0.4839505 ]
 [0.5050704 ]
 [0.53307766]
 [0.5167742 ]
 [0.54793715]
 [0.4761839 ]
 [0.4187648 ]
 [0.45370808]
 [0.438829  ]
 [0.46619377]
 [0.47846073]
 [0.53757185]
 [0.42602292]
 [0.44725013]
 [0.50455093]
 [0.5135777 ]
 [0.47086152]
 [0.61031413]
 [0.5507193 ]
 [0.4578838 ]
 [0.56053734]
 [0.5540934 ]
 [0.56648207]
 [0.44663766]
 [0.5438114 ]
 [0.47412613]
 [0.5779174 ]
 [0.5145041 ]
 [0.45846006]
 [0.4920599 ]
 [0.4590358 ]
 [0.5489826 ]
 [0.51619035]
 [0.60

In [None]:
# Design and Training - CNN Model

In [13]:
from keras.models import Sequential, save_model
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import numpy as np

def configure_cnn_model(input_length, num_features):
    """Configures a basic one-dimensional Convolutional Neural Network (CNN)."""
    model = Sequential([
        Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(input_length, num_features)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def prepare_and_train_model(input_length=12, num_features=20, epochs=10, batch_size=32, test_split=0.2, sample_size=100):
    """
    Generates synthetic data and uses it to train the CNN model.
    This method streamlines the process of dataset creation and model training.
    """
    # Generating synthetic data
    features = np.random.rand(sample_size, input_length, num_features)
    labels = np.random.randint(2, size=(sample_size,))

    # Splitting the dataset into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=test_split, random_state=42)
    
    # Setting up the CNN model
    model = configure_cnn_model(input_length, num_features)
    
    # Training the model with the synthetic data
    training_summary = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), verbose=1)
    
    # Displaying training and validation results
    print(f"Final Training Loss: {training_summary.history['loss'][-1]:.4f}")
    print(f"Final Training Accuracy: {training_summary.history['accuracy'][-1]:.4f}")
    print(f"Final Validation Loss: {training_summary.history['val_loss'][-1]:.4f}")
    print(f"Final Validation Accuracy: {training_summary.history['val_accuracy'][-1]:.4f}")
    
    # Saving the trained model to a file
    model.save('cnn_model.h5')
    print("Model training complete and saved as cnn_model.h5.")

# Running the training process with predefined settings
prepare_and_train_model()


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Final Training Loss: 0.6584
Final Training Accuracy: 0.5625
Final Validation Loss: 0.7008
Final Validation Accuracy: 0.6000
Model training complete and saved as cnn_model.h5.


In [28]:
from keras.models import load_model
import numpy as np
import pandas as pd

def evaluate_fraud_risk(model_file, input_data):
    """
    This function opens a machine learning model that's been trained to identify fraud.
    It examines each data point provided to estimate its likelihood of being fraudulent.
    The results are organized into a table, making it easier to understand the model's assessments.
    
    Arguments:
    - model_file: The file location of the machine learning model.
    - input_data: Data for the model to evaluate, formatted as an array. This array must be prepared in the same way as the data used for training the model.
    
    Outcome:
    - A table (DataFrame) showing each data point with its calculated fraud risk.
    """
    # Opening the specified machine learning model
    trained_model = load_model(model_file)
    
    # Using the model to estimate fraud risk for the provided data
    estimated_risk = trained_model.predict(input_data).flatten()
    
    # Organizing the risk estimates into a table
    risk_table = pd.DataFrame({
        'Data Point': np.arange(1, len(estimated_risk) + 1), 
        'Estimated Fraud Risk': estimated_risk
    })
    
    # Sharing the organized table
    return risk_table

# For the purpose of demonstration, synthetic data resembling the model's training format is created
sample_size = 100  # Suppose there are 100 data points to evaluate
sequence_length = 12  # Each data point is a sequence of 12 steps
feature_count = 20  # Each step in the sequence has 20 features
simulated_data = np.random.rand(sample_size, sequence_length, feature_count)

# Predicting fraud risk with the provided model file and simulated data
fraud_risk_table = evaluate_fraud_risk('cnn_model.h5', simulated_data)
# Showing the first 40 data points and their estimated fraud risk for clarity
print(fraud_risk_table.tail(40))


    Data Point  Estimated Fraud Risk
60          61              0.465374
61          62              0.493880
62          63              0.443990
63          64              0.429173
64          65              0.466889
65          66              0.460390
66          67              0.463025
67          68              0.402221
68          69              0.448434
69          70              0.480064
70          71              0.487817
71          72              0.450295
72          73              0.474668
73          74              0.549166
74          75              0.446582
75          76              0.462259
76          77              0.442445
77          78              0.431418
78          79              0.423260
79          80              0.434580
80          81              0.440837
81          82              0.501706
82          83              0.459737
83          84              0.465574
84          85              0.446140
85          86              0.446018
8

In [None]:
ddddd

In [None]:
d