In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pickle


# Train a TF-IDF + Random Forest classifier on the labelled email dataset,
# report hold-out accuracy, and persist the fitted artifacts to disk.

# Load the dataset. pandas infers the zip compression from the file extension.
# NOTE(review): this is a machine-specific absolute path; change DATA_PATH when
# running the notebook anywhere else.
DATA_PATH = "/Users/sahejsinghsodhi/Downloads/GitHub/go-phish-/training and classifying/Email Detection/Phishing_Email.csv.zip"
data = pd.read_csv(DATA_PATH)

# Data preprocessing
data = data.drop(columns=["Unnamed: 0"])  # presumably a leftover index column from the CSV export
data = data.dropna()  # Drop any rows with missing values

# Encode the 'Email Type' target column as integer class ids. Encoding the
# labels on the full frame is harmless: it only maps label strings to ints.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Email Type'])

# Split the RAW text before any feature extraction. Fitting TF-IDF on the whole
# corpus first would let test-set emails influence the vocabulary and IDF
# weights (data leakage) and make the reported accuracy optimistic.
# Same test_size/random_state as before, so the row partition is unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    data['Email Text'], y, test_size=0.2, random_state=42
)

# TF-IDF: learn vocabulary/IDF from the training emails only, then apply the
# fitted transform to the held-out emails.
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)  # Limit to 1000 features for efficiency
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full feature matrix, kept so any other cell that refers to `X` still works.
# It is produced with the train-fitted vectorizer and is not used for training.
X = tfidf.transform(data['Email Text'])

# Initialize and train the Random Forest classifier on the training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the held-out split and report accuracy
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

# Persist the model together with the fitted vectorizer and label encoder.
# The model alone cannot score new emails: raw text must first go through the
# same TF-IDF transform, and predictions must be decoded by the same encoder.
# 'model.pkl' still contains only the classifier, as before.
artifacts = {
    'model.pkl': rf_model,
    'tfidf_vectorizer.pkl': tfidf,
    'label_encoder.pkl': label_encoder,
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)

In [None]:
# Classify one previously unseen email with the objects fitted in the
# training cell (tfidf, rf_model, label_encoder).
new_email = """Act fast: Limited stock available for our new product!

Don’t miss out on this unique chance to win big! Click now to participate.

For more details, visit our website or contact us directly."""

# Vectorise with the already-fitted TF-IDF (transform only, never re-fit on
# new data), then let the trained Random Forest predict the encoded class.
X_new = tfidf.transform([new_email])
prediction = rf_model.predict(X_new)

# Decode the integer class id back to its original label. The variable and the
# printed text say "department", but the encoder was fitted on the
# 'Email Type' column, so this is the predicted email type.
predicted_department = label_encoder.inverse_transform(prediction)
print(f"Predicted department: {predicted_department[0]}")
