# In [None]:
#imports
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer, BertModel
import torch
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression



# Fetch the NLTK data used by preprocess_text(): the English stop-word list,
# WordNet for the lemmatizer, and the Punkt tokenizer models that
# nltk.word_tokenize() loads at call time.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# NOTE(review): recent NLTK releases look the tokenizer data up as
# 'punkt_tab'; on older releases this request is expected to just report an
# unknown package without raising — confirm against the installed version.
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Functions for preprocessing text
def preprocess_text(text):
    """Normalise a raw text value into a space-joined string of lemmas.

    Non-string values yield an empty string. Strings are lower-cased,
    stripped of every character that is neither a word character nor
    whitespace, tokenised with ``nltk.word_tokenize``, filtered against the
    module-level ``stop_words`` set and lemmatised with ``lemmatizer``.
    """
    if not isinstance(text, str):
        return ''

    stripped = re.sub(r'[^\w\s]', '', text.lower())
    kept = []
    for token in nltk.word_tokenize(stripped):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)


# Loading and combining datasets
# NOTE(review): `df` is not defined in the visible part of this file; it is
# presumably loaded earlier — confirm before running this section on its own.
df_simple = df[['First_prompt', 'First_response', 'First_label']].copy()
df_labeled = df_simple[df_simple['First_label'].notna()]

# glob.glob() returns paths in arbitrary, filesystem-dependent order. Sorting
# keeps the concatenated row order stable, so the random_state-seeded
# train/test splits further down select the same rows on every machine.
csv_files = sorted(glob.glob('imported response data/*.csv'))
df_combined = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# The imported CSV headers carry trailing spaces; map them onto the column
# names used by the rest of the pipeline.
df_combined = df_combined.rename(columns={
    'Prompt ': 'First_prompt',
    'Response ': 'First_response',
    'Response Category (FP, FN, TP, TN)': 'First_label'
})
df_labeled = pd.concat([df_labeled, df_combined], ignore_index=True)
# A bare expression is only displayed as the last line of a notebook cell;
# print it so the combined shape is reported when run as a script too.
print(df_labeled.shape)
# Cleaning text: add a preprocessed counterpart for the prompt and response columns
for raw_column, cleaned_column in (
    ('First_prompt', 'cleaned_prompt'),
    ('First_response', 'cleaned_response'),
):
    df_labeled[cleaned_column] = df_labeled[raw_column].apply(preprocess_text)

# Embeddings: the tokenizer and the encoder are loaded from the same checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

def get_bert_embeddings(text):
    """Embed ``text`` by averaging the encoder's final hidden states.

    The text is tokenised into PyTorch tensors (padding and truncation
    enabled), passed through the module-level ``model`` with gradient
    tracking disabled, and the last hidden state is averaged over
    dimension 1.

    Returns:
        The averaged hidden state converted to a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.numpy()

# Embed every cleaned prompt, then every cleaned response, in DataFrame order
prompt_rows = [get_bert_embeddings(text) for text in df_labeled['cleaned_prompt']]
prompt_embeddings = np.vstack(prompt_rows)

response_rows = [get_bert_embeddings(text) for text in df_labeled['cleaned_response']]
response_embeddings = np.vstack(response_rows)

# The feature vector of a record is the element-wise sum of its two embeddings
combined_embeddings = np.add(prompt_embeddings, response_embeddings)
# Handling labels
# Records without a label default to FN before the labels are one-hot encoded
label_is_missing = df_labeled['First_label'].isna()
df_labeled['First_label'] = df_labeled['First_label'].mask(label_is_missing, 'FN')
df_encoded = pd.get_dummies(
    df_labeled,
    prefix='First_label',
    columns=['First_label'],
)

# SVM Model Training
df_encoded['combined_text'] = df_labeled['cleaned_prompt'] + " " + df_labeled['cleaned_response']
y = df_labeled['First_label']

# Map the string labels to integer codes; label_encoder.classes_ keeps the
# original names so reports can show them.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(combined_embeddings, y_encoded, test_size=0.2, random_state=42)

# Fit the scaler on the training split only so no test statistics leak in.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report for SVM:")
# Report the original label names, as the Random Forest and Logistic
# Regression sections do. Passing `labels` explicitly keeps the names aligned
# with their codes even if a class is absent from the test split.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
))
#SVM tuning
# Only the linear kernel is searched, so `gamma` is left out of the grid: it
# is a coefficient of the rbf/poly/sigmoid kernels and has no effect here,
# and listing it would only double the number of fits for identical models.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear'],
}
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate on Test Data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# Show the original label names, consistent with the other model reports;
# `labels` keeps names and codes aligned if a class is missing from the split.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
))

# LSTM Model Training
X = combined_embeddings
# One-hot targets produced by get_dummies; cast to float32 because get_dummies
# may emit boolean columns, while the softmax/cross-entropy pair works on floats.
y = df_encoded[['First_label_FN', 'First_label_FP', 'First_label_TN', 'First_label_TP']].astype('float32')

# Split BEFORE scaling. Fitting the scaler on the full dataset would leak
# test-set statistics into training; the other model sections fit it on the
# training split only, and this one now does the same.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape to (samples, 1, features): each embedding is fed to the LSTM as a
# sequence of a single timestep.
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

lstm_model = Sequential([
    LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False),
    Dropout(0.2),
    Dense(32, activation='relu'),
    # One output unit per one-hot label column
    Dense(y_train.shape[1], activation='softmax')
])
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

loss, accuracy = lstm_model.evaluate(X_test, y_test, verbose=1)
print(f"LSTM Test Loss: {loss}, Test Accuracy: {accuracy}")
#Random Forest
X_train, X_test, y_train, y_test = train_test_split(
    combined_embeddings,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Standardise the features with statistics taken from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
rf_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(rf_report)
#random forest tuning
# 3 * 4 * 3 * 3 = 108 candidate settings, each cross-validated 5 times.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the grid search. n_jobs=-1 spreads the fits over all cores, as
# the SVM grid search does; the selected model is unchanged because every
# forest is seeded with random_state=42.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Get the best parameters and evaluate
print("Best Parameters:", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_

# Predict and evaluate using the best model
y_pred_best = best_rf_model.predict(X_test)
print(f"Accuracy (Best Model): {accuracy_score(y_test, y_pred_best)}")
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best, target_names=label_encoder.classes_))
#Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(
    combined_embeddings,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Standardise the features with statistics taken from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

logreg_model = LogisticRegression(random_state=42, max_iter=1000)
logreg_model.fit(X_train, y_train)

# Score the held-out split and print a per-class breakdown
y_pred = logreg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
logreg_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(logreg_report)