In [None]:
# ✅ Imports
# Cross domain testing result source domain Hotel and target domain Restaurant
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, f1_score,
    precision_score, recall_score, roc_auc_score, roc_curve
)
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns

# ✅ Paths
# Locations of the SBERT model directory and the per-domain review CSVs.
# NOTE(review): the paths look like a Colab Google-Drive mount, and both CSV
# file names contain a space after "LIWC_" — confirm they match the files on disk.
sbert_model_path = '/content/drive/MyDrive/sbert_hotel_doctor_restaurant_model'
hotel_path = '/content/drive/MyDrive/LIWC_ hotel_readability_cleaned.csv'
restaurant_path = '/content/drive/MyDrive/LIWC_ restaurant_readability_cleaned.csv'

# Name of the CSV column whose contents are passed to SBERT below.
text_col = 'review'

# ✅ Load SBERT
# Sentence encoder loaded from the saved model directory above.
sbert = SentenceTransformer(sbert_model_path)

# ✅ Load datasets
# hotel_df is the source (training) domain, restaurant_df the target (test) domain.
hotel_df = pd.read_csv(hotel_path)
restaurant_df = pd.read_csv(restaurant_path)

# ✅ Normalize labels
def normalize_label(x):
    """Map a raw label value onto the binary target used by the classifier.

    Args:
        x: Raw label cell. It is stringified, stripped and lower-cased before
            matching, so case and surrounding whitespace do not matter.

    Returns:
        0 for 'truthful' / 'real' / 0, 1 for 'deceptive' / 'fake' / 1, and
        None for anything else (the caller drops those rows with ``dropna``).
    """
    x = str(x).strip().lower()
    # '0.0' / '1.0' are accepted as well: pandas loads an integer label column
    # as float64 as soon as it contains a missing value, and str(0.0) == '0.0'.
    # Without this every row of such a column would map to None and be dropped.
    if x in ('truthful', 'real', '0', '0.0'): return 0
    if x in ('deceptive', 'fake', '1', '1.0'): return 1
    return None

# Replace the raw label column with normalize_label's output (0, 1 or None).
hotel_df['label'] = hotel_df['label'].apply(normalize_label)
restaurant_df['label'] = restaurant_df['label'].apply(normalize_label)

# ✅ Drop invalid rows
# Rows with a missing review text or an unrecognised label are removed in place.
hotel_df.dropna(subset=[text_col, 'label'], inplace=True)
restaurant_df.dropna(subset=[text_col, 'label'], inplace=True)

# ✅ Encode labels
# The encoder is fitted on the source domain only; transform() on the target
# domain reuses that mapping.
le = LabelEncoder()
y_train = le.fit_transform(hotel_df['label'])         # ✅ Train on hotel
y_test = le.transform(restaurant_df['label'])         # ✅ Test on restaurant

# ✅ SBERT embeddings
# One embedding row per review; the column count is read later as TEXT_DIM.
X_text_train = sbert.encode(hotel_df[text_col].tolist(), show_progress_bar=True)
X_text_test = sbert.encode(restaurant_df[text_col].tolist(), show_progress_bar=True)

# ✅ Compute class weights
# 'balanced' weights are computed from the training labels only.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# Keyed by position in `class_weights`; this lines up with the class ids only
# if y_train holds consecutive ids starting at 0 (as LabelEncoder produces).
class_weight_dict = {i: w for i, w in enumerate(class_weights)}

# ✅ Model definition (text only)
# Input width follows the SBERT embedding size; the head predicts two classes.
TEXT_DIM = X_text_train.shape[1]
NUM_CLASSES = 2

text_input = Input(shape=(TEXT_DIM,), name='text_embedding_input')

# Plain feed-forward stack: 256 -> dropout -> 128 -> dropout -> 64 -> softmax.
# The 'fusion_*' layer names are kept although this model has a single input.
x = Dense(256, activation='relu', name='text_dense_1')(text_input)
x = Dropout(0.3, name='text_dropout_1')(x)
x = Dense(128, activation='relu', name='text_dense_2')(x)
x = Dropout(0.2, name='fusion_dropout')(x)
x = Dense(64, activation='relu', name='fusion_dense')(x)
output = Dense(NUM_CLASSES, activation='softmax', name='classifier')(x)

model = Model(inputs=text_input, outputs=output)
# Sparse categorical cross-entropy because y_train holds integer class ids.
model.compile(optimizer=Adam(1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

# ✅ Train model
# Early stopping watches validation loss and restores the best weights.
# NOTE(review): no TensorFlow seed is set in the visible code, so runs are not
# reproducible. Also confirm how Keras picks the validation_split rows — if it
# takes the tail of the arrays and the CSV is ordered by label, the validation
# set may be unbalanced.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(
    X_text_train, y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=64,
    class_weight=class_weight_dict,
    callbacks=[early_stop],
    verbose=1
)

# ✅ Predict and tune threshold
# Column 1 of the softmax output is the score for class 1.
y_pred_probs = model.predict(X_text_test)
fake_prob = y_pred_probs[:, 1]

# Sweep the cut-off from 0.30 to 0.70 in 0.05 steps and keep the first value
# that reaches the highest weighted F1 (0.6 is used if no value scores above 0).
# NOTE(review): the cut-off is chosen with y_test itself, so the scores
# reported below are optimistic.
best_thresh, best_f1 = 0.6, 0
for t in np.arange(0.3, 0.71, 0.05):
    y_pred_thresh = (fake_prob > t).astype(int)
    f1 = f1_score(y_test, y_pred_thresh, average='weighted')
    if f1 <= best_f1:
        continue
    best_thresh, best_f1 = t, f1

# ✅ Final prediction and evaluation
y_pred_final = (fake_prob > best_thresh).astype(int)

# Precision and recall use scikit-learn's default averaging; F1 is weighted.
acc, precision, recall = (
    metric_fn(y_test, y_pred_final)
    for metric_fn in (accuracy_score, precision_score, recall_score)
)
f1 = f1_score(y_test, y_pred_final, average='weighted')
auc_score = roc_auc_score(y_test, fake_prob)
report = classification_report(y_test, y_pred_final, digits=4)
matrix = confusion_matrix(y_test, y_pred_final)

# ✅ Print metrics
for caption, value in (
    ("✅ Cross-domain Accuracy (Hotel → Restaurant):", acc),
    ("✅ Best Threshold:", best_thresh),
    ("✅ Precision:", precision),
    ("✅ Recall:", recall),
    ("✅ F1 Score (weighted):", f1),
    ("✅ AUC Score:", auc_score),
    ("\n✅ Classification Report:\n", report),
    ("✅ Confusion Matrix:\n", matrix),
):
    print(caption, value)

# ✅ Plot confusion matrix heatmap
# Rows are actual classes, columns are predicted classes.
class_names = ['Real', 'Fake']
plt.figure(figsize=(6, 4))
sns.heatmap(
    matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix Heatmap')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# ✅ Plot ROC curve
# The dashed diagonal is the chance-level reference line.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs[:, 1])
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f"AUC = {auc_score:.4f}")
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid(True)
plt.show()

In [None]:

#Cross domain testing result source domain Hotel and target domain Doctor
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Reshape, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the SBERT model directory and the two review CSVs.
# NOTE(review): the hotel file name contains a space after "LIWC_" while the
# doctor file name does not — confirm both match the files on disk.
sbert_model_path = '/content/drive/MyDrive/sbert_hotel_doctor_restaurant_model'
doctor_path = '/content/drive/MyDrive/LIWC_doctor_readability_cleaned.csv'
hotel_path = '/content/drive/MyDrive/LIWC_ hotel_readability_cleaned.csv'
# Column passed to SBERT, and the numeric columns fed to the second model input.
text_col = 'review'
numeric_cols = ['FleschReadingEase', 'SMOGIndex', 'DaleChall', 'sentimentv', 'WC', 'Analytic', 'Authentic', 'BigWords']

# Sentence encoder loaded from the saved model directory above.
sbert = SentenceTransformer(sbert_model_path)

# hotel_df is the source (training) domain, doctor_df the target (test) domain.
doctor_df = pd.read_csv(doctor_path)
hotel_df = pd.read_csv(hotel_path)


def normalize_label(x):
    """Map a raw label value onto the binary target used by the classifier.

    Args:
        x: Raw label cell. It is stringified, stripped and lower-cased before
            matching, so case and surrounding whitespace do not matter.

    Returns:
        0 for 'truthful' / 'real' / 0, 1 for 'deceptive' / 'fake' / 1, and
        None for anything else (the caller drops those rows with ``dropna``).
    """
    x = str(x).strip().lower()
    # '0.0' / '1.0' are accepted as well: pandas loads an integer label column
    # as float64 as soon as it contains a missing value, and str(0.0) == '0.0'.
    # Without this every row of such a column would map to None and be dropped.
    if x in ('truthful', 'real', '0', '0.0'): return 0
    if x in ('deceptive', 'fake', '1', '1.0'): return 1
    return None
# Show the raw label vocabulary of each file before it is normalised.
print("Hotel labels:", hotel_df['label'].unique())
print("doctor labels:", doctor_df['label'].unique())

# Replace the raw label column with normalize_label's output (0, 1 or None).
doctor_df['label'] = doctor_df['label'].apply(normalize_label)
hotel_df['label'] = hotel_df['label'].apply(normalize_label)


# Remove, in place, rows missing the text, the label or any numeric feature.
doctor_df.dropna(subset=[text_col, 'label'] + numeric_cols, inplace=True)
hotel_df.dropna(subset=[text_col, 'label'] + numeric_cols, inplace=True)

# The encoder is fitted on the source (hotel) labels and reused for the target.
le = LabelEncoder()
y_train = le.fit_transform(hotel_df['label'])
y_test = le.transform(doctor_df['label'])

# One SBERT embedding row per review.
X_text_train = sbert.encode(hotel_df[text_col].tolist(), show_progress_bar=True)
X_text_test = sbert.encode(doctor_df[text_col].tolist(), show_progress_bar=True)


# Scaling statistics come from the hotel rows only and are applied to doctor rows.
scaler = StandardScaler()
X_num_train = scaler.fit_transform(hotel_df[numeric_cols])
X_num_test = scaler.transform(doctor_df[numeric_cols])


# 'balanced' class weights computed from the training labels only.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# Keyed by position in `class_weights`; this lines up with the class ids only
# if y_train holds consecutive ids starting at 0 (as LabelEncoder produces).
class_weight_dict = {i: w for i, w in enumerate(class_weights)}


# Input widths are read from the prepared arrays; the head predicts two classes.
TEXT_DIM = X_text_train.shape[1]
NUMERIC_DIM = X_num_train.shape[1]
NUM_CLASSES = 2

text_input = Input(shape=(TEXT_DIM,), name='text_embedding_input')
numeric_input = Input(shape=(NUMERIC_DIM,), name='numeric_features_input')

# Text branch: the flat embedding is reshaped to (TEXT_DIM, 1), i.e. each
# embedding component becomes one step of a single-channel sequence, then
# passed through Conv1D -> BiLSTM -> global max pooling.
x = Reshape((TEXT_DIM, 1))(text_input)
x = Conv1D(100, kernel_size=1, activation='sigmoid', padding='same')(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPooling1D()(x)

# Numeric branch: one small dense layer, concatenated with the pooled text
# features before dropout and the softmax classifier.
x_num = Dense(4, activation='sigmoid')(numeric_input)
x_combined = Concatenate()([x, x_num])
x_combined = Dropout(0.2)(x_combined)
output = Dense(NUM_CLASSES, activation='softmax')(x_combined)

model = Model(inputs=[text_input, numeric_input], outputs=output)
# Sparse categorical cross-entropy because y_train holds integer class ids.
model.compile(optimizer=Adam(1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


# Early stopping watches validation loss and restores the best weights.
# NOTE(review): no TensorFlow seed is set in the visible code, so runs are not
# reproducible; also confirm how Keras selects the validation_split rows.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit([X_text_train, X_num_train], y_train,
          validation_split=0.1,
          epochs=5,
          batch_size=64,
          class_weight=class_weight_dict,
          callbacks=[early_stop],
          verbose=1)

# Score the doctor reviews; column 1 of the softmax output is the class-1 score.
y_pred_probs = model.predict([X_text_test, X_num_test])
fake_prob = y_pred_probs[:, 1]

# Sweep the cut-off from 0.30 to 0.70 in 0.05 steps and keep the first value
# that reaches the highest weighted F1 (0.6 is used if no value scores above 0).
# NOTE(review): the cut-off is chosen with y_test itself, so the reported
# scores are optimistic.
best_thresh, best_f1 = 0.6, 0
for t in np.arange(0.3, 0.71, 0.05):
    y_pred_thresh = (fake_prob > t).astype(int)
    f1 = f1_score(y_test, y_pred_thresh, average='weighted')
    if f1 <= best_f1:
        continue
    best_thresh, best_f1 = t, f1

y_pred_final = (fake_prob > best_thresh).astype(int)

# Only accuracy is printed; the other metrics are computed and kept as variables.
acc, precision, recall = (
    metric_fn(y_test, y_pred_final)
    for metric_fn in (accuracy_score, precision_score, recall_score)
)
f1 = f1_score(y_test, y_pred_final, average='weighted')
auc_score = roc_auc_score(y_test, fake_prob)
report = classification_report(y_test, y_pred_final, digits=4)
matrix = confusion_matrix(y_test, y_pred_final)

print("✅ Cross-domain Accuracy (Hotel → Doctor):", acc)


In [None]:
#training and testing on Hotel
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Reshape, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the SBERT model directory and the hotel review CSV.
# NOTE(review): the CSV file name contains a space after "LIWC_" — confirm it
# matches the file on disk.
sbert_model_path = '/content/drive/MyDrive/sbert_hotel_doctor_restaurant_model'
hotel_path = '/content/drive/MyDrive/LIWC_ hotel_readability_cleaned.csv'
# Column passed to SBERT, and the numeric columns fed to the second model input.
text_col = 'review'
numeric_cols = ['FleschReadingEase', 'SMOGIndex', 'DaleChall', 'sentimentv', 'WC', 'Analytic', 'Authentic', 'BigWords']

# Sentence encoder loaded from the saved model directory above.
sbert = SentenceTransformer(sbert_model_path)

# Single-domain experiment: train and test rows both come from this frame.
hotel_df = pd.read_csv(hotel_path)

def normalize_label(x):
    """Map a raw label value onto the binary target used by the classifier.

    Args:
        x: Raw label cell. It is stringified, stripped and lower-cased before
            matching, so case and surrounding whitespace do not matter.

    Returns:
        0 for 'truthful' / 'real' / 0, 1 for 'deceptive' / 'fake' / 1, and
        None for anything else (the caller drops those rows with ``dropna``).
    """
    x = str(x).strip().lower()
    # '0.0' / '1.0' are accepted as well: pandas loads an integer label column
    # as float64 as soon as it contains a missing value, and str(0.0) == '0.0'.
    # Without this every row of such a column would map to None and be dropped.
    if x in ('truthful', 'real', '0', '0.0'): return 0
    if x in ('deceptive', 'fake', '1', '1.0'): return 1
    return None

# Replace the raw label column with normalize_label's output (0, 1 or None),
# then remove, in place, rows missing the text, the label or any numeric feature.
hotel_df['label'] = hotel_df['label'].apply(normalize_label)
hotel_df.dropna(subset=[text_col, 'label'] + numeric_cols, inplace=True)

# Stratified 80/20 split with a fixed seed so the partition is repeatable.
train_df, test_df = train_test_split(hotel_df, test_size=0.2, random_state=42, stratify=hotel_df['label'])

# The encoder is fitted on the training labels and reused for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(train_df['label'])
y_test = le.transform(test_df['label'])

# One SBERT embedding row per review.
X_text_train = sbert.encode(train_df[text_col].tolist(), show_progress_bar=True)
X_text_test = sbert.encode(test_df[text_col].tolist(), show_progress_bar=True)

# Scaling statistics come from the training rows only.
scaler = StandardScaler()
X_num_train = scaler.fit_transform(train_df[numeric_cols])
X_num_test = scaler.transform(test_df[numeric_cols])
# 'balanced' class weights computed from the training labels only.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# Keyed by position in `class_weights`; this lines up with the class ids only
# if y_train holds consecutive ids starting at 0 (as LabelEncoder produces).
class_weight_dict = {i: w for i, w in enumerate(class_weights)}
# Input widths are read from the prepared arrays; the head predicts two classes.
TEXT_DIM = X_text_train.shape[1]
NUMERIC_DIM = X_num_train.shape[1]
NUM_CLASSES = 2

text_input = Input(shape=(TEXT_DIM,), name='text_embedding_input')
numeric_input = Input(shape=(NUMERIC_DIM,), name='numeric_features_input')

# Text branch: the flat embedding is reshaped to (TEXT_DIM, 1), i.e. each
# embedding component becomes one step of a single-channel sequence, then
# passed through Conv1D -> BiLSTM -> global max pooling.
x = Reshape((TEXT_DIM, 1))(text_input)
x = Conv1D(100, kernel_size=3, activation='sigmoid', padding='same')(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPooling1D()(x)

# Numeric branch: one dense layer, concatenated with the pooled text features
# before dropout and the softmax classifier.
x_num = Dense(16, activation='sigmoid')(numeric_input)
x_combined = Concatenate()([x, x_num])
x_combined = Dropout(0.3)(x_combined)
output = Dense(NUM_CLASSES, activation='softmax')(x_combined)

model = Model(inputs=[text_input, numeric_input], outputs=output)
# Sparse categorical cross-entropy because y_train holds integer class ids.
model.compile(optimizer=Adam(1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Early stopping watches validation loss and restores the best weights.
# NOTE(review): no TensorFlow seed is set in the visible code, so runs are not
# reproducible.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit([X_text_train, X_num_train], y_train,
          validation_split=0.2,
          epochs=5,
          batch_size=32,
          class_weight=class_weight_dict,
          callbacks=[early_stop],
          verbose=1)
# (A stray bare `e` used to sit here; it raised NameError right after training
# and prevented the evaluation below from ever running.)

# Score the held-out hotel reviews; column 1 of the softmax output is the
# class-1 score.
y_pred_probs = model.predict([X_text_test, X_num_test])
best_f1 = 0
best_thresh = 0.6

# Sweep the cut-off from 0.30 to 0.70 in 0.05 steps and keep the first value
# that reaches the highest weighted F1 (0.6 is used if no value scores above 0).
# NOTE(review): the cut-off is chosen with y_test itself, so the reported
# scores are optimistic.
for t in np.arange(0.3, 0.71, 0.05):
    y_pred_thresh = (y_pred_probs[:, 1] > t).astype(int)
    f1 = f1_score(y_test, y_pred_thresh, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

# Final predictions at the selected cut-off. Only accuracy is printed; the
# other metrics are computed and kept as variables.
y_pred_final = (y_pred_probs[:, 1] > best_thresh).astype(int)
acc = accuracy_score(y_test, y_pred_final)
precision = precision_score(y_test, y_pred_final)
recall = recall_score(y_test, y_pred_final)
f1 = f1_score(y_test, y_pred_final, average='weighted')
auc_score = roc_auc_score(y_test, y_pred_probs[:, 1])
report = classification_report(y_test, y_pred_final, digits=4)
matrix = confusion_matrix(y_test, y_pred_final)

print(" In-Domain Accuracy (Hotel):", acc)


In [None]:

#Cross domain testing result source domain Hotel and target domain Doctor
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Reshape, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns

# ✅ Paths
# Locations of the SBERT model directory and the two review CSVs.
# NOTE(review): the hotel file name contains a space after "LIWC_" while the
# doctor file name does not — confirm both match the files on disk.
sbert_model_path = '/content/drive/MyDrive/sbert_hotel_doctor_restaurant_model'
doctor_path = '/content/drive/MyDrive/LIWC_doctor_readability_cleaned.csv'
hotel_path = '/content/drive/MyDrive/LIWC_ hotel_readability_cleaned.csv'
# Column passed to SBERT, and the numeric columns fed to the second model input.
text_col = 'review'
numeric_cols = ['FleschReadingEase', 'SMOGIndex', 'DaleChall', 'sentimentv', 'WC', 'Analytic', 'Authentic', 'BigWords']

# ✅ Load SBERT
sbert = SentenceTransformer(sbert_model_path)

# ✅ Load datasets
# hotel_df is the source (training) domain, doctor_df the target (test) domain.
doctor_df = pd.read_csv(doctor_path)
hotel_df = pd.read_csv(hotel_path)

# ✅ Normalize labels
def normalize_label(x):
    """Map a raw label value onto the binary target used by the classifier.

    Args:
        x: Raw label cell. It is stringified, stripped and lower-cased before
            matching, so case and surrounding whitespace do not matter.

    Returns:
        0 for 'truthful' / 'real' / 0, 1 for 'deceptive' / 'fake' / 1, and
        None for anything else (the caller drops those rows with ``dropna``).
    """
    x = str(x).strip().lower()
    # '0.0' / '1.0' are accepted as well: pandas loads an integer label column
    # as float64 as soon as it contains a missing value, and str(0.0) == '0.0'.
    # Without this every row of such a column would map to None and be dropped.
    if x in ('truthful', 'real', '0', '0.0'): return 0
    if x in ('deceptive', 'fake', '1', '1.0'): return 1
    return None
# Show the raw label vocabulary of each file before it is normalised.
print("Hotel labels:", hotel_df['label'].unique())
print("doctor labels:", doctor_df['label'].unique())

# Replace the raw label column with normalize_label's output (0, 1 or None).
doctor_df['label'] = doctor_df['label'].apply(normalize_label)
hotel_df['label'] = hotel_df['label'].apply(normalize_label)


# Remove, in place, rows missing the text, the label or any numeric feature.
doctor_df.dropna(subset=[text_col, 'label'] + numeric_cols, inplace=True)
hotel_df.dropna(subset=[text_col, 'label'] + numeric_cols, inplace=True)
# The encoder is fitted on the source (hotel) labels and reused for the target.
le = LabelEncoder()
y_train = le.fit_transform(hotel_df['label'])
y_test = le.transform(doctor_df['label'])
# One SBERT embedding row per review.
X_text_train = sbert.encode(hotel_df[text_col].tolist(), show_progress_bar=True)
X_text_test = sbert.encode(doctor_df[text_col].tolist(), show_progress_bar=True)
# Scaling statistics come from the hotel rows only and are applied to doctor rows.
scaler = StandardScaler()
X_num_train = scaler.fit_transform(hotel_df[numeric_cols])
X_num_test = scaler.transform(doctor_df[numeric_cols])
# 'balanced' class weights computed from the training labels only.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# Keyed by position in `class_weights`; this lines up with the class ids only
# if y_train holds consecutive ids starting at 0 (as LabelEncoder produces).
class_weight_dict = {i: w for i, w in enumerate(class_weights)}

# Input widths are read from the prepared arrays; the head predicts two classes.
TEXT_DIM = X_text_train.shape[1]
NUMERIC_DIM = X_num_train.shape[1]
NUM_CLASSES = 2

text_input = Input(shape=(TEXT_DIM,), name='text_embedding_input')
numeric_input = Input(shape=(NUMERIC_DIM,), name='numeric_features_input')

# Text branch: the flat embedding is reshaped to (TEXT_DIM, 1), i.e. each
# embedding component becomes one step of a single-channel sequence, then
# passed through Conv1D -> BiLSTM -> global max pooling.
x = Reshape((TEXT_DIM, 1))(text_input)
x = Conv1D(100, kernel_size=1, activation='sigmoid', padding='same')(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPooling1D()(x)

# Numeric branch: one small dense layer, concatenated with the pooled text
# features before dropout and the softmax classifier.
x_num = Dense(4, activation='sigmoid')(numeric_input)
x_combined = Concatenate()([x, x_num])
x_combined = Dropout(0.2)(x_combined)
output = Dense(NUM_CLASSES, activation='softmax')(x_combined)

model = Model(inputs=[text_input, numeric_input], outputs=output)
# Sparse categorical cross-entropy because y_train holds integer class ids.
model.compile(optimizer=Adam(1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

# Early stopping watches validation loss and restores the best weights.
# NOTE(review): no TensorFlow seed is set in the visible code, so runs are not
# reproducible; also confirm how Keras selects the validation_split rows.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit([X_text_train, X_num_train], y_train,
          validation_split=0.1,
          epochs=5,
          batch_size=32,
          class_weight=class_weight_dict,
          callbacks=[early_stop],
          verbose=1)

# Score the doctor reviews; column 1 of the softmax output is the class-1 score.
y_pred_probs = model.predict([X_text_test, X_num_test])
best_f1 = 0
best_thresh = 0.6

# Sweep the cut-off from 0.30 to 0.70 in 0.05 steps and keep the first value
# that reaches the highest weighted F1 (0.6 is used if no value scores above 0).
# NOTE(review): the cut-off is chosen with y_test itself, so the reported
# scores are optimistic.
for t in np.arange(0.3, 0.71, 0.05):
    y_pred_thresh = (y_pred_probs[:, 1] > t).astype(int)
    f1 = f1_score(y_test, y_pred_thresh, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

# Final predictions at the selected cut-off.
y_pred_final = (y_pred_probs[:, 1] > best_thresh).astype(int)

# Only accuracy is printed; the other metrics are computed and kept as variables.
acc = accuracy_score(y_test, y_pred_final)
precision = precision_score(y_test, y_pred_final)
recall = recall_score(y_test, y_pred_final)
f1 = f1_score(y_test, y_pred_final, average='weighted')
auc_score = roc_auc_score(y_test, y_pred_probs[:, 1])
report = classification_report(y_test, y_pred_final, digits=4)
matrix = confusion_matrix(y_test, y_pred_final)

# The test set here is doctor_df, so the result is labelled Hotel → Doctor
# (the message previously said "Restaurant").
print(" Cross-domain Accuracy (Hotel → Doctor):", acc)



In [None]:
# ✅ Part 2: Readability Calculation for Restaurant Reviews

# !pip install textstat pandas

import pandas as pd
import textstat

# ✅ Load cleaned restaurant reviews and discard rows that have no review text
restaurant_df = pd.read_csv("/content/drive/MyDrive/restaurant_cleaned_reviews.csv")
restaurant_df.dropna(subset=['clean_review'], inplace=True)

# ✅ Function to calculate readability
def compute_readability(text):
    """Score one review with six textstat readability formulas.

    Args:
        text: The review text handed to each textstat function.

    Returns:
        A dict with the keys FleschReadingEase, FleschKincaidGrade, SMOGIndex,
        GunningFog, AutomatedReadability and DaleChall, in that order, each
        mapped to the value the matching textstat function returns.
    """
    scorers = (
        ("FleschReadingEase", textstat.flesch_reading_ease),
        ("FleschKincaidGrade", textstat.flesch_kincaid_grade),
        ("SMOGIndex", textstat.smog_index),
        ("GunningFog", textstat.gunning_fog),
        ("AutomatedReadability", textstat.automated_readability_index),
        ("DaleChall", textstat.dale_chall_readability_score),
    )
    return {metric_name: scorer(text) for metric_name, scorer in scorers}

# ✅ Calculate readability scores
# One dict of scores per review, expanded into one column per metric.
readability_scores = restaurant_df['clean_review'].apply(compute_readability)
readability_df = pd.DataFrame(readability_scores.tolist())

# ✅ Combine results with the original dataframe
# The review frame is re-indexed from 0 so it lines up row-for-row with
# readability_df, whose index is a fresh 0..n-1 range.
restaurant_df_final = pd.concat(
    [restaurant_df.reset_index(drop=True), readability_df],
    axis=1,
)

# ✅ Show average readability
print("📊 Average Readability Scores (restaurant Reviews):")
print(readability_df.mean())

# ✅ Optional: Save results to CSV
restaurant_df_final.to_csv(
    "/content/drive/MyDrive/restaurant_readability_cleaned.csv",
    index=False,
)

In [None]:

# Mix doctor, restaurant, and hotel dataset

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    f1_score, precision_score, recall_score, roc_auc_score, roc_curve
)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Dense, Dropout, Bidirectional, LSTM, Conv1D,
    GlobalMaxPooling1D, Reshape, Concatenate, GaussianNoise
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# -----------------------------
# Paths and config
# -----------------------------
# Locations of the SBERT model directory and the three per-domain review CSVs.
# NOTE(review): the restaurant and hotel file names contain a space after
# "LIWC_" while the doctor file name does not — confirm all three match the
# files on disk.
sbert_model_path = '/content/drive/MyDrive/sbert_hotel_doctor_restaurant_model'
doctor_path = '/content/drive/MyDrive/LIWC_doctor_readability_cleaned.csv'
restaurant_path = '/content/drive/MyDrive/LIWC_ restaurant_readability_cleaned.csv'
hotel_path = '/content/drive/MyDrive/LIWC_ hotel_readability_cleaned.csv'

# Column passed to SBERT, and the numeric columns fed to the second model input.
text_col = 'review'
numeric_cols = [
    'FleschReadingEase', 'SMOGIndex', 'DaleChall', 'sentimentv',
    'WC', 'Analytic', 'Authentic', 'BigWords'
]


# Sentence encoder plus one DataFrame per review domain.
sbert = SentenceTransformer(sbert_model_path)
doctor_df = pd.read_csv(doctor_path)
restaurant_df = pd.read_csv(restaurant_path)
hotel_df = pd.read_csv(hotel_path)


def normalize_label(x):
    """Map a raw label value onto the binary target used by the classifier.

    Args:
        x: Raw label cell. It is stringified, stripped and lower-cased before
            matching, so case and surrounding whitespace do not matter.

    Returns:
        0 for 'truthful' / 'real' / 0, 1 for 'deceptive' / 'fake' / 1, and
        None for anything else (the caller drops those rows with ``dropna``).
    """
    x = str(x).strip().lower()
    # '0.0' / '1.0' are accepted as well: pandas loads an integer label column
    # as float64 as soon as it contains a missing value, and str(0.0) == '0.0'.
    # Without this every row of such a column would map to None and be dropped.
    if x in ('truthful', 'real', '0', '0.0'):
        return 0
    if x in ('deceptive', 'fake', '1', '1.0'):
        return 1
    return None

# Normalise the labels of each domain frame and remove, in place, rows missing
# the text, the label or any numeric feature.
# NOTE(review): the loop variable `df` stays bound to hotel_df afterwards, and
# the VADER cell at the end of the notebook reads a global `df` — check that
# cell before renaming this variable.
for df in (doctor_df, restaurant_df, hotel_df):
    df['label'] = df['label'].apply(normalize_label)
    df.dropna(subset=[text_col, 'label'] + numeric_cols, inplace=True)

# (A lone `-` used to sit on its own line here; it was a SyntaxError that
# stopped the whole cell from parsing.)

# Stack the three domains into one frame with a fresh 0..n-1 index, then turn
# the labels into integer class ids.
combined_df = pd.concat([doctor_df, restaurant_df, hotel_df], ignore_index=True)
le = LabelEncoder()
y = le.fit_transform(combined_df['label'])

print("Encoding text with SBERT...")
# One SBERT embedding row per review of the combined frame.
X_text = sbert.encode(combined_df[text_col].tolist(), show_progress_bar=True)


# Seeded Gaussian noise (mean 0, std 0.05) is added to every embedding.
# NOTE(review): this also perturbs the rows that end up in the test split —
# confirm that evaluating on noised embeddings is intended.
np.random.seed(42)
X_text = X_text + np.random.normal(0, 0.05, X_text.shape)


# Split BEFORE scaling. The scaler used to be fitted on all rows, so the test
# rows' mean/variance leaked into the training features. With the same
# random_state and stratification the train/test partition itself is unchanged.
X_text_train, X_text_test, X_num_train_raw, X_num_test_raw, y_train, y_test = train_test_split(
    X_text, combined_df[numeric_cols], y, test_size=0.2, random_state=42, stratify=y
)

# Scaling statistics now come from the training rows only and are merely
# applied to the test rows.
scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_num_train_raw)
X_num_test = scaler.transform(X_num_test_raw)


# Input widths are read from the prepared arrays; the head predicts two classes.
TEXT_DIM = X_text_train.shape[1]
NUMERIC_DIM = X_num_train.shape[1]
NUM_CLASSES = 2

text_input = Input(shape=(TEXT_DIM,), name='text_input')
numeric_input = Input(shape=(NUMERIC_DIM,), name='numeric_input')


# Text branch: the flat embedding is reshaped to (TEXT_DIM, 1), i.e. each
# embedding component becomes one step of a single-channel sequence; a
# GaussianNoise layer follows, then Conv1D -> BiLSTM -> global max pooling.
x = Reshape((TEXT_DIM, 1))(text_input)
x = GaussianNoise(0.1)(x)
x = Conv1D(16, kernel_size=3, activation='relu', padding='same')(x)
x = Bidirectional(LSTM(16, return_sequences=True))(x)
x = GlobalMaxPooling1D()(x)

# Numeric branch: a single tanh dense layer.
x_num = Dense(8, activation='tanh')(numeric_input)

# combine
# Pooled text features and numeric features are concatenated, passed through
# a 0.7 dropout and fed to the softmax classifier.
x_combined = Concatenate()([x, x_num])
x_combined = Dropout(0.7)(x_combined)
output = Dense(NUM_CLASSES, activation='softmax')(x_combined)

model = Model(inputs=[text_input, numeric_input], outputs=output)
# Sparse categorical cross-entropy because y_train holds integer class ids.
model.compile(optimizer=Adam(learning_rate=3e-4),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()


# Early stopping watches validation loss and restores the best weights.
# No class_weight is passed to fit() in this cell.
# NOTE(review): no TensorFlow seed is set in the visible code, so runs are not
# reproducible.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    [X_text_train, X_num_train], y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)


# Score the held-out rows; class 1 is predicted when its softmax score
# exceeds the fixed 0.5 cut-off.
y_pred_probs = model.predict([X_text_test, X_num_test])
fake_prob = y_pred_probs[:, 1]
y_pred_final = np.greater(fake_prob, 0.5).astype(int)

# Precision and recall use scikit-learn's default averaging; F1 is
# macro-averaged. Nothing is printed here — the values are kept as variables.
acc, precision, recall = (
    metric_fn(y_test, y_pred_final)
    for metric_fn in (accuracy_score, precision_score, recall_score)
)
f1 = f1_score(y_test, y_pred_final, average='macro')
auc_score = roc_auc_score(y_test, fake_prob)
report = classification_report(y_test, y_pred_final, digits=4)
matrix = confusion_matrix(y_test, y_pred_final)



In [None]:
#training and testing on Doctor
# ✅ Imports
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Reshape, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns

# ✅ Paths
# Locations of the SBERT model directory and the doctor review CSV.
sbert_model_path = '/content/drive/MyDrive/sbert_hotel_doctor_restaurant_model'
doctor_path = '/content/drive/MyDrive/LIWC_doctor_readability_cleaned.csv'
# Column passed to SBERT, and the numeric columns fed to the second model input.
text_col = 'review'
numeric_cols = ['FleschReadingEase', 'SMOGIndex', 'DaleChall', 'sentimentv', 'WC', 'Analytic', 'Authentic', 'BigWords']

# ✅ Load SBERT
sbert = SentenceTransformer(sbert_model_path)

# ✅ Load and prepare dataset
# Single-domain experiment: train and test rows both come from this frame.
doctor_df = pd.read_csv(doctor_path)

def normalize_label(x):
    """Map a raw label value onto the binary target used by the classifier.

    Args:
        x: Raw label cell. It is stringified, stripped and lower-cased before
            matching, so case and surrounding whitespace do not matter.

    Returns:
        0 for 'truthful' / 'real' / 0, 1 for 'deceptive' / 'fake' / 1, and
        None for anything else (the caller drops those rows with ``dropna``).
    """
    x = str(x).strip().lower()
    # '0.0' / '1.0' are accepted as well: pandas loads an integer label column
    # as float64 as soon as it contains a missing value, and str(0.0) == '0.0'.
    # Without this every row of such a column would map to None and be dropped.
    if x in ('truthful', 'real', '0', '0.0'): return 0
    if x in ('deceptive', 'fake', '1', '1.0'): return 1
    return None

# Replace the raw label column with normalize_label's output (0, 1 or None),
# then remove, in place, rows missing the text, the label or any numeric feature.
doctor_df['label'] = doctor_df['label'].apply(normalize_label)
doctor_df.dropna(subset=[text_col, 'label'] + numeric_cols, inplace=True)

# ✅ Train-test split (same domain)
# Stratified 80/20 split with a fixed seed so the partition is repeatable.
train_df, test_df = train_test_split(doctor_df, test_size=0.2, random_state=42, stratify=doctor_df['label'])

# ✅ Encode labels
# The encoder is fitted on the training labels and reused for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(train_df['label'])
y_test = le.transform(test_df['label'])

# ✅ SBERT embeddings
# One embedding row per review.
X_text_train = sbert.encode(train_df[text_col].tolist(), show_progress_bar=True)
X_text_test = sbert.encode(test_df[text_col].tolist(), show_progress_bar=True)

# ✅ Numeric features
# Scaling statistics come from the training rows only.
scaler = StandardScaler()
X_num_train = scaler.fit_transform(train_df[numeric_cols])
X_num_test = scaler.transform(test_df[numeric_cols])

# ✅ Compute class weights
# 'balanced' weights are computed from the training labels only.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# Keyed by position in `class_weights`; this lines up with the class ids only
# if y_train holds consecutive ids starting at 0 (as LabelEncoder produces).
class_weight_dict = {i: w for i, w in enumerate(class_weights)}

# ✅ Model architecture
# Input widths are read from the prepared arrays; the head predicts two classes.
TEXT_DIM = X_text_train.shape[1]
NUMERIC_DIM = X_num_train.shape[1]
NUM_CLASSES = 2

text_input = Input(shape=(TEXT_DIM,), name='text_embedding_input')
numeric_input = Input(shape=(NUMERIC_DIM,), name='numeric_features_input')

# Text branch: the flat embedding is reshaped to (TEXT_DIM, 1), i.e. each
# embedding component becomes one step of a single-channel sequence, then
# passed through Conv1D -> BiLSTM -> global max pooling.
x = Reshape((TEXT_DIM, 1))(text_input)
x = Conv1D(128, kernel_size=3, activation='relu', padding='same')(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPooling1D()(x)

# Numeric branch: one dense layer, concatenated with the pooled text features
# before dropout and the softmax classifier.
x_num = Dense(64, activation='relu')(numeric_input)
x_combined = Concatenate()([x, x_num])
x_combined = Dropout(0.5)(x_combined)
output = Dense(NUM_CLASSES, activation='softmax')(x_combined)

model = Model(inputs=[text_input, numeric_input], outputs=output)
# Sparse categorical cross-entropy because y_train holds integer class ids.
model.compile(optimizer=Adam(1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# ✅ Train model
# Early stopping watches validation loss and restores the best weights.
# NOTE(review): no TensorFlow seed is set in the visible code, so runs are not
# reproducible.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit([X_text_train, X_num_train], y_train,
          validation_split=0.2,
          epochs=10,
          batch_size=10,
          class_weight=class_weight_dict,
          callbacks=[early_stop],
          verbose=1)

# ✅ Predict and evaluate
# Column 1 of the softmax output is the score for class 1.
y_pred_probs = model.predict([X_text_test, X_num_test])
fake_prob = y_pred_probs[:, 1]

# Sweep the cut-off from 0.30 to 0.70 in 0.05 steps and keep the first value
# that reaches the highest weighted F1 (0.5 is used if no value scores above 0).
# NOTE(review): the cut-off is chosen with y_test itself, so the scores
# reported below are optimistic.
best_thresh, best_f1 = 0.5, 0
for t in np.arange(0.3, 0.71, 0.05):
    y_pred_thresh = (fake_prob > t).astype(int)
    f1 = f1_score(y_test, y_pred_thresh, average='weighted')
    if f1 <= best_f1:
        continue
    best_thresh, best_f1 = t, f1

y_pred_final = (fake_prob > best_thresh).astype(int)

# ✅ Metrics
# Precision and recall use scikit-learn's default averaging; F1 is weighted.
acc, precision, recall = (
    metric_fn(y_test, y_pred_final)
    for metric_fn in (accuracy_score, precision_score, recall_score)
)
f1 = f1_score(y_test, y_pred_final, average='weighted')
auc_score = roc_auc_score(y_test, fake_prob)
report = classification_report(y_test, y_pred_final, digits=4)
matrix = confusion_matrix(y_test, y_pred_final)

# ✅ Print results
for caption, value in (
    ("✅ In-Domain Accuracy (Doctor):", acc),
    ("✅ Best Threshold:", best_thresh),
    ("✅ Precision:", precision),
    ("✅ Recall:", recall),
    ("✅ F1 Score (weighted):", f1),
    ("✅ AUC Score:", auc_score),
    ("\n✅ Classification Report:\n", report),
    ("✅ Confusion Matrix:\n", matrix),
):
    print(caption, value)

# ✅ Confusion matrix plot
# Rows are actual classes, columns are predicted classes.
class_names = ['Real', 'Fake']
plt.figure(figsize=(6, 4))
sns.heatmap(
    matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix Heatmap')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# ✅ ROC Curve
# The dashed diagonal is the chance-level reference line.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs[:, 1])
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f"AUC = {auc_score:.4f}")
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import seaborn as sns
# Single shared analyzer instance, reused for every review.
sent_i = SentimentIntensityAnalyzer()

def vadar_sentiment(text):
    """Return the 'compound' entry of the VADER polarity scores for `text`.

    Uses the module-level `sent_i` analyzer from the vaderSentiment package.
    """
    polarity = sent_i.polarity_scores(text)
    return polarity['compound']

# create new column for vadar compound sentiment score
# NOTE(review): `df` is not defined anywhere in this cell. On a fresh kernel
# this raises NameError; the only visible binding of `df` in the notebook is
# the loop variable of the mixed-dataset cell. Which frame is meant, and which
# column position 6 (`iloc[:, 6]`) refers to, cannot be told from here —
# confirm and select the column by name.
df['vadar compound'] = df.iloc[:,6].apply(vadar_sentiment)

def categorise_sentiment(sentiment, neg_threshold=-0.05, pos_threshold=0.05):
    """Bucket a sentiment score into a string label.

    Args:
        sentiment: Numeric sentiment score to classify.
        neg_threshold: Scores strictly below this value are 'negative'.
        pos_threshold: Scores strictly above this value are 'positive'.

    Returns:
        'negative', 'positive', or 'neutral' for everything in between
        (both thresholds themselves count as 'neutral').
    """
    if sentiment < neg_threshold:
        return 'negative'
    if sentiment > pos_threshold:
        return 'positive'
    return 'neutral'

# new col with vadar sentiment label based on vadar compound score
# Each compound score is mapped to 'negative' / 'neutral' / 'positive' using
# categorise_sentiment's default thresholds.
# NOTE(review): relies on the same undefined-in-this-cell `df` as the
# 'vadar compound' assignment above.
df['vadar sentiment'] = df['vadar compound'].apply(categorise_sentiment)