In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


In [15]:
# Read the TF-IDF feature table from disk. Besides the per-term columns it
# carries a `label` column, which later cells use as the target.
df_tfidf = pd.read_csv(filepath_or_buffer='tfidf.csv')

# Preview the first five rows as the cell output.
df_tfidf.head(n=5)

Unnamed: 0,absa,access,account,action,activity,actually,address,administrator,age,alias,...,well.1,wetransfer,window.1,word,world,wrong.1,ximian,year,zzzzteana,label
0,0.0,0.022725,0.009054,0.0,0.014582,0.014001,0.0,0.014705,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.250855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.064209,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.228481,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [16]:
# Inputs are the TF-IDF term columns; the output is `label`.
#
# The head() preview of this frame shows a column named 'Unnamed: 0', which is
# the name pandas assigns to an unnamed first CSV column (typically a row
# index that was written out together with the data). A row counter is not a
# text feature, and it can leak the target if the file is ordered by class, so
# it is excluded from X. errors='ignore' keeps the cell working when the CSV
# has no such column.
X_tfidf = df_tfidf.drop(columns=['label', 'Unnamed: 0'], errors='ignore')
y_tfidf = df_tfidf['label']

# Split the dataset into training and test sets: 20% of the rows are held out,
# and the fixed random_state makes the split repeatable across runs.
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf, y_tfidf, test_size=0.2, random_state=42
)

In [17]:
# L2-penalised logistic regression trained on the TF-IDF training split.
# The hyperparameters are spelled out explicitly so they are easy to compare
# with the other models in this notebook.
logreg_tfidf = LogisticRegression(
    penalty='l2', C=1.0, solver='lbfgs', max_iter=100, class_weight=None
).fit(X_train_tfidf, y_train_tfidf)

# fit() returns the estimator itself, so echoing it shows the same repr.
logreg_tfidf


LogisticRegression()

In [18]:
# Hard class predictions for the held-out TF-IDF test rows; the evaluation
# cell compares these against y_test_tfidf.
y_pred_tfidf = logreg_tfidf.predict(X_test_tfidf)

Evaluation

In [19]:
# Scores computed from the hard predictions, evaluated in a fixed order:
# accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    scorer(y_test_tfidf, y_pred_tfidf)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# ROC-AUC is computed from the second column of predict_proba rather than
# from the hard predictions.
y_pred_prob = logreg_tfidf.predict_proba(X_test_tfidf)[:, 1]
roc_auc = roc_auc_score(y_test_tfidf, y_pred_prob)

# Unpack the 2x2 confusion matrix row by row: first row is (tn, fp),
# second row is (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test_tfidf, y_pred_tfidf)

# Error rates expressed as percentages.
fpr_percentage = fp / (fp + tn) * 100
fnr_percentage = fn / (fn + tp) * 100

# Report everything in one block, one metric per line.
print("\n".join([
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
    f"AUC-ROC: {roc_auc:.2f}",
    f"False Positive Rate (FPR): {fpr_percentage:.2f}%",
    f"False Negative Rate (FNR): {fnr_percentage:.2f}%",
]))

Accuracy: 0.97
Precision: 0.97
Recall: 0.98
F1-score: 0.97
AUC-ROC: 0.99
False Positive Rate (FPR): 3.42%
False Negative Rate (FNR): 1.58%


WORD2VEC

In [20]:
# Load the Word2Vec feature table; it carries a `label` column used as the
# target below.
df_word2vec = pd.read_csv('word2Vec.csv')

# Inputs are all remaining columns (presumably the Word2Vec vector
# components); the output is the label.
# 'Unnamed: 0' is the name pandas assigns to an unnamed first CSV column,
# typically a row index saved with the data. If this file has one it must not
# be used as a feature, so it is dropped; errors='ignore' makes this a no-op
# when the column is absent.
X_word2vec = df_word2vec.drop(columns=['label', 'Unnamed: 0'], errors='ignore')
y_word2vec = df_word2vec['label']

# Split the dataset into training and test sets: 20% held out, fixed seed so
# the split is repeatable.
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(
    X_word2vec, y_word2vec, test_size=0.2, random_state=42
)

# L2-penalised logistic regression; max_iter is raised to 1000 here (the
# TF-IDF model uses 100).
logreg_word2vec = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=1000,
    class_weight=None,
)
logreg_word2vec.fit(X_train_word2vec, y_train_word2vec)

# Hard class predictions for the held-out rows, consumed by the evaluation cell.
y_pred_logreg = logreg_word2vec.predict(X_test_word2vec)

Evaluation

In [21]:
# Scores computed from the hard predictions, evaluated in a fixed order:
# accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    scorer(y_test_word2vec, y_pred_logreg)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# ROC-AUC is computed from the second column of predict_proba rather than
# from the hard predictions.
y_pred_prob = logreg_word2vec.predict_proba(X_test_word2vec)[:, 1]
roc_auc = roc_auc_score(y_test_word2vec, y_pred_prob)

# Unpack the 2x2 confusion matrix row by row: first row is (tn, fp),
# second row is (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test_word2vec, y_pred_logreg)

# Error rates expressed as percentages.
fpr_percentage = fp / (fp + tn) * 100
fnr_percentage = fn / (fn + tp) * 100

# Report everything in one block, one metric per line.
print("\n".join([
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
    f"AUC-ROC: {roc_auc:.2f}",
    f"False Positive Rate (FPR): {fpr_percentage:.2f}%",
    f"False Negative Rate (FNR): {fnr_percentage:.2f}%",
]))

Accuracy: 0.93
Precision: 0.92
Recall: 0.93
F1-score: 0.93
AUC-ROC: 0.98
False Positive Rate (FPR): 8.05%
False Negative Rate (FNR): 6.69%


Custom Features

In [28]:
# NOTE(review): this cell reuses the `*_tfidf` variable names for the
# custom-feature data and model, so once it runs, the TF-IDF frame, split and
# classifier created earlier in the notebook are overwritten. The names are
# kept unchanged so anything that reads them keeps working; dedicated
# `*_custom` names would avoid the hidden-state hazard.
df_tfidf = pd.read_csv('custom_features.csv')

# Input is all the features, and output is label.
# 'Unnamed: 0' is the name pandas assigns to an unnamed first CSV column,
# typically a row index saved with the data. If this file has one it must not
# be used as a feature, so it is dropped; errors='ignore' makes this a no-op
# when the column is absent.
X_tfidf = df_tfidf.drop(columns=['label', 'Unnamed: 0'], errors='ignore')
y_tfidf = df_tfidf['label']

# Split the dataset into training and test sets: 20% held out, fixed seed so
# the split is repeatable.
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf, y_tfidf, test_size=0.2, random_state=42
)

# Fit the min-max scaler on the training rows only, then apply that same
# fitted mapping to the test rows, so no test-set statistics reach the model.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_tfidf)
X_test_scaled = scaler.transform(X_test_tfidf)

# The 'saga' solver shuffles the data while optimising, so it is seeded here;
# without random_state the fitted coefficients (and the metrics below) are
# not guaranteed to be identical from run to run.
logreg_tfidf = LogisticRegression(
    penalty='l2',
    C=10.0,
    solver='saga',
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)
logreg_tfidf.fit(X_train_scaled, y_train_tfidf)
y_pred_tfidf = logreg_tfidf.predict(X_test_scaled)

# Evaluation metrics computed from the hard predictions.
accuracy = accuracy_score(y_test_tfidf, y_pred_tfidf)
precision = precision_score(y_test_tfidf, y_pred_tfidf)
recall = recall_score(y_test_tfidf, y_pred_tfidf)
f1 = f1_score(y_test_tfidf, y_pred_tfidf)

# ROC-AUC is computed from the second column of predict_proba on the scaled
# test features.
y_pred_prob = logreg_tfidf.predict_proba(X_test_scaled)[:, 1]
roc_auc = roc_auc_score(y_test_tfidf, y_pred_prob)

# False Positive Rate (FPR) and False Negative Rate (FNR) come from the
# flattened 2x2 confusion matrix, which unpacks as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test_tfidf, y_pred_tfidf).ravel()

# False Positive Rate (FPR) as percentage
fpr_percentage = (fp / (fp + tn)) * 100

# False Negative Rate (FNR) as percentage
fnr_percentage = (fn / (fn + tp)) * 100

# Print results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"AUC-ROC: {roc_auc:.2f}")
print(f"False Positive Rate (FPR): {fpr_percentage:.2f}%")
print(f"False Negative Rate (FNR): {fnr_percentage:.2f}%")

Accuracy: 0.81
Precision: 0.82
Recall: 0.78
F1-score: 0.80
AUC-ROC: 0.89
False Positive Rate (FPR): 17.12%
False Negative Rate (FNR): 21.65%
