LLM and Random Forest

In [19]:
import json
import re
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics import classification_report

# Load LLM Predictions
def load_llm_predictions(file_path):
    detected_classes = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            match = re.search(r'\{.*?\}', line)  # Extract JSON-like content
            if match:
                try:
                    data = json.loads(match.group())
                    detected_classes.append(data.get("detected_classes", []))
                except json.JSONDecodeError:
                    continue
    return detected_classes

# Generate Features from Predictions
def create_feature_dataframe(detected_classes):
    df = pd.DataFrame({"llm_prediction": detected_classes})
    df.fillna("unknown", inplace=True)
    
    # One-hot encode detected classes
    mlb = MultiLabelBinarizer()
    class_features = pd.DataFrame(mlb.fit_transform(df["llm_prediction"]), columns=mlb.classes_)
    df = df.join(class_features)
    
    # Additional Feature Engineering
    df["num_classes_detected"] = df["llm_prediction"].apply(len)
    return df.drop(columns=["llm_prediction"]), mlb

# Load Data
file_path = "/home/aagisha/Documents/PhD Albert/CodingPhDStart1/Bachelor Projects HSAA/PIIDETECTION/Personal-Detection/GPT/gpt_predictions/dessi-mf_results.txt"
detected_classes = load_llm_predictions(file_path)

# Convert to Features
X, mlb = create_feature_dataframe(detected_classes)

# Simulate Labels (for training purposes, replace with real labels if available)
y = (X["full_name"] | X["email"] | X["phone_number"] | X["credit_card_number"]).astype(int)  # Personal Data Indicator

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# Predictions
y_pred = rf.predict(X_test_scaled)

# Evaluation
print(classification_report(y_test, y_pred))

# Function to Enhance LLM Predictions
def refine_llm_predictions(new_data):
    new_features, _ = create_feature_dataframe(new_data)
    new_features_scaled = scaler.transform(new_features)
    rf_predictions = rf.predict(new_features_scaled)
    return ["personal_data" if pred else "non_personal_data" for pred in rf_predictions]

# Apply Refinement
refined_predictions = refine_llm_predictions(detected_classes)
print(refined_predictions[:10])


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00         3

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

['non_personal_data', 'non_personal_data', 'non_personal_data', 'non_personal_data', 'non_personal_data', 'personal_data', 'personal_data', 'non_personal_data', 'non_personal_data', 'non_personal_data']


In [20]:
len(refined_predictions)

49

In [21]:
len(detected_classes)

49

In [22]:
X.shape

(49, 27)

In [7]:
y

0     0
1     0
2     0
3     0
4     0
5     1
6     1
7     0
8     0
9     0
10    1
11    0
12    1
13    0
14    0
15    0
16    0
17    1
18    1
19    0
20    1
21    0
22    1
23    0
24    1
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    1
33    0
34    0
35    0
36    0
37    1
38    0
39    0
40    0
41    0
42    0
43    0
44    1
45    1
46    0
47    0
48    0
dtype: int64

In [13]:
file_path = "/home/aagisha/Documents/PhD Albert/CodingPhDStart1/Bachelor Projects HSAA/PIIDETECTION/Personal-Detection/GPT/gpt_predictions/dessi-mf_results.txt"

# Read and display all lines
with open(file_path, "r", encoding="utf-8") as f:
    i=0
    for line in f:
        print(line.strip())
        i+=1
    print(i)


```json{detected_classes: ['drink']}```
{detected_classes: ['date']}
{detected_classes: ['city']}
{detected_classes: ['credit_card_provider']}
```json{ "detected_classes": ["political_views"] }```
{detected_classes: ['resolution']}
```json{detected_classes: ['integer_number']}```
{detected_classes: [dish]}
{detected_classes: ['gender']}
{detected_classes: ['iban']}
{detected_classes: [credit_card_provider]}
{detected_classes: ['phone_number']}
{detected_classes: [company]}
{detected_classes: ['phone_model']}
{detected_classes: ['system_quality_attribute']}
{detected_classes: ['religion/worldview']}
{detected_classes: ['religion/worldview']}
{'detected_classes': []}
{detected_classes: ['credit_card_number']}
{detected_classes: ['iban']}
{detected_classes: [cpu]}
{detected_classes: ['integer_number']}
{detected_classes: [answer]}
{detected_classes: ['blood_group']}
{detected_classes: [job]}
{detected_classes: ['credit_card_number']}
{detected_classes: [color]}
{detected_classes: ['full_

In [23]:
import os

# file_path = "dessi-mf_results.txt"

# Get file size in bytes
file_size = os.path.getsize(file_path) / (1024 * 1024)  # Convert to MB
print(f"File Size: {file_size:.2f} MB")


File Size: 0.02 MB


In [24]:
# file_path = "dessi-mf_results.txt"

# Read and display the first 10 lines
with open(file_path, "r", encoding="utf-8") as f:
    for _ in range(10):
        print(f.readline().strip())



```json{detected_classes: ['drink']}```
{detected_classes: ['date']}
{detected_classes: ['city']}
{detected_classes: ['credit_card_provider']}
```json{ "detected_classes": ["political_views"] }```
{detected_classes: ['resolution']}
```json{detected_classes: ['integer_number']}```
{detected_classes: [dish]}
{detected_classes: ['gender']}


In [25]:

with open(file_path, "r", encoding="utf-8") as f:
    total_lines = sum(1 for _ in f)

print(f"Total Lines: {total_lines}")


Total Lines: 547


In [None]:
def create_feature_dataframe_no_aggregation(detected_classes):
    df = pd.DataFrame({"llm_prediction": detected_classes})
    df.fillna("unknown", inplace=True)

    # One-hot encode detected classes while keeping all rows
    mlb = MultiLabelBinarizer(sparse_output=False)
    class_features = mlb.fit_transform(df["llm_prediction"])

    # Convert to DataFrame while ensuring each row is maintained
    class_features_df = pd.DataFrame(class_features, columns=mlb.classes_, index=df.index)

    # Combine with original data (without removing duplicates)
    df = df.join(class_features_df)

    # Add extra feature: How many classes were detected per row
    df["num_classes_detected"] = df["llm_prediction"].apply(len)batchlan


In [27]:
# Load and process data while preserving row count
X_no_agg, mlb = create_feature_dataframe_no_aggregation(detected_classes)

# Check the new shape
print(X_no_agg.shape)  # Should be (547, N) where N is the number of unique categories + 1


(49, 27)


In [28]:
X_no_agg

Unnamed: 0,academic_degree/title,answer,blood_group,city,credit_card_number,date,email,float_number,full_address,full_name,...,phone_number,political_views,programming_language,religion/worldview,resolution,system_quality_attribute,url,user_agent,version,num_classes_detected
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [29]:
mlb

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report

# Load Data
file_path = "/home/aagisha/Documents/PhD Albert/CodingPhDStart1/Bachelor Projects HSAA/PIIDETECTION/Personal-Detection/GPT/results_own_datadessimf.csv"
df = pd.read_csv(file_path)

# Drop unnecessary columns
df.drop(columns=["Unnamed: 0"], inplace=True)

# Fill missing values in 'Predicted Classes'
df["Predicted Classes"].fillna("unknown", inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
df["True Label"] = label_encoder.fit_transform(df["True Label"])  # Encode 'personal' as 1, 'non-personal' as 0
df["Prediction"] = label_encoder.transform(df["Prediction"])
df["Dataset"] = label_encoder.fit_transform(df["Dataset"])
df["Predicted Classes"] = label_encoder.fit_transform(df["Predicted Classes"])

# Feature Engineering (Using Encoded Categorical Variables)
X = df[["Prediction", "Dataset", "Predicted Classes"]]
y = df["True Label"]

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rf = RandomForestClassifier(n_estimators=100, random_state=42)batchlan
rf.fit(X_train_scaled, y_train)

# Predictions
y_pred = rf.predict(X_test_scaled)

# Evaluation
print(classification_report(y_test, y_pred))

# Function to Enhance Detection
def refine_predictions(new_data):
    new_data_scaled = scaler.transform(new_data)  # Directly scale numeric features
    rf_predictions = rf.predict(new_data_scaled)  # Make predictions
    return label_encoder.inverse_transform(rf_predictions)  # Convert back to labels

# Apply Refinement
refined_predictions = refine_predictions(X_test)
print(len(refined_predictions))



              precision    recall  f1-score   support

           0       0.98      1.00      0.99        50
           1       1.00      0.98      0.99        60

    accuracy                           0.99       110
   macro avg       0.99      0.99      0.99       110
weighted avg       0.99      0.99      0.99       110

110


In [6]:
df

Unnamed: 0,Column,True Label,Prediction,Classes,Predicted Classes,Dataset
0,drink_mixed_mimesis,0,0,drink_mixed,18,2
1,MnTvnRO2qGFq56,0,0,date_fr_FR,14,1
2,city_en_mimesis,0,0,city_en,6,2
3,credit_card_provider_de_DE_faker,0,0,credit_card_provider_de_DE,12,1
4,48i2GDrmZhvkUDPqTlaV,1,1,political_views_de,47,2
...,...,...,...,...,...,...
541,Gqi7w4e4pxs,1,1,last_name_mixed,37,2
542,phone_model_fr_mimesis,0,0,phone_model_fr,45,2
543,sex_en_faker,1,1,sex_en,28,1
544,XKANObF4OIs,1,1,worldview_fr,50,2


In [7]:
y

0      0
1      0
2      0
3      0
4      1
      ..
541    1
542    0
543    1
544    1
545    0
Name: True Label, Length: 546, dtype: int64