## Train on all data V1

In [44]:
import pandas as pd
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Load the JSON dataset
def load_json_dataset(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return pd.DataFrame(data)

# Load and preprocess the dataset
file_path = 'my_env/data/train.json'  # Replace with your actual file path for the dataset

df = load_json_dataset(file_path)

# Encode labels
label_map = {label: idx for idx, label in enumerate(df['label'].unique())}
df['label'] = df['label'].map(label_map)

# Extract features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['claim']).toarray()
y = df['label']

# Handle imbalanced data with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Using optimized parameters for RandomForest
best_rf = RandomForestClassifier(
    max_depth=None,  # Limited depth
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=200,  # Reduced number of trees
    class_weight='balanced',
    n_jobs=-1  # Use all available cores
)

# Train the model with the optimized parameters
best_rf.fit(X_resampled, y_resampled)

# Save the model and preprocessing objects
with open('all_data_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf, model_file)

with open('all_data_tfidf_vectorizer.pkl', 'wb') as tfidf_file:
    pickle.dump(vectorizer, tfidf_file)

with open('all_data_label_map.pkl', 'wb') as label_map_file:
    pickle.dump(label_map, label_map_file)

print("Model and preprocessing objects saved.")


Model and preprocessing objects saved.


## Train on all data V2
removed SMOTE

In [1]:
import pandas as pd
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Load the JSON dataset
def load_json_dataset(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return pd.DataFrame(data)

# Load and preprocess the dataset
file_path = 'my_env/data/train.json'  # Replace with your actual file path for the dataset

df = load_json_dataset(file_path)

# Encode labels
label_map = {label: idx for idx, label in enumerate(df['label'].unique())}
df['label'] = df['label'].map(label_map)

# Extract features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['claim']).toarray()
y = df['label']

# Handle imbalanced data with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Using optimized parameters for RandomForest
best_rf = RandomForestClassifier(
    max_depth=None,  # Limited depth
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=200,  # Reduced number of trees
    class_weight='balanced',
    n_jobs=-1  # Use all available cores
)

# Train the model with the optimized parameters
best_rf.fit(X, y)

# Save the model and preprocessing objects
version = "V2"

with open(f'my_env/tf-idf_data_and_models/models/all_data_model_{version}.pkl', 'wb') as model_file:
    pickle.dump(best_rf, model_file)

with open(f'my_env/tf-idf_data_and_models/models/all_data_tfidf_vectorizer_{version}.pkl', 'wb') as tfidf_file:
    pickle.dump(vectorizer, tfidf_file)

with open(f'my_env/tf-idf_data_and_models/models/all_data_label_map_{version}.pkl', 'wb') as label_map_file:
    pickle.dump(label_map, label_map_file)

print("Model and preprocessing objects saved.")


Model and preprocessing objects saved.


## Model trained on NEE / CP V1

In [None]:
import pandas as pd
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Load the JSON dataset
def load_json_dataset(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return pd.DataFrame(data)

# Load and preprocess the dataset
file_path = 'my_env/data/nee_cp_from_train.json'  # Replace with your actual file path for the dataset

df = load_json_dataset(file_path)

# Encode labels
label_map = {label: idx for idx, label in enumerate(df['label'].unique())}
df['label'] = df['label'].map(label_map)

# Extract features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['claim']).toarray()
y = df['label']

# Handle imbalanced data with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Using optimized parameters for RandomForest
best_rf = RandomForestClassifier(
    max_depth=None,  # Limited depth
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=200,  # Reduced number of trees
    class_weight='balanced',
    n_jobs=-1  # Use all available cores
)

# Train the model with the optimized parameters
best_rf.fit(X_resampled, y_resampled)

# Save the model and preprocessing objects
version = ""
with open(f'my_env/tf-idf_data_and_models/models/nee_cp_model_{version}.pkl', 'wb') as secondary_model_file:
    pickle.dump(best_rf, secondary_model_file)

with open(f'my_env/tf-idf_data_and_models/models/nee_cptfidf_vectorizer_{version}.pkl', 'wb') as secondary_tfidf_file:
    pickle.dump(vectorizer, secondary_tfidf_file)

with open(f'my_env/tf-idf_data_and_models/models/nee_cplabel_map_{version}.pkl', 'wb') as secondary_label_map_file:
    pickle.dump(label_map, secondary_label_map_file)

print("Model and preprocessing objects saved.")


## Model trained on NEE / CP V2
remove SMOTE

In [None]:
import pandas as pd
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Load the JSON dataset
def load_json_dataset(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return pd.DataFrame(data)

# Load and preprocess the dataset
file_path = 'my_env/data/nee_cp_from_train.json'  # Replace with your actual file path for the dataset

df = load_json_dataset(file_path)

# Encode labels
label_map = {label: idx for idx, label in enumerate(df['label'].unique())}
df['label'] = df['label'].map(label_map)

# Extract features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['claim']).toarray()
y = df['label']

# Handle imbalanced data with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Using optimized parameters for RandomForest
best_rf = RandomForestClassifier(
    max_depth=None,  # Limited depth
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=200,  # Reduced number of trees
    class_weight='balanced',
    n_jobs=-1  # Use all available cores
)

# Train the model with the optimized parameters
best_rf.fit(X, y)

# Save the model and preprocessing objects
version = "V2"
with open(f'my_env/tf-idf_data_and_models/models/nee_cp_model_{version}.pkl', 'wb') as secondary_model_file:
    pickle.dump(best_rf, secondary_model_file)

with open(f'my_env/tf-idf_data_and_models/models/nee_cptfidf_vectorizer_{version}.pkl', 'wb') as secondary_tfidf_file:
    pickle.dump(vectorizer, secondary_tfidf_file)

with open(f'my_env/tf-idf_data_and_models/models/nee_cplabel_map_{version}.pkl', 'wb') as secondary_label_map_file:
    pickle.dump(label_map, secondary_label_map_file)

print("Model and preprocessing objects saved.")


## Classification & Accuracy Report *not accurate

accuracy report is weird, switch to new method 

In [24]:
import pandas as pd
import json
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import numpy as np

# Load the main model and preprocessing objects
with open('all_data_model.pkl', 'rb') as model_file:
    main_rf = pickle.load(model_file)

with open('all_data_tfidf_vectorizer.pkl', 'rb') as tfidf_file:
    main_vectorizer = pickle.load(tfidf_file)

with open('all_data_label_map.pkl', 'rb') as label_map_file:
    main_label_map = pickle.load(label_map_file)

# Load the secondary model and preprocessing objects
with open('nee_cp_model.pkl', 'rb') as secondary_model_file:
    secondary_rf = pickle.load(secondary_model_file)

with open('nee_cptfidf_vectorizer.pkl', 'rb') as secondary_tfidf_file:
    secondary_vectorizer = pickle.load(secondary_tfidf_file)

with open('nee_cplabel_map.pkl', 'rb') as secondary_label_map_file:
    secondary_label_map = pickle.load(secondary_label_map_file)

# Inverse label maps
main_label_map_inverse = {v: k for k, v in main_label_map.items()}
secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

# Preprocess claims using the respective vectorizers
def preprocess_claim(claim, vectorizer):
    return vectorizer.transform([claim]).toarray()

# First step: Classify claims using the main model
def classify_main(claim):
    features = preprocess_claim(claim, main_vectorizer)
    prediction = main_rf.predict(features)
    probability_distribution = main_rf.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence

# Second step: Classify claims using the secondary model
def classify_secondary(claim):
    features = preprocess_claim(claim, secondary_vectorizer)
    prediction = secondary_rf.predict(features)
    return prediction[0]

# Load the JSON dataset
def load_json_dataset(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return pd.DataFrame(data)

# Function to calculate accuracy per label
def calculate_combined_accuracy(df):
    correct_predictions = {label: 0 for label in main_label_map.values()}
    correct_predictions.update({label: 0 for label in secondary_label_map.values()})
    total_predictions = {label: 0 for label in main_label_map.values()}
    total_predictions.update({label: 0 for label in secondary_label_map.values()})
    
    y_true = []
    y_pred = []
    
    for index, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, confidence = classify_main(claim)
        
        if main_label_map_inverse[predicted_label] in ['Supported', 'Refuted'] and confidence >= 0.60:
            y_true.append(actual_label)
            y_pred.append(predicted_label)
            total_predictions[actual_label] += 1
            if predicted_label == actual_label:
                correct_predictions[actual_label] += 1
        else:
            secondary_predicted_label = classify_secondary(claim)
            # Ensure the secondary predicted label is mapped correctly
            secondary_predicted_label_mapped = list(main_label_map.values()).index(secondary_predicted_label + 2)
            y_true.append(actual_label)
            y_pred.append(secondary_predicted_label_mapped)
            total_predictions[actual_label] += 1
            if secondary_predicted_label_mapped == actual_label:
                correct_predictions[actual_label] += 1
    
    labels = list(main_label_map.values())[:2] + [v + 2 for v in secondary_label_map.values()]
    target_names = list(main_label_map_inverse.values())[:2] + list(secondary_label_map_inverse.values())
    
    accuracy_per_label = classification_report(y_true, y_pred, labels=labels, target_names=target_names)
    
    accuracy_dict = {main_label_map_inverse[label]: correct_predictions[label] / total_predictions[label] if total_predictions[label] > 0 else 0 for label in main_label_map.values()}
    accuracy_dict.update({secondary_label_map_inverse[label]: correct_predictions[label] / total_predictions[label] if total_predictions[label] > 0 else 0 for label in secondary_label_map.values()})
    
    return accuracy_per_label, accuracy_dict

# Load and preprocess the dataset
file_path = 'my_env/data/dev.json'  # Replace with your actual file path for the dataset
df = load_json_dataset(file_path)

# Encode labels in the dataframe
df['label'] = df['label'].map(main_label_map).fillna(df['label'].map(secondary_label_map))

# Calculate combined accuracy per label
combined_accuracy_per_label, accuracy_dict = calculate_combined_accuracy(df)

# Display the results
print(combined_accuracy_per_label)
for label, accuracy in accuracy_dict.items():
    print(f"Accuracy for '{label}': {accuracy:.2f}")


                                    precision    recall  f1-score   support

                         Supported       0.92      0.09      0.16       122
                           Refuted       0.68      0.60      0.64       305
Conflicting Evidence/Cherrypicking       0.18      0.39      0.24        38
               Not Enough Evidence       0.18      0.69      0.28        35

                          accuracy                           0.47       500
                         macro avg       0.49      0.44      0.33       500
                      weighted avg       0.67      0.47      0.47       500

Accuracy for 'Supported': 0.09
Accuracy for 'Refuted': 0.60
Accuracy for 'Conflicting Evidence/Cherrypicking': 0.09
Accuracy for 'Not Enough Evidence': 0.60


## Both Models on Dev set V1

In [None]:
import pandas as pd
import json
import pickle
import numpy as np

# Load the main model and preprocessing objects
with open('my_env/tf-idf_data_and_models/models/all_data_model.pkl', 'rb') as model_file:
    main_rf = pickle.load(model_file)

with open('my_env/tf-idf_data_and_models/models/all_data_tfidf_vectorizer.pkl', 'rb') as tfidf_file:
    main_vectorizer = pickle.load(tfidf_file)

with open('my_env/tf-idf_data_and_models/models/all_data_label_map.pkl', 'rb') as label_map_file:
    main_label_map = pickle.load(label_map_file)

# Load the secondary model and preprocessing objects
with open('my_env/tf-idf_data_and_models/models/nee_cp_model.pkl', 'rb') as secondary_model_file:
    secondary_rf = pickle.load(secondary_model_file)

with open('my_env/tf-idf_data_and_models/models/nee_cptfidf_vectorizer.pkl', 'rb') as secondary_tfidf_file:
    secondary_vectorizer = pickle.load(secondary_tfidf_file)

with open('my_env/tf-idf_data_and_models/models/nee_cplabel_map.pkl', 'rb') as secondary_label_map_file:
    secondary_label_map = pickle.load(secondary_label_map_file)

# Inverse label maps
main_label_map_inverse = {v: k for k, v in main_label_map.items()}
secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

# Preprocess claims using the respective vectorizers
def preprocess_claim(claim, vectorizer):
    return vectorizer.transform([claim]).toarray()

# First step: Classify claims using the main model
def classify_main(claim):
    features = preprocess_claim(claim, main_vectorizer)
    prediction = main_rf.predict(features)
    probability_distribution = main_rf.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

# Second step: Classify claims using the secondary model
def classify_secondary(claim):
    features = preprocess_claim(claim, secondary_vectorizer)
    prediction = secondary_rf.predict(features)
    probability_distribution = secondary_rf.predict_proba(features)
    return prediction[0], probability_distribution[0]

# Two-step classification process for a single claim
def predict_single_claim(claim):
    predicted_label, confidence, probability_distribution = classify_main(claim)
    
    if main_label_map_inverse[predicted_label] in ['Supported', 'Refuted'] and confidence >= 0.75:
        # High confidence in main model's prediction
        predicted_label_name = main_label_map_inverse[predicted_label]
        class_probabilities = {main_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}
    else:
        # Use secondary model for further classification
        predicted_label, probability_distribution = classify_secondary(claim)
        predicted_label_name = secondary_label_map_inverse[predicted_label]
        class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities

# Function to iterate through the dataset and predict labels
def predict_dataset(file_path, output_file):
    # Load the dataset
    df = pd.read_json(file_path)

    # Initialize results list
    results = []

    # Iterate through each claim in the dataset
    for index, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, class_probabilities = predict_single_claim(claim)
        result = {
            'claim': claim,
            'actual_label': actual_label,
            'predicted_label': predicted_label,
            'class_probabilities': class_probabilities
        }
        results.append(result)

    # Write the results to a new JSON file
    with open(output_file, 'w') as outfile:
        json.dump(results, outfile, indent=4)

# Example usage
input_file_path = 'my_env/data/dev.json'  # Replace with your actual file path

json_output_filename = "predicted_claims_both_models"
output_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'
predict_dataset(input_file_path, output_file_path)
print(f"Predicted labels written to {output_file_path}")


## check accuracies

import json

# Function to calculate accuracy per label
def calculate_accuracy_from_json(input_file):
    # Load the results from the JSON file
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    # Initialize counters for correct predictions and total predictions per label
    correct_predictions = {}
    total_predictions = {}
    
    # Iterate through the results and update counters
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    # Calculate accuracy per label
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] 
                          for label in total_predictions}
    
    return accuracy_per_label

# Example usage
input_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'
accuracy_per_label = calculate_accuracy_from_json(input_file_path)

# Display the results
for label, accuracy in accuracy_per_label.items():
    print(f"Accuracy for '{label}': {accuracy:.2f}")


## Both Models on Dev set V2

smote removed from both models, no other changes

In [35]:
version = '_V2'

import pandas as pd
import json
import pickle
import numpy as np

# Load the main model and preprocessing objects
with open(f'my_env/tf-idf_data_and_models/models/all_data_model{version}.pkl', 'rb') as model_file:
    main_rf = pickle.load(model_file)

with open(f'my_env/tf-idf_data_and_models/models/all_data_tfidf_vectorizer{version}.pkl', 'rb') as tfidf_file:
    main_vectorizer = pickle.load(tfidf_file)

with open(f'my_env/tf-idf_data_and_models/models/all_data_label_map{version}.pkl', 'rb') as label_map_file:
    main_label_map = pickle.load(label_map_file)

# Load the secondary model and preprocessing objects
with open(f'my_env/tf-idf_data_and_models/models/nee_cp_model{version}.pkl', 'rb') as secondary_model_file:
    secondary_rf = pickle.load(secondary_model_file)

with open(f'my_env/tf-idf_data_and_models/models/nee_cptfidf_vectorizer{version}.pkl', 'rb') as secondary_tfidf_file:
    secondary_vectorizer = pickle.load(secondary_tfidf_file)

with open(f'my_env/tf-idf_data_and_models/models/nee_cplabel_map{version}.pkl', 'rb') as secondary_label_map_file:
    secondary_label_map = pickle.load(secondary_label_map_file)

# Inverse label maps
main_label_map_inverse = {v: k for k, v in main_label_map.items()}
secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

# Preprocess claims using the respective vectorizers
def preprocess_claim(claim, vectorizer):
    return vectorizer.transform([claim]).toarray()

# First step: Classify claims using the main model
def classify_main(claim):
    features = preprocess_claim(claim, main_vectorizer)
    prediction = main_rf.predict(features)
    probability_distribution = main_rf.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

# Second step: Classify claims using the secondary model
def classify_secondary(claim):
    features = preprocess_claim(claim, secondary_vectorizer)
    prediction = secondary_rf.predict(features)
    probability_distribution = secondary_rf.predict_proba(features)
    return prediction[0], probability_distribution[0]

# Two-step classification process for a single claim
def predict_single_claim(claim):
    predicted_label, confidence, probability_distribution = classify_main(claim)
    
    if main_label_map_inverse[predicted_label] in ['Supported', 'Refuted'] and confidence >= 0.65:
        # High confidence in main model's prediction
        predicted_label_name = main_label_map_inverse[predicted_label]
        class_probabilities = {main_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}
    else:
        # Use secondary model for further classification
        predicted_label, probability_distribution = classify_secondary(claim)
        predicted_label_name = secondary_label_map_inverse[predicted_label]
        class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities

# Function to iterate through the dataset and predict labels
def predict_dataset(file_path, output_file):
    # Load the dataset
    df = pd.read_json(file_path)

    # Initialize results list
    results = []

    # Iterate through each claim in the dataset
    for index, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, class_probabilities = predict_single_claim(claim)
        result = {
            'claim': claim,
            'actual_label': actual_label,
            'predicted_label': predicted_label,
            'confidence': confidence,
            'class_probabilities': class_probabilities
        }
        results.append(result)

    # Write the results to a new JSON file
    with open(output_file, 'w') as outfile:
        json.dump(results, outfile, indent=4)

# Example usage
input_file_path = 'my_env/data/dev.json'  # Replace with your actual file path

json_output_filename = f"predicted_claims_both_models{version}"
output_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'
predict_dataset(input_file_path, output_file_path)
print(f"Predicted labels written to {output_file_path}")


## check accuracies

import json

# Function to calculate accuracy per label
def calculate_accuracy_from_json(input_file):
    # Load the results from the JSON file
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    # Initialize counters for correct predictions and total predictions per label
    correct_predictions = {}
    total_predictions = {}
    
    # Iterate through the results and update counters
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    # Calculate accuracy per label
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] 
                          for label in total_predictions}
    
    return accuracy_per_label

# Example usage
input_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'
accuracy_per_label = calculate_accuracy_from_json(input_file_path)

# Display the results
for label, accuracy in accuracy_per_label.items():
    print(f"Accuracy for '{label}': {accuracy:.2f}")


Predicted labels written to my_env/tf-idf_data_and_models/json_outputs/predicted_claims_both_models_V2.json
Accuracy for 'Refuted': 0.46
Accuracy for 'Supported': 0.11
Accuracy for 'Not Enough Evidence': 0.80
Accuracy for 'Conflicting Evidence/Cherrypicking': 0.39


## Both Models on just Not enough evidence & conflicting evidence V1
## * *insanity check* *

In [47]:
import pandas as pd
import json
import pickle
import numpy as np

# Load the main model and preprocessing objects
with open('my_env/tf-idf_data_and_models/models/all_data_model.pkl', 'rb') as model_file:
    main_rf = pickle.load(model_file)

with open('my_env/tf-idf_data_and_models/models/all_data_tfidf_vectorizer.pkl', 'rb') as tfidf_file:
    main_vectorizer = pickle.load(tfidf_file)

with open('my_env/tf-idf_data_and_models/models/all_data_label_map.pkl', 'rb') as label_map_file:
    main_label_map = pickle.load(label_map_file)

# Load the secondary model and preprocessing objects
with open('my_env/tf-idf_data_and_models/models/nee_cp_model.pkl', 'rb') as secondary_model_file:
    secondary_rf = pickle.load(secondary_model_file)

with open('my_env/tf-idf_data_and_models/models/nee_cptfidf_vectorizer.pkl', 'rb') as secondary_tfidf_file:
    secondary_vectorizer = pickle.load(secondary_tfidf_file)

with open('my_env/tf-idf_data_and_models/models/nee_cplabel_map.pkl', 'rb') as secondary_label_map_file:
    secondary_label_map = pickle.load(secondary_label_map_file)

# Inverse label maps
main_label_map_inverse = {v: k for k, v in main_label_map.items()}
secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

# Preprocess claims using the respective vectorizers
def preprocess_claim(claim, vectorizer):
    return vectorizer.transform([claim]).toarray()

# First step: Classify claims using the main model
def classify_main(claim):
    features = preprocess_claim(claim, main_vectorizer)
    prediction = main_rf.predict(features)
    probability_distribution = main_rf.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

# Second step: Classify claims using the secondary model
def classify_secondary(claim):
    features = preprocess_claim(claim, secondary_vectorizer)
    prediction = secondary_rf.predict(features)
    probability_distribution = secondary_rf.predict_proba(features)
    return prediction[0], probability_distribution[0]

# Two-step classification process for a single claim
def predict_single_claim(claim):
    predicted_label, confidence, probability_distribution = classify_main(claim)
    
    if main_label_map_inverse[predicted_label] in ['Supported', 'Refuted'] and confidence >= 0.7:
        # High confidence in main model's prediction
        predicted_label_name = main_label_map_inverse[predicted_label]
        class_probabilities = {main_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}
    else:
        # Use secondary model for further classification
        predicted_label, probability_distribution = classify_secondary(claim)
        predicted_label_name = secondary_label_map_inverse[predicted_label]
        class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities

# Function to iterate through the dataset and predict labels
def predict_dataset(file_path, output_file):
    # Load the dataset
    df = pd.read_json(file_path)

    # Initialize results list
    results = []

    # Iterate through each claim in the dataset
    for index, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, class_probabilities = predict_single_claim(claim)
        result = {
            'claim': claim,
            'actual_label': actual_label,
            'predicted_label': predicted_label,
            'class_probabilities': class_probabilities
        }
        results.append(result)

    # Write the results to a new JSON file
    with open(output_file, 'w') as outfile:
        json.dump(results, outfile, indent=4)

# Example usage
input_file_path = 'my_env/data/not_enough_info_from_dev.json'  # Replace with your actual file path

json_output_filename = "predicted_claims_both_models_just_nee_cp_claims"
output_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'
predict_dataset(input_file_path, output_file_path)
print(f"Predicted labels written to {output_file_path}")


## check accuracies

import json

# Function to calculate accuracy per label
def calculate_accuracy_from_json(input_file):
    # Load the results from the JSON file
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    # Initialize counters for correct predictions and total predictions per label
    correct_predictions = {}
    total_predictions = {}
    
    # Iterate through the results and update counters
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    # Calculate accuracy per label
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] 
                          for label in total_predictions}
    
    return accuracy_per_label

# Example usage
input_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'
accuracy_per_label = calculate_accuracy_from_json(input_file_path)

# Display the results
for label, accuracy in accuracy_per_label.items():
    print(f"Accuracy for '{label}': {accuracy:.2f}")


Predicted labels written to my_env/tf-idf_data_and_models/json_outputs/predicted_claims_both_models_just_nee_cp_claims.json
Accuracy for 'Not Enough Evidence': 0.74
Accuracy for 'Conflicting Evidence/Cherrypicking': 0.42


## Just Primary Model on dev_data V1

In [25]:
import pandas as pd
import json
import pickle
import numpy as np

# Load the main model and preprocessing objects
with open('my_env/tf-idf_data_and_models/models/all_data_model.pkl', 'rb') as model_file:
    main_rf = pickle.load(model_file)

with open('my_env/tf-idf_data_and_models/models/all_data_tfidf_vectorizer.pkl', 'rb') as tfidf_file:
    main_vectorizer = pickle.load(tfidf_file)

with open('my_env/tf-idf_data_and_models/models/all_data_label_map.pkl', 'rb') as label_map_file:
    main_label_map = pickle.load(label_map_file)

# Load the secondary model and preprocessing objects
#with open('nee_cp_model.pkl', 'rb') as secondary_model_file:
    #secondary_rf = pickle.load(secondary_model_file)

#with open('nee_cptfidf_vectorizer.pkl', 'rb') as secondary_tfidf_file:
    #secondary_vectorizer = pickle.load(secondary_tfidf_file)

#with open('nee_cplabel_map.pkl', 'rb') as secondary_label_map_file:
    #secondary_label_map = pickle.load(secondary_label_map_file)

# Inverse label maps
main_label_map_inverse = {v: k for k, v in main_label_map.items()}
#secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

# Preprocess claims using the respective vectorizers
def preprocess_claim(claim, vectorizer):
    return vectorizer.transform([claim]).toarray()

# First step: Classify claims using the main model
def classify_main(claim):
    features = preprocess_claim(claim, main_vectorizer)
    prediction = main_rf.predict(features)
    probability_distribution = main_rf.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

# Second step: Classify claims using the secondary model
#def classify_secondary(claim):
    features = preprocess_claim(claim, secondary_vectorizer)
    prediction = secondary_rf.predict(features)
    probability_distribution = secondary_rf.predict_proba(features)
    return prediction[0], probability_distribution[0]

# Two-step classification process for a single claim
def predict_single_claim(claim):
    predicted_label, confidence, probability_distribution = classify_main(claim)
    
    #main_label_map_inverse[predicted_label] in ['Supported', 'Refuted'] and confidence >= 0.7:
        # High confidence in main model's prediction
    predicted_label_name = main_label_map_inverse[predicted_label]
    class_probabilities = {main_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}
    #else:
        # Use secondary model for further classification
       # predicted_label, probability_distribution = classify_secondary(claim)
       # predicted_label_name = secondary_label_map_inverse[predicted_label]
       # class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities

# Function to iterate through the dataset and predict labels
def predict_dataset(file_path, output_file):
    # Load the dataset
    df = pd.read_json(file_path)

    # Initialize results list
    results = []

    # Iterate through each claim in the dataset
    for index, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, class_probabilities = predict_single_claim(claim)
        result = {
            'claim': claim,
            'actual_label': actual_label,
            'predicted_label': predicted_label,
            'class_probabilities': class_probabilities
        }
        results.append(result)

    # Write the results to a new JSON file
    with open(output_file, 'w') as outfile:
        json.dump(results, outfile, indent=4)

# Example usage
json_output_filename = "predicted_claims_primary_model_V1"
input_file_path = 'my_env/data/dev.json'  # Replace with your actual file path
output_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'
predict_dataset(input_file_path, output_file_path)
print(f"Predicted labels written to {output_file_path}")


## check accuracies

import json

# Function to calculate accuracy per label
def calculate_accuracy_from_json(input_file):
    # Load the results from the JSON file
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    # Initialize counters for correct predictions and total predictions per label
    correct_predictions = {}
    total_predictions = {}
    
    # Iterate through the results and update counters
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    # Calculate accuracy per label
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] 
                          for label in total_predictions}
    
    return accuracy_per_label

# Example usage
input_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'   # The JSON file from the previous step
accuracy_per_label = calculate_accuracy_from_json(input_file_path)

# Display the results
for label, accuracy in accuracy_per_label.items():
    print(f"Accuracy for '{label}': {accuracy:.2f}")


Predicted labels written to my_env/json_outputs/predicted_claims_primary_model.json
Accuracy for 'Refuted': 0.98
Accuracy for 'Supported': 0.24
Accuracy for 'Not Enough Evidence': 0.11
Accuracy for 'Conflicting Evidence/Cherrypicking': 0.00


## Just Primary Model on dev_data V2

In [15]:
import pandas as pd
import json
import pickle
import numpy as np

# Load the main model and preprocessing objects
with open('my_env/tf-idf_data_and_models/models/all_data_model_V2.pkl', 'rb') as model_file:
    main_rf = pickle.load(model_file)

with open('my_env/tf-idf_data_and_models/models/all_data_tfidf_vectorizer_V2.pkl', 'rb') as tfidf_file:
    main_vectorizer = pickle.load(tfidf_file)

with open('my_env/tf-idf_data_and_models/models/all_data_label_map_V2.pkl', 'rb') as label_map_file:
    main_label_map = pickle.load(label_map_file)

# Load the secondary model and preprocessing objects
#with open('nee_cp_model.pkl', 'rb') as secondary_model_file:
    #secondary_rf = pickle.load(secondary_model_file)

#with open('nee_cptfidf_vectorizer.pkl', 'rb') as secondary_tfidf_file:
    #secondary_vectorizer = pickle.load(secondary_tfidf_file)

#with open('nee_cplabel_map.pkl', 'rb') as secondary_label_map_file:
    #secondary_label_map = pickle.load(secondary_label_map_file)

# Inverse label maps
main_label_map_inverse = {v: k for k, v in main_label_map.items()}
#secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

# Preprocess claims using the respective vectorizers
def preprocess_claim(claim, vectorizer):
    return vectorizer.transform([claim]).toarray()

# First step: Classify claims using the main model
def classify_main(claim):
    features = preprocess_claim(claim, main_vectorizer)
    prediction = main_rf.predict(features)
    probability_distribution = main_rf.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

# Second step: Classify claims using the secondary model
#def classify_secondary(claim):
    features = preprocess_claim(claim, secondary_vectorizer)
    prediction = secondary_rf.predict(features)
    probability_distribution = secondary_rf.predict_proba(features)
    return prediction[0], probability_distribution[0]

# Two-step classification process for a single claim
def predict_single_claim(claim):
    predicted_label, confidence, probability_distribution = classify_main(claim)
    
    #main_label_map_inverse[predicted_label] in ['Supported', 'Refuted'] and confidence >= 0.7:
        # High confidence in main model's prediction
    predicted_label_name = main_label_map_inverse[predicted_label]
    class_probabilities = {main_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}
    #else:
        # Use secondary model for further classification
       # predicted_label, probability_distribution = classify_secondary(claim)
       # predicted_label_name = secondary_label_map_inverse[predicted_label]
       # class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities

# Function to iterate through the dataset and predict labels
def predict_dataset(file_path, output_file):
    # Load the dataset
    df = pd.read_json(file_path)

    # Initialize results list
    results = []

    # Iterate through each claim in the dataset
    for index, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, class_probabilities = predict_single_claim(claim)
        result = {
            'claim': claim,
            'actual_label': actual_label,
            'predicted_label': predicted_label,
            'class_probabilities': class_probabilities
        }
        results.append(result)

    # Write the results to a new JSON file
    with open(output_file, 'w') as outfile:
        json.dump(results, outfile, indent=4)

# Example usage
json_output_filename = "predicted_claims_primary_modelV2"
input_file_path = 'my_env/data/dev.json'  # Replace with your actual file path
output_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'
predict_dataset(input_file_path, output_file_path)
print(f"Predicted labels written to {output_file_path}")


## check accuracies

import json

# Function to calculate accuracy per label
def calculate_accuracy_from_json(input_file):
    # Load the results from the JSON file
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    # Initialize counters for correct predictions and total predictions per label
    correct_predictions = {}
    total_predictions = {}
    
    # Iterate through the results and update counters
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    # Calculate accuracy per label
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] 
                          for label in total_predictions}
    
    return accuracy_per_label

# Example usage
input_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'   # The JSON file from the previous step
accuracy_per_label = calculate_accuracy_from_json(input_file_path)

# Display the results
for label, accuracy in accuracy_per_label.items():
    print(f"Accuracy for '{label}': {accuracy:.2f}")


Predicted labels written to my_env/tf-idf_data_and_models/json_outputs/predicted_claims_primary_modelV2.json
Accuracy for 'Refuted': 0.98
Accuracy for 'Supported': 0.23
Accuracy for 'Not Enough Evidence': 0.09
Accuracy for 'Conflicting Evidence/Cherrypicking': 0.00


## just Secondary Model on dev_data V1

In [24]:
import pandas as pd
import json
import pickle
import numpy as np

# Load the main model and preprocessing objects
#with open('all_data_model.pkl', 'rb') as model_file:
    #main_rf = pickle.load(model_file)

#with open('all_data_tfidf_vectorizer.pkl', 'rb') as tfidf_file:
    #main_vectorizer = pickle.load(tfidf_file)

#with open('all_data_label_map.pkl', 'rb') as label_map_file:
    #main_label_map = pickle.load(label_map_file)

# Load the secondary model and preprocessing objects
with open('my_env/tf-idf_data_and_models/models/nee_cp_model.pkl', 'rb') as secondary_model_file:
    secondary_rf = pickle.load(secondary_model_file)

with open('my_env/tf-idf_data_and_models/models/nee_cptfidf_vectorizer.pkl', 'rb') as secondary_tfidf_file:
    secondary_vectorizer = pickle.load(secondary_tfidf_file)

with open('my_env/tf-idf_data_and_models/models/nee_cplabel_map.pkl', 'rb') as secondary_label_map_file:
    secondary_label_map = pickle.load(secondary_label_map_file)

# Inverse label maps
#main_label_map_inverse = {v: k for k, v in main_label_map.items()}
secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

# Preprocess claims using the respective vectorizers
def preprocess_claim(claim, vectorizer):
    return vectorizer.transform([claim]).toarray()

# First step: Classify claims using the main model
#def classify_main(claim):
    features = preprocess_claim(claim, main_vectorizer)
    prediction = main_rf.predict(features)
    probability_distribution = main_rf.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

# Second step: Classify claims using the secondary model
def classify_secondary(claim):
    features = preprocess_claim(claim, secondary_vectorizer)
    prediction = secondary_rf.predict(features)
    probability_distribution = secondary_rf.predict_proba(features)
    return prediction[0], probability_distribution[0]

# Two-step classification process for a single claim
def predict_single_claim(claim):
    #predicted_label, confidence, probability_distribution = classify_main(claim)
    # Use secondary model for further classification
    predicted_label, probability_distribution = classify_secondary(claim)
    predicted_label_name = secondary_label_map_inverse[predicted_label]
    class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities

# Function to iterate through the dataset and predict labels
def predict_dataset(file_path, output_file):
    # Load the dataset
    df = pd.read_json(file_path)

    # Initialize results list
    results = []

    # Iterate through each claim in the dataset
    for index, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, class_probabilities = predict_single_claim(claim)
        result = {
            'claim': claim,
            'actual_label': actual_label,
            'predicted_label': predicted_label,
            'class_probabilities': class_probabilities
        }
        results.append(result)

    # Write the results to a new JSON file
    with open(output_file, 'w') as outfile:
        json.dump(results, outfile, indent=4)

# Example usage
input_file_path = 'my_env/data/dev.json'  # Replace with your actual file path

json_output_filename = "predicted_claims_secondary_model"
output_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'
predict_dataset(input_file_path, output_file_path)
print(f"Predicted labels written to {output_file_path}")


## check accuracies

import json

# Function to calculate accuracy per label
def calculate_accuracy_from_json(input_file):
    # Load the results from the JSON file
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    # Initialize counters for correct predictions and total predictions per label
    correct_predictions = {}
    total_predictions = {}
    
    # Iterate through the results and update counters
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    # Calculate accuracy per label
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] 
                          for label in total_predictions}
    
    return accuracy_per_label

# Example usage
input_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'  # The JSON file from the previous step
accuracy_per_label = calculate_accuracy_from_json(input_file_path)

# Display the results
for label, accuracy in accuracy_per_label.items():
    print(f"Accuracy for '{label}': {accuracy:.2f}")


Predicted labels written to my_env/json_outputs/predicted_claims_secondary_model.json
Accuracy for 'Refuted': 0.00
Accuracy for 'Supported': 0.00
Accuracy for 'Not Enough Evidence': 0.80
Accuracy for 'Conflicting Evidence/Cherrypicking': 0.45


## just Secondary Model on dev_data V2

removed SMOTE

In [28]:
import pandas as pd
import json
import pickle
import numpy as np

# Load the main model and preprocessing objects
#with open('all_data_model.pkl', 'rb') as model_file:
    #main_rf = pickle.load(model_file)

#with open('all_data_tfidf_vectorizer.pkl', 'rb') as tfidf_file:
    #main_vectorizer = pickle.load(tfidf_file)

#with open('all_data_label_map.pkl', 'rb') as label_map_file:
    #main_label_map = pickle.load(label_map_file)

# Load the secondary model and preprocessing objects
with open('my_env/tf-idf_data_and_models/models/nee_cp_model_V2.pkl', 'rb') as secondary_model_file:
    secondary_rf = pickle.load(secondary_model_file)

with open('my_env/tf-idf_data_and_models/models/nee_cptfidf_vectorizer_V2.pkl', 'rb') as secondary_tfidf_file:
    secondary_vectorizer = pickle.load(secondary_tfidf_file)

with open('my_env/tf-idf_data_and_models/models/nee_cplabel_map_V2.pkl', 'rb') as secondary_label_map_file:
    secondary_label_map = pickle.load(secondary_label_map_file)

# Inverse label maps
#main_label_map_inverse = {v: k for k, v in main_label_map.items()}
secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

# Preprocess claims using the respective vectorizers
def preprocess_claim(claim, vectorizer):
    return vectorizer.transform([claim]).toarray()

# First step: Classify claims using the main model
#def classify_main(claim):
    features = preprocess_claim(claim, main_vectorizer)
    prediction = main_rf.predict(features)
    probability_distribution = main_rf.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

# Second step: Classify claims using the secondary model
def classify_secondary(claim):
    features = preprocess_claim(claim, secondary_vectorizer)
    prediction = secondary_rf.predict(features)
    probability_distribution = secondary_rf.predict_proba(features)
    return prediction[0], probability_distribution[0]

# Two-step classification process for a single claim
def predict_single_claim(claim):
    #predicted_label, confidence, probability_distribution = classify_main(claim)
    # Use secondary model for further classification
    predicted_label, probability_distribution = classify_secondary(claim)
    predicted_label_name = secondary_label_map_inverse[predicted_label]
    class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities

# Function to iterate through the dataset and predict labels
def predict_dataset(file_path, output_file):
    # Load the dataset
    df = pd.read_json(file_path)

    # Initialize results list
    results = []

    # Iterate through each claim in the dataset
    for index, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, class_probabilities = predict_single_claim(claim)
        result = {
            'claim': claim,
            'actual_label': actual_label,
            'predicted_label': predicted_label,
            'class_probabilities': class_probabilities
        }
        results.append(result)

    # Write the results to a new JSON file
    with open(output_file, 'w') as outfile:
        json.dump(results, outfile, indent=4)

# Example usage
input_file_path = 'my_env/data/dev.json'  # Replace with your actual file path

json_output_filename = "predicted_claims_secondary_model_V2"
output_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'
predict_dataset(input_file_path, output_file_path)
print(f"Predicted labels written to {output_file_path}")


## check accuracies

import json

# Function to calculate accuracy per label
def calculate_accuracy_from_json(input_file):
    # Load the results from the JSON file
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    # Initialize counters for correct predictions and total predictions per label
    correct_predictions = {}
    total_predictions = {}
    
    # Iterate through the results and update counters
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    # Calculate accuracy per label
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] 
                          for label in total_predictions}
    
    return accuracy_per_label

# Example usage
input_file_path = f'my_env/tf-idf_data_and_models/json_outputs/{json_output_filename}.json'  # The JSON file from the previous step
accuracy_per_label = calculate_accuracy_from_json(input_file_path)

# Display the results
for label, accuracy in accuracy_per_label.items():
    print(f"Accuracy for '{label}': {accuracy:.2f}")


Predicted labels written to my_env/tf-idf_data_and_models/json_outputs/predicted_claims_secondary_model_V2.json
Accuracy for 'Refuted': 0.00
Accuracy for 'Supported': 0.00
Accuracy for 'Not Enough Evidence': 0.86
Accuracy for 'Conflicting Evidence/Cherrypicking': 0.53


## Accuracy score based on labels in predicted_labels.json

In [17]:
import json

# Function to calculate accuracy per label
def calculate_accuracy_from_json(input_file):
    # Load the results from the JSON file
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    # Initialize counters for correct predictions and total predictions per label
    correct_predictions = {}
    total_predictions = {}
    
    # Iterate through the results and update counters
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    # Calculate accuracy per label
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] 
                          for label in total_predictions}
    
    return accuracy_per_label
filename = ''
# Example usage
input_file_path = f'{filename}.json'  # The JSON file from the previous step
accuracy_per_label = calculate_accuracy_from_json(input_file_path)

# Display the results
for label, accuracy in accuracy_per_label.items():
    print(f"Accuracy for '{label}': {accuracy:.2f}")



Accuracy for 'Refuted': 0.00
Accuracy for 'Supported': 0.00
Accuracy for 'Not Enough Evidence': 0.80
Accuracy for 'Conflicting Evidence/Cherrypicking': 0.45


## check single claim

In [34]:
version = '_V2'

import pandas as pd
import json
import pickle
import numpy as np


# Load the main model and preprocessing objects
with open(f'my_env/tf-idf_data_and_models/models/all_data_model{version}.pkl', 'rb') as model_file:
    main_rf = pickle.load(model_file)

with open(f'my_env/tf-idf_data_and_models/models/all_data_tfidf_vectorizer{version}.pkl', 'rb') as tfidf_file:
    main_vectorizer = pickle.load(tfidf_file)

with open(f'my_env/tf-idf_data_and_models/models/all_data_label_map{version}.pkl', 'rb') as label_map_file:
    main_label_map = pickle.load(label_map_file)

# Load the secondary model and preprocessing objects
with open(f'my_env/tf-idf_data_and_models/models/nee_cp_model{version}.pkl', 'rb') as secondary_model_file:
    secondary_rf = pickle.load(secondary_model_file)

with open(f'my_env/tf-idf_data_and_models/models/nee_cptfidf_vectorizer{version}.pkl', 'rb') as secondary_tfidf_file:
    secondary_vectorizer = pickle.load(secondary_tfidf_file)

with open(f'my_env/tf-idf_data_and_models/models/nee_cplabel_map{version}.pkl', 'rb') as secondary_label_map_file:
    secondary_label_map = pickle.load(secondary_label_map_file)

# Inverse label maps
main_label_map_inverse = {v: k for k, v in main_label_map.items()}
secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

# Preprocess claims using the respective vectorizers
def preprocess_claim(claim, vectorizer):
    return vectorizer.transform([claim]).toarray()

# First step: Classify claims using the main model
def classify_main(claim):
    features = preprocess_claim(claim, main_vectorizer)
    prediction = main_rf.predict(features)
    probability_distribution = main_rf.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

# Second step: Classify claims using the secondary model
def classify_secondary(claim):
    features = preprocess_claim(claim, secondary_vectorizer)
    prediction = secondary_rf.predict(features)
    probability_distribution = secondary_rf.predict_proba(features)
    return prediction[0], probability_distribution[0]

# Two-step classification process for a single claim
def predict_single_claim(claim):
    predicted_label, confidence, probability_distribution = classify_main(claim)
    
    if main_label_map_inverse[predicted_label] in ['Supported', 'Refuted'] and confidence >= 0.65:
        # High confidence in main model's prediction
        predicted_label_name = main_label_map_inverse[predicted_label]
        class_probabilities = {main_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}
    else:
        # Use secondary model for further classification
        predicted_label, probability_distribution = classify_secondary(claim)
        predicted_label_name = secondary_label_map_inverse[predicted_label]
        class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities, confidence

# Example usage
single_claim = "Donald Trump said that $15 an hour is too much for essential workers"

predicted_label, class_probabilities, confidence = predict_single_claim(single_claim)
print(f"Predicted Label: {predicted_label} | Label Confidence: {confidence}")
print(f"Class Probabilities: {class_probabilities}")


Predicted Label: Conflicting Evidence/Cherrypicking | Label Confidence: 0.5638047623544493
Class Probabilities: {'Conflicting Evidence/Cherrypicking': 0.6793978972469596, 'Not Enough Evidence': 0.32060210275304035}


## find optimal confidence level

In [8]:
version = '_V2'

import pandas as pd
import json
import pickle
import numpy as np
from sklearn.metrics import accuracy_score

# Load the main model and preprocessing objects
with open(f'my_env/tf-idf_data_and_models/models/all_data_model{version}.pkl', 'rb') as model_file:
    main_rf = pickle.load(model_file)

with open(f'my_env/tf-idf_data_and_models/models/all_data_tfidf_vectorizer{version}.pkl', 'rb') as tfidf_file:
    main_vectorizer = pickle.load(tfidf_file)

with open(f'my_env/tf-idf_data_and_models/models/all_data_label_map{version}.pkl', 'rb') as label_map_file:
    main_label_map = pickle.load(label_map_file)

# Load the secondary model and preprocessing objects
with open(f'my_env/tf-idf_data_and_models/models/nee_cp_model{version}.pkl', 'rb') as secondary_model_file:
    secondary_rf = pickle.load(secondary_model_file)

with open(f'my_env/tf-idf_data_and_models/models/nee_cptfidf_vectorizer{version}.pkl', 'rb') as secondary_tfidf_file:
    secondary_vectorizer = pickle.load(secondary_tfidf_file)

with open(f'my_env/tf-idf_data_and_models/models/nee_cplabel_map{version}.pkl', 'rb') as secondary_label_map_file:
    secondary_label_map = pickle.load(secondary_label_map_file)

# Inverse label maps
main_label_map_inverse = {v: k for k, v in main_label_map.items()}
secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

# Preprocess claims using the respective vectorizers
def preprocess_claim(claim, vectorizer):
    return vectorizer.transform([claim]).toarray()

# First step: Classify claims using the main model
def classify_main(claim):
    features = preprocess_claim(claim, main_vectorizer)
    prediction = main_rf.predict(features)
    probability_distribution = main_rf.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

# Second step: Classify claims using the secondary model
def classify_secondary(claim):
    features = preprocess_claim(claim, secondary_vectorizer)
    prediction = secondary_rf.predict(features)
    probability_distribution = secondary_rf.predict_proba(features)
    return prediction[0], probability_distribution[0]

# Two-step classification process for a single claim
def predict_single_claim(claim, confidence_threshold):
    predicted_label, confidence, probability_distribution = classify_main(claim)
    
    if main_label_map_inverse[predicted_label] in ['Supported', 'Refuted'] and confidence >= confidence_threshold:
        # High confidence in main model's prediction
        predicted_label_name = main_label_map_inverse[predicted_label]
        class_probabilities = {main_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}
    else:
        # Use secondary model for further classification
        predicted_label, probability_distribution = classify_secondary(claim)
        predicted_label_name = secondary_label_map_inverse[predicted_label]
        class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities

# Function to iterate through the dataset and predict labels
def predict_dataset(file_path, output_file, confidence_threshold):
    # Load the dataset
    df = pd.read_json(file_path)

    # Initialize results list
    results = []

    # Iterate through each claim in the dataset
    for index, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, class_probabilities = predict_single_claim(claim, confidence_threshold)
        result = {
            'claim': claim,
            'actual_label': actual_label,
            'predicted_label': predicted_label,
            'class_probabilities': class_probabilities
        }
        results.append(result)

    # Write the results to a new JSON file
    with open(output_file, 'w') as outfile:
        json.dump(results, outfile, indent=4)

# Function to calculate accuracy per label
def calculate_accuracy_from_json(input_file):
    # Load the results from the JSON file
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    # Initialize counters for correct predictions and total predictions per label
    correct_predictions = {}
    total_predictions = {}
    
    # Iterate through the results and update counters
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    # Calculate accuracy per label
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] 
                          for label in total_predictions}
    
    return accuracy_per_label

# Function to find the optimal confidence threshold
def find_optimal_threshold(input_file, output_file):
    thresholds = np.arange(0.3, 1.0, 0.05)
    best_threshold = 0.3
    best_accuracy = 0.0
    best_accuracy_per_label = {}

    for threshold in thresholds:
        predict_dataset(input_file, output_file, threshold)
        accuracy_per_label = calculate_accuracy_from_json(output_file)
        overall_accuracy = sum(accuracy_per_label.values()) / len(accuracy_per_label)
        
        print(f"Threshold: {threshold:.2f}, Overall Accuracy: {overall_accuracy:.2f}")
        for label, accuracy in accuracy_per_label.items():
            print(f"  Label: {label}, Accuracy: {accuracy:.2f}")
            
        if overall_accuracy > best_accuracy:
            best_accuracy = overall_accuracy
            print(best_accuracy)
            best_threshold = threshold
            print(best_threshold)
            best_accuracy_per_label = accuracy_per_label
    
    return best_threshold, best_accuracy, best_accuracy_per_label

# Example usage to find the optimal threshold
input_file_path = 'my_env/data/not_enough_info_from_dev.json'  # Replace with your actual file path
output_file_path = f'find_optimal_confidence{version}.json'

best_threshold, best_accuracy, best_accuracy_per_label = find_optimal_threshold(input_file_path, output_file_path)

print(f"Best Confidence Threshold: {best_threshold}")
print(f"Best Overall Accuracy: {best_accuracy:.2f}")
for label, accuracy in best_accuracy_per_label.items():
    print(f"Accuracy for '{label}': {accuracy:.2f}")


Threshold: 0.30, Overall Accuracy: 0.04
  Label: Not Enough Evidence, Accuracy: 0.09
  Label: Conflicting Evidence/Cherrypicking, Accuracy: 0.00
0.04285714285714286
0.3
Threshold: 0.35, Overall Accuracy: 0.04
  Label: Not Enough Evidence, Accuracy: 0.09
  Label: Conflicting Evidence/Cherrypicking, Accuracy: 0.00
Threshold: 0.40, Overall Accuracy: 0.11
  Label: Not Enough Evidence, Accuracy: 0.23
  Label: Conflicting Evidence/Cherrypicking, Accuracy: 0.00
0.11428571428571428
0.39999999999999997
Threshold: 0.45, Overall Accuracy: 0.19
  Label: Not Enough Evidence, Accuracy: 0.37
  Label: Conflicting Evidence/Cherrypicking, Accuracy: 0.00
0.18571428571428572
0.44999999999999996
Threshold: 0.50, Overall Accuracy: 0.25
  Label: Not Enough Evidence, Accuracy: 0.46
  Label: Conflicting Evidence/Cherrypicking, Accuracy: 0.05
0.2548872180451128
0.49999999999999994
Threshold: 0.55, Overall Accuracy: 0.41
  Label: Not Enough Evidence, Accuracy: 0.66
  Label: Conflicting Evidence/Cherrypicking, Ac