## Both Models classifying

In [3]:
import pandas as pd
import json
import pickle
import numpy as np
import os

def load_models_and_vectorizers(selection):
    """
    Load the main and secondary models and preprocessing objects.
    """
    with open(f'my_env/two_stage/models_1/all_data_model_{selection}.pkl', 'rb') as model_file:
        main_model = pickle.load(model_file)

    with open(f'my_env/two_stage/models_1/all_data_tfidf_vectorizer_{selection}.pkl', 'rb') as vectorizer_file:
        main_vectorizer = pickle.load(vectorizer_file)

    with open(f'my_env/two_stage/models_1/all_data_label_map_{selection}.pkl', 'rb') as label_map_file:
        main_label_map = pickle.load(label_map_file)

    with open(f'my_env/two_stage/models_1/nee_cp_model_{selection}.pkl', 'rb') as secondary_model_file:
        secondary_model = pickle.load(secondary_model_file)

    with open(f'my_env/two_stage/models_1/nee_cptfidf_vectorizer_{selection}.pkl', 'rb') as secondary_vectorizer_file:
        secondary_vectorizer = pickle.load(secondary_vectorizer_file)

    with open(f'my_env/two_stage/models_1/nee_cplabel_map_{selection}.pkl', 'rb') as secondary_label_map_file:
        secondary_label_map = pickle.load(secondary_label_map_file)

    return main_model, main_vectorizer, main_label_map, secondary_model, secondary_vectorizer, secondary_label_map

def preprocess_claim(claim, vectorizer):
    """
    Preprocess claims using the respective vectorizers.
    """
    return vectorizer.transform([claim]).toarray()

def classify_main(claim, vectorizer, model):
    """
    Classify claims using the main model.
    """
    features = preprocess_claim(claim, vectorizer)
    prediction = model.predict(features)
    probability_distribution = model.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

def classify_secondary(claim, vectorizer, model):
    """
    Classify claims using the secondary model.
    """
    features = preprocess_claim(claim, vectorizer)
    prediction = model.predict(features)
    probability_distribution = model.predict_proba(features)
    return prediction[0], probability_distribution[0]

def predict_single_claim(claim, main_model, main_vectorizer, main_label_map_inverse, secondary_model, secondary_vectorizer, secondary_label_map_inverse):
    """
    Two-step classification process for a single claim.
    """
    predicted_label, confidence, probability_distribution = classify_main(claim, main_vectorizer, main_model)
    
    if main_label_map_inverse[predicted_label] in ['Supported', 'Refuted'] and confidence >= 0.40:
        # High confidence in main model's prediction
        predicted_label_name = main_label_map_inverse[predicted_label]
        class_probabilities = {main_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}
    else:
        # Use secondary model for further classification
        predicted_label, probability_distribution = classify_secondary(claim, secondary_vectorizer, secondary_model)
        predicted_label_name = secondary_label_map_inverse[predicted_label]
        class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities, confidence

def predict_dataset(input_file_path, output_file_path, main_model, main_vectorizer, main_label_map_inverse, secondary_model, secondary_vectorizer, secondary_label_map_inverse):
    """
    Function to iterate through the dataset and predict labels.
    """
    df = pd.read_json(input_file_path)
    results = []

    for _, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, class_probabilities, confidence = predict_single_claim(claim, main_model, main_vectorizer, main_label_map_inverse, secondary_model, secondary_vectorizer, secondary_label_map_inverse)
        result = {
            'claim': claim,
            'actual_label': actual_label,
            'predicted_label': predicted_label,
            'confidence': confidence,
            'class_probabilities': class_probabilities
        }
        results.append(result)

    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    with open(output_file_path, 'w') as outfile:
        json.dump(results, outfile, indent=4)

def calculate_accuracy_from_json(input_file):
    """
    Function to calculate accuracy per label.
    """
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    correct_predictions = {}
    total_predictions = {}
    
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] for label in total_predictions}
    return accuracy_per_label

def main():
    selection = 'randomforest' # or 'logreg' | defines which model you use
    json_output_filename = f"predicted_claims_both_models{selection}"
    input_file_path = 'my_env/two_stage/data/dev.json' # or 'my_env/two_stage/data/data_test.json' for test set
    output_file_path = f'my_env/two_stage/results/{json_output_filename}.json'

    main_model, main_vectorizer, main_label_map, secondary_model, secondary_vectorizer, secondary_label_map = load_models_and_vectorizers(selection)
    main_label_map_inverse = {v: k for k, v in main_label_map.items()}
    secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}
    predict_dataset(input_file_path, output_file_path, main_model, main_vectorizer, main_label_map_inverse, secondary_model, secondary_vectorizer, secondary_label_map_inverse)
    print(f"Predicted labels written to {output_file_path}")
    accuracy_per_label = calculate_accuracy_from_json(output_file_path)
    for label, accuracy in accuracy_per_label.items():
        print(f"Accuracy for '{label}': {accuracy:.2f}")

if __name__ == "__main__":
    main()


Predicted labels written to my_env/two_stage/results/predicted_claims_both_modelsrandomforest.json
Accuracy for 'Refuted': 0.31
Accuracy for 'Supported': 0.20
Accuracy for 'Not Enough Evidence': 0.46
Accuracy for 'Conflicting Evidence/Cherrypicking': 0.34


## Both Models on a single claim

In [1]:
import pandas as pd
import json
import pickle
import numpy as np

def load_models_and_vectorizers(selection):
    """
    Load the main and secondary models and preprocessing objects.
    """
    with open(f'my_env/two_stage/models_1/all_data_model_{selection}.pkl', 'rb') as model_file:
        main_model = pickle.load(model_file)

    with open(f'my_env/two_stage/models_1/all_data_tfidf_vectorizer_{selection}.pkl', 'rb') as vectorizer_file:
        main_vectorizer = pickle.load(vectorizer_file)

    with open(f'my_env/two_stage/models_1/all_data_label_map_{selection}.pkl', 'rb') as label_map_file:
        main_label_map = pickle.load(label_map_file)

    with open(f'my_env/two_stage/models_1/nee_cp_model_{selection}.pkl', 'rb') as secondary_model_file:
        secondary_model = pickle.load(secondary_model_file)

    with open(f'my_env/two_stage/models_1/nee_cptfidf_vectorizer_{selection}.pkl', 'rb') as secondary_vectorizer_file:
        secondary_vectorizer = pickle.load(secondary_vectorizer_file)

    with open(f'my_env/two_stage/models_1/nee_cplabel_map_{selection}.pkl', 'rb') as secondary_label_map_file:
        secondary_label_map = pickle.load(secondary_label_map_file)

    return main_model, main_vectorizer, main_label_map, secondary_model, secondary_vectorizer, secondary_label_map

def preprocess_claim(claim, vectorizer):
    """
    Preprocess claims using the respective vectorizers.
    """
    return vectorizer.transform([claim]).toarray()

def classify_main(claim, vectorizer, model):
    """
    Classify claims using the main model.
    """
    features = preprocess_claim(claim, vectorizer)
    prediction = model.predict(features)
    probability_distribution = model.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

def classify_secondary(claim, vectorizer, model):
    """
    Classify claims using the secondary model.
    """
    features = preprocess_claim(claim, vectorizer)
    prediction = model.predict(features)
    probability_distribution = model.predict_proba(features)
    return prediction[0], probability_distribution[0]

def predict_single_claim(claim, main_model, main_vectorizer, main_label_map_inverse, secondary_model, secondary_vectorizer, secondary_label_map_inverse):
    """
    Two-step classification process for a single claim.
    """
    predicted_label, confidence, probability_distribution = classify_main(claim, main_vectorizer, main_model)
    
    if main_label_map_inverse[predicted_label] in ['Supported', 'Refuted'] and confidence >= 0.60:
        # High confidence in main model's prediction
        predicted_label_name = main_label_map_inverse[predicted_label]
        class_probabilities = {main_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}
    else:
        # Use secondary model for further classification
        predicted_label, probability_distribution = classify_secondary(claim, secondary_vectorizer, secondary_model)
        predicted_label_name = secondary_label_map_inverse[predicted_label]
        class_probabilities = {secondary_label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities, confidence

def main():

    single_claim = "Donald Trump said that $15 an hour is too much for essential workers"

    selection = 'randomforest'  # or 'logreg' | defines which model you use

    main_model, main_vectorizer, main_label_map, secondary_model, secondary_vectorizer, secondary_label_map = load_models_and_vectorizers(selection)
    main_label_map_inverse = {v: k for k, v in main_label_map.items()}
    secondary_label_map_inverse = {v: k for k, v in secondary_label_map.items()}

    
    predicted_label, class_probabilities, confidence = predict_single_claim(single_claim, main_model, main_vectorizer, main_label_map_inverse, secondary_model, secondary_vectorizer, secondary_label_map_inverse)
    
    print(f"Predicted Label: {predicted_label} | Label Confidence: {confidence}")
    print(f"Class Probabilities: {class_probabilities}")

if __name__ == "__main__":
    main()


Predicted Label: Conflicting Evidence/Cherrypicking | Label Confidence: 0.40283489805794537
Class Probabilities: {'Conflicting Evidence/Cherrypicking': 0.6008023586993227, 'Not Enough Evidence': 0.3991976413006784}


## Only Secondary Model classifying

might be useful if claims are narrowed down enough from Support / Refute

In [4]:
import pandas as pd
import json
import pickle
import numpy as np
import os

def load_model_and_vectorizer(selection):
    """
    Load the secondary model and preprocessing objects.
    """
    with open(f'my_env/two_stage/models_1/nee_cp_model_{selection}.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    with open(f'my_env/two_stage/models_1/nee_cptfidf_vectorizer_{selection}.pkl', 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)

    with open(f'my_env/two_stage/models_1/nee_cplabel_map_{selection}.pkl', 'rb') as label_map_file:
        label_map = pickle.load(label_map_file)

    return model, vectorizer, label_map

def preprocess_claim(claim, vectorizer):
    """
    Preprocess claims using the respective vectorizers.
    """
    return vectorizer.transform([claim]).toarray()

def classify_secondary(claim, vectorizer, model):
    """
    Classify claims using the secondary model.
    """
    features = preprocess_claim(claim, vectorizer)
    prediction = model.predict(features)
    probability_distribution = model.predict_proba(features)
    confidence = np.max(probability_distribution)
    return prediction[0], confidence, probability_distribution[0]

def predict_single_claim(claim, vectorizer, model, label_map_inverse):
    """
    Two-step classification process for a single claim.
    """
    predicted_label, confidence, probability_distribution = classify_secondary(claim, vectorizer, model)
    predicted_label_name = label_map_inverse[predicted_label]
    class_probabilities = {label_map_inverse[i]: prob for i, prob in enumerate(probability_distribution)}

    return predicted_label_name, class_probabilities, confidence

def predict_dataset(input_file_path, output_file_path, vectorizer, model, label_map_inverse):
    """
    Function to iterate through the dataset and predict labels.
    """
    df = pd.read_json(input_file_path)
    results = []

    for _, row in df.iterrows():
        claim = row['claim']
        actual_label = row['label']
        predicted_label, class_probabilities, confidence = predict_single_claim(claim, vectorizer, model, label_map_inverse)
        result = {
            'claim': claim,
            'actual_label': actual_label,
            'predicted_label': predicted_label,
            'confidence': confidence,
            'class_probabilities': class_probabilities
        }
        results.append(result)
    
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    with open(output_file_path, 'w') as outfile:
        json.dump(results, outfile, indent=4)

def calculate_accuracy_from_json(input_file):
    """
    Function to calculate accuracy per label.
    """
    with open(input_file, 'r') as file:
        results = json.load(file)
    
    correct_predictions = {}
    total_predictions = {}
    
    for result in results:
        actual_label = result['actual_label']
        predicted_label = result['predicted_label']
        
        if actual_label not in correct_predictions:
            correct_predictions[actual_label] = 0
            total_predictions[actual_label] = 0
            
        if predicted_label == actual_label:
            correct_predictions[actual_label] += 1
        
        total_predictions[actual_label] += 1
    
    accuracy_per_label = {label: correct_predictions[label] / total_predictions[label] for label in total_predictions}
    return accuracy_per_label

def main():
    selection = 'randomforest' # or 'logreg' for faster
    json_output_filename = f"secondary_model_predictions_on_dev{selection}"
    input_file_path = 'my_env/two_stage/data/dev.json' # or 'my_env/two_stage/data/data_test.json' for test set
    output_file_path = f'my_env/two_stage/results/{json_output_filename}.json'
    
    model, vectorizer, label_map = load_model_and_vectorizer(selection)
    label_map_inverse = {v: k for k, v in label_map.items()}
    predict_dataset(input_file_path, output_file_path, vectorizer, model, label_map_inverse)
    print(f"Predicted labels written to {output_file_path}")
    accuracy_per_label = calculate_accuracy_from_json(output_file_path)
    for label, accuracy in accuracy_per_label.items():
        print(f"Accuracy for '{label}': {accuracy:.2f}")

if __name__ == "__main__":
    main()


Predicted labels written to my_env/two_stage/results/secondary_model_predictions_on_devrandomforest.json
Accuracy for 'Refuted': 0.00
Accuracy for 'Supported': 0.00
Accuracy for 'Not Enough Evidence': 0.60
Accuracy for 'Conflicting Evidence/Cherrypicking': 0.50
