In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.pipeline import make_pipeline


In [3]:
df = pd.read_csv('/content/Full_data.csv')
print(df.head())
print(df.columns)
print(df.isnull().sum())

                                           Statement  \
0  Yes, in 2023 Congress passed the fewest number...   
1  It’s true: The US is an outlier on paid parent...   
2  Has Wisconsin really had 12 elections since 20...   
3  Yes, many Wisconsin police agencies have two o...   
4  Did the 2020 presidential election in Wisconsi...   

                                       Justification label  \
0  Last year, the 118th Congress passed only 27 b...  true   
1  The U.S. is an outlier on paid maternity and p...  true   
2  As of 2024, Wisconsin has had a dozen close-ca...  true   
3  The Madison Police Department, the Sheboygan S...  true   
4  Godlewski is taking wards together as an avera...  true   

                                              source  \
0  \nMark Pocan, Event at University of Madison-W...   
1  \nState Rep. Lisa Subeck D-Madison, X, Feb. 16...   
2  \nBrian Schimming, GOP press call, March 6, 20...   
3  \nTammy Baldwin, Press call with the Waukesha ...   
4  \nMilwa

In [4]:
df = df.dropna()
df['label'] = df['label'].replace('pants_fire', 'half_false')

In [5]:
# Function to map labels to binary (True/False) classification
def map_to_binary(label):
    if label in ['true', 'mostly_true', 'half_true']:
        return 'True'
    else:
        return 'False'

df['binary_label'] = df['label'].apply(map_to_binary)

In [6]:
print("Binary label distribution:")
print(df['binary_label'].value_counts())
# Features (X) and labels (y) for both binary and multiclass classification
X = df['Statement']
y_multiclass = df['label']
y_binary = df['binary_label']

Binary label distribution:
binary_label
True     5781
False    5107
Name: count, dtype: int64


In [7]:
X_train_multiclass, X_test_multiclass, y_train_multiclass, y_test_multiclass = train_test_split(
    X, y_multiclass, test_size=0.2, random_state=42
)
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    X, y_binary, test_size=0.2, random_state=42
)

In [8]:
# Function to evaluate a model using cross-validation and test performance
def evaluate_model(model, X_train, y_train, X_test, y_test):
    pipeline = make_pipeline(vectorizer, model)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cross_val_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='accuracy')
    print(f"Cross-validation scores: {cross_val_scores}")
    print(f"Mean CV Accuracy: {cross_val_scores.mean()}")

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label='True')

    return accuracy, precision, recall, f1

In [9]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier()
}

In [10]:
vectorizer = TfidfVectorizer(stop_words='english')

In [11]:
def evaluate_model(model, X_train, y_train, X_test, y_test, is_multiclass=False):
    pipeline = make_pipeline(vectorizer, model)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) if not is_multiclass else KFold(n_splits=500, shuffle=True, random_state=42)
    cross_val_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='accuracy')
    print(f"Cross-validation scores: {cross_val_scores}")
    print(f"Mean CV Accuracy: {cross_val_scores.mean()}")

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    if is_multiclass:
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    else:
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label='True')

    return accuracy, precision, recall, f1

# Binary Classification

In [16]:
print("Evaluating Binary Classification Models")
for name, model in models.items():
    print(f"\n{name} - Binary Classification")
    accuracy, precision, recall, f1 = evaluate_model(model, X_train_binary, y_train_binary, X_test_binary, y_test_binary)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")

Evaluating Binary Classification Models

Logistic Regression - Binary Classification
Cross-validation scores: [0.67967853 0.67164179 0.66590126 0.68541906 0.65671642]
Mean CV Accuracy: 0.6718714121699196
Accuracy: 0.6951331496786042
Precision: 0.6966640806826998
Recall: 0.7668659265584971
F1-Score: 0.7300813008130081

Naive Bayes - Binary Classification
Cross-validation scores: [0.66877153 0.67049369 0.6641791  0.68140069 0.6435132 ]
Mean CV Accuracy: 0.6656716417910448
Accuracy: 0.679981634527089
Precision: 0.6945812807881774
Recall: 0.7224594363791631
F1-Score: 0.7082461280870658

Support Vector Machine - Binary Classification
Cross-validation scores: [0.68771527 0.67106774 0.67623421 0.69919633 0.66245695]
Mean CV Accuracy: 0.6793340987370838
Accuracy: 0.699265381083563
Precision: 0.6942771084337349
Recall: 0.7873612297181896
F1-Score: 0.7378951580632254

Random Forest - Binary Classification
Cross-validation scores: [0.66819747 0.66762342 0.65671642 0.66532721 0.65040184]
Mean CV A

# Multiclass Classification

In [None]:
print("\nEvaluating Multiclass Classification Models")
for name, model in models.items():
    if isinstance(model, LogisticRegression):
        model.set_params(max_iter=200)

    print(f"\n{name} - Multiclass Classification")
    accuracy, precision, recall, f1 = evaluate_model(model, X_train_multiclass, y_train_multiclass, X_test_multiclass, y_test_multiclass, is_multiclass=True)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")


Evaluating Multiclass Classification Models

Logistic Regression - Multiclass Classification
Cross-validation scores: [0.61111111 0.38888889 0.5        0.38888889 0.5        0.66666667
 0.38888889 0.38888889 0.44444444 0.44444444 0.27777778 0.5
 0.38888889 0.22222222 0.38888889 0.55555556 0.61111111 0.44444444
 0.38888889 0.55555556 0.61111111 0.22222222 0.55555556 0.44444444
 0.44444444 0.77777778 0.66666667 0.5        0.66666667 0.61111111
 0.33333333 0.5        0.61111111 0.38888889 0.33333333 0.38888889
 0.61111111 0.61111111 0.38888889 0.5        0.5        0.55555556
 0.33333333 0.44444444 0.55555556 0.38888889 0.27777778 0.5
 0.61111111 0.33333333 0.55555556 0.55555556 0.44444444 0.38888889
 0.38888889 0.44444444 0.5        0.38888889 0.33333333 0.33333333
 0.61111111 0.44444444 0.33333333 0.44444444 0.61111111 0.5
 0.27777778 0.44444444 0.5        0.22222222 0.61111111 0.5
 0.5        0.44444444 0.5        0.44444444 0.38888889 0.55555556
 0.5        0.55555556 0.44444444 0.55

Changing the approach as the result is getting truncated.

In [12]:
def create_model(name, params):
    """Create and configure a model based on name and parameters."""
    if name == 'LogisticRegression':
        return LogisticRegression(**params)
    elif name == 'SVC':
        return SVC(**params)
    elif name == 'RandomForestClassifier':
        return RandomForestClassifier(**params)
    elif name == 'NaiveBayes':
        return MultinomialNB(**params)
    elif name == 'KNeighborsClassifier':
        return KNeighborsClassifier(**params)
    elif name == 'DecisionTreeClassifier':
        return DecisionTreeClassifier(**params)
    else:
        raise ValueError(f"Model '{name}' not recognized.")
# Perform cross-validation and return the mean accuracy
def perform_cross_validation(model, X_train, y_train):
    """Perform cross-validation and return mean accuracy."""
    pipeline = make_pipeline(vectorizer, model)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cross_val_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='accuracy', n_jobs=-1)
    return cross_val_scores.mean()

def evaluate_model(model, X_train, y_train, X_test, y_test, is_multiclass=False):
    """Evaluate the model and return performance metrics."""
    pipeline = make_pipeline(vectorizer, model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    if is_multiclass:
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    else:
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label='True')

    return accuracy, precision, recall, f1

def main(models, X_train, y_train, X_test, y_test, is_multiclass=False):
    """Main function to evaluate all models."""
    for name, params in models.items():
        model = create_model(name, params)
        print(f"\nEvaluating {name} - Multiclass Classification")


        mean_cv_accuracy = perform_cross_validation(model, X_train, y_train)
        print(f"Mean CV Accuracy: {mean_cv_accuracy}")


        accuracy, precision, recall, f1 = evaluate_model(model, X_train, y_train, X_test, y_test, is_multiclass)
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1-Score: {f1}")

In [13]:
models = {
    'LogisticRegression': {'max_iter': 200, 'verbose': 1}
}

main(models, X_train_multiclass, y_train_multiclass, X_test_multiclass, y_test_multiclass, is_multiclass=True)


Evaluating LogisticRegression - Multiclass Classification
Mean CV Accuracy: 0.4605051664753157
Accuracy: 0.4834710743801653
Precision: 0.47878645648546886
Recall: 0.4834710743801653
F1-Score: 0.47662047138847335


In [14]:
models = {
    'SVC': {'kernel': 'linear', 'C': 1.0}
}
main(models, X_train_multiclass, y_train_multiclass, X_test_multiclass, y_test_multiclass, is_multiclass=True)


Evaluating SVC - Multiclass Classification
Mean CV Accuracy: 0.46303099885189436
Accuracy: 0.47750229568411384
Precision: 0.47595132325045264
Recall: 0.47750229568411384
F1-Score: 0.4734575633368492


In [15]:
models = {
    'RandomForestClassifier': {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
}
main(models, X_train_multiclass, y_train_multiclass, X_test_multiclass, y_test_multiclass, is_multiclass=True)


Evaluating RandomForestClassifier - Multiclass Classification
Mean CV Accuracy: 0.34362801377726754
Accuracy: 0.33746556473829203
Precision: 0.5067920087842757
Recall: 0.33746556473829203
F1-Score: 0.26978841764234873


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
models = {
    'NaiveBayes': {}
}
main(models, X_train_multiclass, y_train_multiclass, X_test_multiclass, y_test_multiclass, is_multiclass=True)


Evaluating NaiveBayes - Multiclass Classification
Mean CV Accuracy: 0.42078071182548793
Accuracy: 0.4251606978879706
Precision: 0.4797038613091633
Recall: 0.4251606978879706
F1-Score: 0.4077526916188231


In [17]:
models = {
    'KNeighborsClassifier': {'n_neighbors': 5, 'algorithm': 'auto'}
}
main(models, X_train_multiclass, y_train_multiclass, X_test_multiclass, y_test_multiclass, is_multiclass=True)


Evaluating KNeighborsClassifier - Multiclass Classification
Mean CV Accuracy: 0.3680826636050517
Accuracy: 0.36363636363636365
Precision: 0.3974913594229862
Recall: 0.36363636363636365
F1-Score: 0.3542821640800049


In [18]:
models = {
    'DecisionTreeClassifier': {'max_depth': 5, 'random_state': 42}
}
main(models, X_train_multiclass, y_train_multiclass, X_test_multiclass, y_test_multiclass, is_multiclass=True)


Evaluating DecisionTreeClassifier - Multiclass Classification
Mean CV Accuracy: 0.31216991963260615
Accuracy: 0.3076216712580349
Precision: 0.46587169480755647
Recall: 0.3076216712580349
F1-Score: 0.23473987638072363


# To deal with overfitting and underfitting

Key Points:

*  Logistic Regression: Adjusting the C parameter to control regularization, max_iter for convergence, and using solver for optimization.

*  Naive Bayes: Smoothing using the alpha parameter to handle zero probabilities.

*   SVC: Regularization controlled by C, and kernel choice (linear) to match problem complexity.
* Random Forest: Control tree depth (max_depth) and number of estimators (n_estimators) to balance complexity and prevent overfitting.


* K-Nearest Neighbors: Adjust n_neighbors to balance bias and variance.

*  Decision Tree: Prune the tree by setting max_depth to limit complexity and avoid overfitting.







In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import numpy as np

def plot_learning_curves(model, X_train, y_train, X_test, y_test):
    """
    Plot learning curves for a given model.
    """
    train_sizes, train_scores, validation_scores = learning_curve(
        model, X_train, y_train, cv=5, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy'
    )

    train_scores_mean = np.mean(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, validation_scores_mean, 'o-', color='g', label='Validation score')
    plt.xlabel('Training Set Size')
    plt.ylabel('Accuracy')
    plt.title('Learning Curves')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

model = LogisticRegression(C=1.0, max_iter=200, solver='liblinear', verbose=1)
plot_learning_curves(model, X_train_binary, y_train_binary, X_test_binary, y_test_binary)


In [None]:
from sklearn.model_selection import validation_curve

def plot_validation_curve(model, X_train, y_train, param_name, param_range):
    """
    Plot validation curve for a given hyperparameter.
    """
    train_scores, validation_scores = validation_curve(
        model, X_train, y_train, param_name=param_name, param_range=param_range,
        cv=5, n_jobs=-1, scoring='accuracy'
    )

    train_scores_mean = np.mean(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.plot(param_range, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(param_range, validation_scores_mean, 'o-', color='g', label='Validation score')
    plt.xlabel(param_name)
    plt.ylabel('Accuracy')
    plt.title('Validation Curve')
    plt.legend(loc='best')
    plt.grid()
    plt.show()
param_range = np.arange(1, 21)  # Depths from 1 to 20
model = RandomForestClassifier(n_estimators=100, random_state=42)
plot_validation_curve(model, X_train_binary, y_train_binary, param_name='max_depth', param_range=param_range)


In [None]:
# List of models to evaluate
models = {
    'Logistic Regression': LogisticRegression(C=1.0, max_iter=200, solver='liblinear', verbose=1),
    'Naive Bayes': MultinomialNB(alpha=1.0),
    'SVC': SVC(C=1.0, kernel='linear'),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5, algorithm='auto'),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42)
}

# Plot learning curves for binary classification
for name, model in models.items():
    print(f"Plotting learning curve for {name}...")
    plot_learning_curves(model, X_train_binary, y_train_binary, X_test_binary, y_test_binary)

# Plot validation curves for key hyperparameters
print("Plotting validation curves for Random Forest...")
plot_validation_curve(RandomForestClassifier(n_estimators=100, random_state=42), X_train_binary, y_train_binary, 'max_depth', np.arange(1, 21))
