In [1]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# 2. Load your datasets (training, validation, and test data)
print("Loading datasets...")
train_data = pd.read_csv('train_data.csv')  # Load your training data here
val_data = pd.read_csv('val_data.csv')      # Load your validation data here
test_data = pd.read_csv('test_data.csv')    # Load your test data here
print("Datasets loaded.")

# 4. Define valid labels for the 'type' column (adjust as needed)
valid_labels = ['fake', 'reliable']  # Define the valid labels you expect

# 5. Filter out rows with invalid labels
print("Filtering out invalid labels...")
train_data = train_data[train_data['type'].isin(valid_labels)]
val_data = val_data[val_data['type'].isin(valid_labels)]
test_data = test_data[test_data['type'].isin(valid_labels)]
print("Filtering complete.")

# 7. Label encoding: Fit on training data and then transform on validation and test data
print("Encoding labels...")
label_encoder = LabelEncoder()

# Fit the encoder on training labels only
train_data['type'] = label_encoder.fit_transform(train_data['type'])

# Transform the validation and test labels
val_data['type'] = label_encoder.transform(val_data['type'])
test_data['type'] = label_encoder.transform(test_data['type'])
print("Label encoding complete.")

# 8. Feature Extraction: Using TfidfVectorizer
# Reduce the number of features to make it faster, also exclude stop words
print("Extracting features using TfidfVectorizer...")
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Reduce features to 1000

# 9. Build the SVM model pipeline (vectorization + SVM classifier)
print("Building SVM pipeline...")
svm_model = make_pipeline(tfidf_vectorizer, SVC(kernel='linear', random_state=42, C=1))  # Start with linear kernel and C=1 for speed
print("SVM pipeline built.")

# 10. Train the SVM model on training data
print("Training SVM model...")
svm_model.fit(train_data['content'], train_data['type'])
print("Model training complete.")

# 11. Predict on validation data
print("Making predictions on validation data...")
val_predictions = svm_model.predict(val_data['content'])
print("Predictions on validation data complete.")

# 12. Evaluate on validation data
print("Validation Classification Report:")
print(classification_report(val_data['type'], val_predictions))

# 13. Hyperparameter Tuning (Optional): Example with GridSearchCV
print("Performing hyperparameter tuning with GridSearchCV...")
parameters = {'svc__C': [0.1, 1]}  # Reduce search space for speed (no 'rbf' kernel)
grid_search = GridSearchCV(svm_model, parameters, cv=3, n_jobs=-1)  # Use multiple cores for grid search
grid_search.fit(train_data['content'], train_data['type'])
print(f"Best parameters found: {grid_search.best_params_}")

# 14. Retrain model with best parameters from GridSearchCV (optional step)
best_svm_model = grid_search.best_estimator_

# 15. Test the final model on the test data
print("Making predictions on test data...")
test_predictions = best_svm_model.predict(test_data['content'])
print("Predictions on test data complete.")

# 16. Evaluate on test data
print("Test Classification Report:")
print(classification_report(test_data['type'], test_predictions))

# 17. Confusion Matrix (optional)
print("Generating confusion matrix...")
conf_matrix = confusion_matrix(test_data['type'], test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Loading datasets...
Datasets loaded.
Filtering out invalid labels...
Filtering complete.
Encoding labels...
Label encoding complete.
Extracting features using TfidfVectorizer...
Building SVM pipeline...
SVM pipeline built.
Training SVM model...
Model training complete.
Making predictions on validation data...
Predictions on validation data complete.
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.91     10621
           1       0.95      0.96      0.96     21723

    accuracy                           0.94     32344
   macro avg       0.93      0.93      0.93     32344
weighted avg       0.94      0.94      0.94     32344

Performing hyperparameter tuning with GridSearchCV...
Best parameters found: {'svc__C': 1}
Making predictions on test data...
Predictions on test data complete.
Test Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.91

In [3]:
import joblib

# Save the best model after hyperparameter tuning
print("Saving the model...")
joblib.dump(best_svm_model, 'best_svm_model.pkl')
print("Model saved.")

# If you want to save the entire pipeline (including the vectorizer and classifier):
joblib.dump(svm_model, 'svm_pipeline.pkl')

# Save the label encoder
print("Saving label encoder...")
joblib.dump(label_encoder, 'label_encoder.pkl')
print("Label encoder saved.")

# Save classification report for validation and test
with open('validation_classification_report.txt', 'w') as f:
    f.write("Validation Classification Report:\n")
    f.write(classification_report(val_data['type'], val_predictions))

with open('test_classification_report.txt', 'w') as f:
    f.write("Test Classification Report:\n")
    f.write(classification_report(test_data['type'], test_predictions))

# Save confusion matrix as CSV
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
conf_matrix_df.to_csv('confusion_matrix.csv')




# Reload the saved model
best_svm_model = joblib.load('best_svm_model.pkl')
# Or reload the full pipeline
svm_model = joblib.load('svm_pipeline.pkl')

# Reload the label encoder
label_encoder = joblib.load('label_encoder.pkl')

# Reload classification reports (if needed)
with open('validation_classification_report.txt', 'r') as f:
    validation_report = f.read()

with open('test_classification_report.txt', 'r') as f:
    test_report = f.read()

# Reload confusion matrix
conf_matrix_df = pd.read_csv('confusion_matrix.csv', index_col=0)


print("Confusion Matrix:")
print(conf_matrix_df)

Saving the model...
Model saved.
Saving label encoder...
Label encoder saved.
Confusion Matrix:
          fake  reliable
fake      9311      1107
reliable   764     21170


In [5]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# 2. Load your datasets (training, validation, and test data)
print("Loading datasets...")
train_data = pd.read_csv('liar_train.tsv', sep='\t')  # Load your training data here
val_data = pd.read_csv('liar_valid.tsv', sep='\t')    # Load your validation data here
test_data = pd.read_csv('liar_test.tsv', sep='\t')    # Load your test data here
print("Datasets loaded.")

train_data.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5', 'source']
val_data.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5', 'source']
test_data.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5', 'source']


# 3. Define valid labels for the 'label' column (adjust as needed)
valid_labels = ['false', 'true', 'barely-true', 'mostly-true', 'half-true', 'pants-fire']  # Define the valid labels you expect

# 4. Filter out rows with invalid labels
print("Filtering out invalid labels...")
train_data = train_data[train_data['label'].isin(valid_labels)]
val_data = val_data[val_data['label'].isin(valid_labels)]
test_data = test_data[test_data['label'].isin(valid_labels)]
print("Filtering complete.")

# 5. Label encoding: Fit on training data and then transform on validation and test data
print("Encoding labels...")
label_encoder = LabelEncoder()

# Fit the encoder on training labels only
train_data['label'] = label_encoder.fit_transform(train_data['label'])

# Transform the validation and test labels
val_data['label'] = label_encoder.transform(val_data['label'])
test_data['label'] = label_encoder.transform(test_data['label'])
print("Label encoding complete.")

# 6. Feature Extraction: Using TfidfVectorizer
# Reduce the number of features to make it faster, also exclude stop words
print("Extracting features using TfidfVectorizer...")
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Reduce features to 1000

# 7. Build the SVM model pipeline (vectorization + SVM classifier)
print("Building SVM pipeline...")
svm_model = make_pipeline(tfidf_vectorizer, SVC(kernel='linear', random_state=42, C=1))  # Start with linear kernel and C=1 for speed
print("SVM pipeline built.")

# 8. Train the SVM model on training data
print("Training SVM model...")
svm_model.fit(train_data['statement'], train_data['label'])  # Use 'statement' column for training
print("Model training complete.")

# 9. Predict on validation data
print("Making predictions on validation data...")
val_predictions = svm_model.predict(val_data['statement'])  # Use 'statement' column for validation
print("Predictions on validation data complete.")

# 10. Evaluate on validation data
print("Validation Classification Report:")
print(classification_report(val_data['label'], val_predictions))

# 11. Hyperparameter Tuning (Optional): Example with GridSearchCV
print("Performing hyperparameter tuning with GridSearchCV...")
parameters = {'svc__C': [0.1, 1]}  # Reduce search space for speed (no 'rbf' kernel)
grid_search = GridSearchCV(svm_model, parameters, cv=3, n_jobs=-1)  # Use multiple cores for grid search
grid_search.fit(train_data['statement'], train_data['label'])
print(f"Best parameters found: {grid_search.best_params_}")

# 12. Retrain model with best parameters from GridSearchCV (optional step)
best_svm_model = grid_search.best_estimator_

# 13. Test the final model on the test data
print("Making predictions on test data...")
test_predictions = best_svm_model.predict(test_data['statement'])  # Use 'statement' column for testing
print("Predictions on test data complete.")

# 14. Evaluate on test data
print("Test Classification Report:")
print(classification_report(test_data['label'], test_predictions))

# 15. Confusion Matrix (optional)
print("Generating confusion matrix...")
conf_matrix = confusion_matrix(test_data['label'], test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Loading datasets...
Datasets loaded.
Filtering out invalid labels...
Filtering complete.
Encoding labels...
Label encoding complete.
Extracting features using TfidfVectorizer...
Building SVM pipeline...
SVM pipeline built.
Training SVM model...
Model training complete.
Making predictions on validation data...
Predictions on validation data complete.
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.18      0.14      0.16       236
           1       0.25      0.35      0.29       263
           2       0.24      0.29      0.26       248
           3       0.27      0.25      0.26       251
           4       0.36      0.07      0.12       116
           5       0.19      0.18      0.18       169

    accuracy                           0.23      1283
   macro avg       0.25      0.21      0.21      1283
weighted avg       0.24      0.23      0.23      1283

Performing hyperparameter tuning with GridSearchCV...
Best parameters f