### Loading the Data (LIAR Dataset)

In [16]:
# data_loader
import pandas as pd

def load_liar_dataset(path='train.tsv'):
    """
    Load a LIAR-format TSV file and attach a binary label column.

    Parameters
    ----------
    path : str, optional
        Location of the tab-separated file (no header row, 14 columns).
        Defaults to 'train.tsv' in the current working directory.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame with an extra 'label_binary' column
        (1 for true / mostly-true / half-true, 0 for barely-true / false /
        pants-fire), or None when the file does not exist.
    """
    import csv  # local import: only needed for the quoting constant below

    # Column names for the TSV file (the file itself has no header row)
    columns = ['id', 'label', 'statement', 'subject', 'speaker', 'speaker_job',
               'state_info', 'party_affiliation', 'barely_true_counts',
               'false_counts', 'half_true_counts', 'mostly_true_counts',
               'pants_on_fire_counts', 'context']

    # Six-way truthfulness label collapsed to a binary target
    label_map = {
        'true': 1, 'mostly-true': 1, 'half-true': 1,
        'barely-true': 0, 'false': 0, 'pants-fire': 0
    }

    try:
        # Statements contain literal double-quote characters. With the default
        # quoting mode an unbalanced quote makes the parser treat the following
        # tabs/newlines as part of one field, so quoting is disabled entirely.
        df = pd.read_csv(path, sep='\t', names=columns, quoting=csv.QUOTE_NONE)
    except FileNotFoundError:
        print(f"Error: Dataset file not found. Please check if {path} exists in the current directory.")
        return None

    # Convert to binary classification; labels missing from label_map become NaN
    df['label_binary'] = df['label'].map(label_map)

    unmapped = int(df['label_binary'].isna().sum())
    if unmapped:
        # Surface the problem here instead of failing later during the stratified split
        print(f"Warning: {unmapped} rows have an unrecognised label and no binary label.")

    print("Dataset Overview:")
    print("-" * 50)
    print(f"Total samples: {len(df)}")
    print("\nFirst few rows:")
    print(df[['statement', 'label', 'label_binary', 'speaker', 'context']].head())
    print("\nClass distribution (Binary):")
    print(df['label_binary'].value_counts(normalize=True))

    return df

### Preprocessing the data

In [17]:
# preprocessor
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.preprocessing import StandardScaler

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on recent NLTK
# releases (older releases use 'punkt'). Both are requested so the cell works on
# either. Without the right one, word_tokenize raises LookupError, which
# preprocess_text catches and turns into an empty string for every statement.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource, quiet=True)

# Shared NLTK resources, filled on first use. Loading the stopword list and
# creating the lemmatizer once avoids repeating that work for every statement.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop_words, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Populate only after both were created so a failure leaves the cache empty
        _TEXT_RESOURCES['stop_words'] = stop_words
        _TEXT_RESOURCES['lemmatizer'] = lemmatizer
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """
    Normalise one statement for TF-IDF vectorisation.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenize, remove English stopwords, lemmatize, and re-join
    the tokens with single spaces.

    Parameters
    ----------
    text : object
        The raw statement. Non-string values are converted with str();
        None and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text. An empty string is returned for missing input and
        when any processing step raises (the error is printed, not re-raised).
    """
    # Missing values would otherwise become the literal tokens "none" / "nan"
    if text is None or (isinstance(text, float) and text != text):
        return ""

    try:
        # Convert to lowercase
        text = str(text).lower()

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords, then lemmatize what is left
        stop_words, lemmatizer = _get_text_resources()
        return ' '.join(
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words
        )
    except Exception as e:
        # Best-effort: one bad statement must not abort the whole run
        print(f"Error in text preprocessing: {str(e)}")
        return ""

def prepare_features(df):
    """
    Build the dense feature matrix from statement text and speaker metadata.

    The input frame is modified in place: a 'processed_statement' column holding
    the cleaned text is added. The returned matrix stacks, in order, the TF-IDF
    columns, four label-encoded categorical columns, five standardised count
    columns and one credibility ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the 'statement' column plus the metadata columns named below.

    Returns
    -------
    tuple
        (all_features, tfidf) - the dense numpy feature matrix and the fitted
        TfidfVectorizer.

    Raises
    ------
    Exception
        Any error is printed and then re-raised unchanged.
    """
    categorical_columns = ['speaker', 'subject', 'party_affiliation', 'context']
    numerical_columns = [
        'barely_true_counts', 'false_counts',
        'half_true_counts', 'mostly_true_counts',
        'pants_on_fire_counts'
    ]
    tfidf_options = dict(
        max_features=5000,  # upper bound on the number of text features
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9,
        strip_accents='unicode',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
    )

    try:
        print("Processing text features...")
        # Clean every statement in a single pass
        df['processed_statement'] = [
            preprocess_text(statement) for statement in df['statement']
        ]

        tfidf = TfidfVectorizer(**tfidf_options)

        print("Vectorizing text...")
        text_features = tfidf.fit_transform(df['processed_statement'])

        print("Processing metadata features...")
        # Each categorical column gets its own integer encoding (one column each)
        encoded_features = [
            LabelEncoder().fit_transform(df[column].astype(str)).reshape(-1, 1)
            for column in categorical_columns
        ]

        # Standardise the five count columns
        numerical_features = StandardScaler().fit_transform(df[numerical_columns])

        # Credibility ratio: mostly-true count over (false + pants-on-fire + 1);
        # the +1 keeps the denominator non-zero
        negative_history = df['false_counts'] + df['pants_on_fire_counts'] + 1
        credibility_score = (
            df['mostly_true_counts'] / negative_history
        ).values.reshape(-1, 1)

        print("Combining features...")
        # The sparse TF-IDF matrix is densified before stacking
        all_features = np.hstack(
            [text_features.toarray(), *encoded_features,
             numerical_features, credibility_score]
        )

        print(f"Total features: {all_features.shape[1]}")  # text features + 10 metadata
        return all_features, tfidf

    except Exception as e:
        print(f"Error in feature preparation: {str(e)}")
        raise

### Training the model

In [18]:
# model_trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_score
import xgboost as xgb

def train_model(X, y):
    """
    Train an XGBoost classifier on a stratified 80/20 split and print metrics.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Binary labels aligned with the rows of X.

    Returns
    -------
    tuple
        (model, X_test, y_test) - the fitted classifier and the held-out split
        used for the printed evaluation.
    """
    # Split the data (fixed seed, class proportions preserved in both parts)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("Training model...")
    # `use_label_encoder` is intentionally not passed: the XGBoost release that
    # produced this notebook's output ignores it and only emits
    # 'Parameters: { "use_label_encoder" } are not used.'
    model = xgb.XGBClassifier(
        n_estimators=200,          # More trees
        learning_rate=0.05,        # Slower learning rate
        max_depth=6,               # Slightly deeper trees
        min_child_weight=2,        # More conservative splits
        subsample=0.8,             # Row subsampling per tree
        colsample_bytree=0.8,      # Column subsampling per tree
        scale_pos_weight=1,        # 1 = positive class weighted like the negative class
        eval_metric='logloss'
    )

    # The held-out split is passed only so the per-round logloss is logged;
    # no early stopping is configured, so it does not influence training.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=True
    )

    # Make predictions
    y_pred = model.predict(X_test)

    # Print evaluation metrics
    print("\nModel Evaluation:")
    print("-" * 50)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nPrecision Score (our priority metric):")
    print(precision_score(y_test, y_pred))

    # Print confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(cm)

    return model, X_test, y_test

In [19]:
# Save the model and vectorizer
import joblib

def save_model_and_vectorizer(model, vectorizer, model_path='fake_news_model.joblib', vectorizer_path='tfidf_vectorizer.joblib'):
    """
    Write the trained model and the vectorizer to disk with joblib.

    Both objects are dumped first; the confirmation messages are printed only
    after both dumps have completed.
    """
    for artifact, destination in ((model, model_path), (vectorizer, vectorizer_path)):
        joblib.dump(artifact, destination)

    print(f"Model saved to {model_path}")
    print(f"Vectorizer saved to {vectorizer_path}")

# Prediction function for new texts
def predict_news(text, model, vectorizer):
    """
    Predict whether a single statement is true or false.

    Only the text is available at prediction time, so the metadata columns the
    model was trained with are filled with zeros. These zeros are placeholders,
    not the label-encoded / standardised values produced during training, so
    the prediction rests on the text features plus a fixed metadata block.

    Parameters
    ----------
    text : str
        The raw statement.
    model : fitted classifier
        Must provide predict and predict_proba. If it exposes n_features_in_,
        that value is used to size the placeholder block.
    vectorizer : fitted vectorizer
        Must provide transform and return a sparse matrix with toarray().

    Returns
    -------
    dict
        'prediction' ('True' / 'False'), 'confidence' (largest class
        probability) and 'probabilities' ({'false', 'true'}). On any failure
        the error is printed and a dict with 'error', 'prediction': 'Error'
        and zeroed probabilities is returned instead.
    """
    try:
        # Preprocess the text
        processed_text = preprocess_text(text)

        # Transform using saved vectorizer
        text_features = vectorizer.transform([processed_text]).toarray()

        # Size the placeholder block from the model when possible so it follows
        # the training layout; fall back to the 10 metadata columns
        # (4 categorical + 5 counts + 1 credibility score) otherwise.
        expected_width = getattr(model, 'n_features_in_', None)
        if expected_width is None:
            n_metadata = 10
        else:
            n_metadata = int(expected_width) - text_features.shape[1]
            if n_metadata < 0:
                raise ValueError(
                    f"Vectorizer produces {text_features.shape[1]} features but the "
                    f"model expects only {int(expected_width)}; they do not belong together."
                )
        dummy_metadata = np.zeros((1, n_metadata))

        # Combine features
        combined_features = np.hstack([text_features, dummy_metadata])

        # Make prediction
        prediction = model.predict(combined_features)[0]
        probability = model.predict_proba(combined_features)[0]

        return {
            'prediction': 'True' if prediction == 1 else 'False',
            'confidence': float(max(probability)),
            'probabilities': {
                'false': float(probability[0]),
                'true': float(probability[1])
            }
        }
    except Exception as e:
        print(f"Error in prediction: {str(e)}")
        return {
            'error': str(e),
            'prediction': 'Error',
            'confidence': 0.0,
            'probabilities': {'false': 0.0, 'true': 0.0}
        }

# Test function
def test_prediction_pipeline(model, vectorizer):
    """
    Smoke-test predict_news on a handful of hard-coded example sentences.

    For every example either the prediction with its probabilities or the
    reported error is printed; nothing is returned.
    """
    test_texts = [
        "The company exceeded quarterly earnings expectations with a 25% revenue growth.",
        "Government officials deny any involvement in the controversial policy change.",
        "Studies show that the new policy has had mixed results across different regions.",
        "The CEO announced record profits despite market challenges.",
        "Anonymous sources claim massive layoffs are planned for next month."
    ]

    print("\nTesting prediction pipeline:")
    print("-" * 50)

    for text in test_texts:
        try:
            result = predict_news(text, model, vectorizer)

            # predict_news reports failures through an 'error' key
            if 'error' in result:
                print(f"\nError processing text: {text}")
                print(f"Error message: {result['error']}")
                continue

            print(f"\nText: {text}")
            print(f"Prediction: {result['prediction']}")
            print(f"Confidence: {result['confidence']:.2%}")
            print(f"True probability: {result['probabilities']['true']:.2%}")
            print(f"False probability: {result['probabilities']['false']:.2%}")
        except Exception as e:
            # Anything unexpected (e.g. a malformed result dict) is reported the same way
            print(f"\nError processing text: {text}")
            print(f"Error message: {str(e)}")

### Main function & saving the trained model

In [20]:
# Main execution
def main():
    """
    Run the full pipeline: load data, build features, train, save, smoke-test.

    Returns
    -------
    tuple
        (model, tfidf_vectorizer) on success; (None, None) when the dataset
        could not be loaded or any step raised (the error is printed).
    """
    try:
        print("Loading dataset...")
        df = load_liar_dataset()

        # Nothing to do without data
        if df is None:
            return None, None

        # Text plus metadata features; the target is the binary label column
        print("Preparing features...")
        X, tfidf_vectorizer = prepare_features(df)
        y = df['label_binary'].values

        print("Training model...")
        model, _, _ = train_model(X, y)

        # Persist the artifacts, then exercise the prediction path once
        save_model_and_vectorizer(model, tfidf_vectorizer)
        test_prediction_pipeline(model, tfidf_vectorizer)

        return model, tfidf_vectorizer

    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        return None, None


if __name__ == "__main__":
    model, vectorizer = main()

Loading dataset...
Dataset Overview:
--------------------------------------------------
Total samples: 10240

First few rows:
                                           statement        label  \
0  Says the Annies List political group supports ...        false   
1  When did the decline of coal start? It started...    half-true   
2  Hillary Clinton agrees with John McCain "by vo...  mostly-true   
3  Health care reform legislation is likely to ma...        false   
4  The economic turnaround started at the end of ...    half-true   

   label_binary         speaker              context  
0             0    dwayne-bohac             a mailer  
1             1  scott-surovell      a floor speech.  
2             1    barack-obama               Denver  
3             0    blog-posting       a news release  
4             1   charlie-crist  an interview on CNN  

Class distribution (Binary):
label_binary
1    0.561719
0    0.438281
Name: proportion, dtype: float64
Preparing features...
Pro

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation_0-logloss:0.67743
[1]	validation_0-logloss:0.66646
[2]	validation_0-logloss:0.65341
[3]	validation_0-logloss:0.64545
[4]	validation_0-logloss:0.63605
[5]	validation_0-logloss:0.62789
[6]	validation_0-logloss:0.62068
[7]	validation_0-logloss:0.61335
[8]	validation_0-logloss:0.60484
[9]	validation_0-logloss:0.59722
[10]	validation_0-logloss:0.59033
[11]	validation_0-logloss:0.58581
[12]	validation_0-logloss:0.57996
[13]	validation_0-logloss:0.57435
[14]	validation_0-logloss:0.57085
[15]	validation_0-logloss:0.56696
[16]	validation_0-logloss:0.56371
[17]	validation_0-logloss:0.56074
[18]	validation_0-logloss:0.55894
[19]	validation_0-logloss:0.55715
[20]	validation_0-logloss:0.55302
[21]	validation_0-logloss:0.54901
[22]	validation_0-logloss:0.54768
[23]	validation_0-logloss:0.54543
[24]	validation_0-logloss:0.54368
[25]	validation_0-logloss:0.54110
[26]	validation_0-logloss:0.53943
[27]	validation_0-logloss:0.53643
[28]	validation_0-logloss:0.53372
[29]	validation_0-loglos