In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

print("Libraries imported successfully!")

  from pandas.core import (


Libraries imported successfully!


In [2]:
# Download NLTK resources if not already downloaded
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    print("NLTK stopwords downloaded.")

try:
    WordNetLemmatizer()
except LookupError:
    nltk.download('wordnet')
    nltk.download('omw-1.4') # Open Multilingual Wordnet
    print("NLTK wordnet and omw-1.4 downloaded.")

print("NLTK resources checked/downloaded.")

NLTK resources checked/downloaded.


Text Preprocessing Function

In [3]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower() # Lowercasing
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text) # Remove punctuation
    text = re.sub(r'\d+', '', text) # Remove numbers
    text = ' '.join([word for word in text.split() if word not in stop_words]) # Remove stopwords
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()]) # Lemmatization
    return text

print("Preprocessing function defined.")

Preprocessing function defined.


Load and Combine Datasets

In [9]:
# --- Configuration for paths ---
spam_path = 'data/spam.csv'
fraud_path = 'data/fraud_call.file' # Corrected file name!

# --- Load spam.csv ---
try:
    # Based on the screenshot: it's comma-separated, has a header,
    # and messages contain commas but are NOT quoted.
    # The most robust way for this specific structure
    # is often to read it line by line and manually split.
    
    data = []
    with open(spam_path, 'r', encoding='latin-1') as f:
        header_skipped = False
        for line_num, line in enumerate(f, 1):
            if not header_skipped: # Skip the header line (v1,v2)
                header_skipped = True
                continue

            line = line.strip()
            if not line: # Skip empty lines
                continue

            # Find the index of the first comma after the 'label' (v1)
            # This assumes 'v1' (label) does not contain commas.
            first_comma_idx = line.find(',')
            if first_comma_idx != -1:
                label = line[:first_comma_idx].strip()
                message = line[first_comma_idx+1:].strip()
                data.append({'label': label, 'message': message})
            else:
                # This could happen if a line only has a label and no message, or is just malformed
                print(f"Warning: Line {line_num} in spam.csv has no comma to split label and message: '{line}' - skipping.")
    
    spam_df = pd.DataFrame(data)
    spam_df['label'] = spam_df['label'].map({'ham': 'normal', 'spam': 'fraud'}) # Standardize labels
    print(f"Loaded spam.csv. Shape: {spam_df.shape}")
    print("Spam DataFrame Head:")
    print(spam_df.head())
except Exception as e:
    print(f"Error loading spam.csv: {e}")
    spam_df = pd.DataFrame() # Create empty DataFrame to avoid errors later


# --- Load fraud_call.file ---
try:
    # Based on screenshot: tab-separated, no header.
    # Use on_bad_lines='skip' to gracefully handle malformed lines like line 456
    # The `engine='python'` can sometimes be more lenient with line parsing as well.
    fraud_df = pd.read_csv(fraud_path, sep='\t', encoding='latin-1', header=None, names=['label', 'message'],
                           lineterminator='\n', # Ensure correct line ending detection
                           on_bad_lines='skip',  # Skip lines that don't match expected fields (e.g., line 456 with 3 fields)
                           engine='python') # Python engine is more flexible for bad lines/complex cases
    print(f"\nLoaded fraud_call.file. Shape: {fraud_df.shape}")
    print("Fraud DataFrame Head:")
    print(fraud_df.head())
except Exception as e:
    print(f"Error loading fraud_call.file: {e}")
    fraud_df = pd.DataFrame() # Create empty DataFrame to avoid errors later

# --- Combine datasets ---
if not spam_df.empty and not fraud_df.empty:
    combined_df = pd.concat([spam_df, fraud_df], ignore_index=True)
    print(f"\nCombined DataFrame Shape: {combined_df.shape}")
    print("Combined DataFrame Label Distribution:")
    print(combined_df['label'].value_counts())
elif not spam_df.empty:
    combined_df = spam_df
    print(f"\nOnly spam_df loaded. Combined DataFrame Shape: {combined_df.shape}")
elif not fraud_df.empty:
    combined_df = fraud_df
    print(f"\nOnly fraud_call.file loaded. Combined DataFrame Shape: {combined_df.shape}")
else:
    print("\nNo data loaded successfully. Check file paths and contents.")
    combined_df = pd.DataFrame()

# Display some combined data
if not combined_df.empty:
    print("\nCombined DataFrame Sample:")
    print(combined_df.sample(5))

Loaded spam.csv. Shape: (5574, 2)
Spam DataFrame Head:
    label                                            message
0  normal  "Go until jurong point, crazy.. Available only...
1  normal                   Ok lar... Joking wif u oni...,,,
2   fraud  Free entry in 2 a wkly comp to win FA Cup fina...
3  normal  U dun say so early hor... U c already then say...
4  normal  "Nah I don't think he goes to usf, he lives ar...
Error loading fraud_call.file: Custom line terminators not supported in python parser (yet)

Only spam_df loaded. Combined DataFrame Shape: (5574, 2)

Combined DataFrame Sample:
       label                                            message
4387  normal  ", im .. On the snowboarding trip. I was wonde...
3993  normal  "Dizzamn, aight I'll ask my suitemates when I ...
2249  normal                 will you like to be spoiled? :),,,
2273  normal  "Haha awesome, I've been to 4u a couple times....
2340  normal  Cheers for the message Zogtorius. IåÕve been s...


Applying Preprocessing

In [10]:
if not combined_df.empty:
    print("Applying preprocessing to messages...")
    combined_df['processed_message'] = combined_df['message'].apply(preprocess_text)
    print("Preprocessing complete!")
    print("\nCombined DataFrame with processed messages (sample):")
    print(combined_df[['message', 'processed_message', 'label']].sample(5))
else:
    print("No data to preprocess.")

Applying preprocessing to messages...
Preprocessing complete!

Combined DataFrame with processed messages (sample):
                                                message  \
2665                   He remains a bro amongst bros,,,   
3056  EASTENDERS TV Quiz. What FLOWER does DOT compa...   
2601  "As usual..iam fine, happy &amp; doing well..:...   
4141  Leave it wif me lar... ÌÏ wan to carry meh so ...   
5544        Armand says get your ass over to epsilon,,,   

                                      processed_message   label  
2665                           remains bro amongst bros  normal  
3056  eastenders tv quiz flower dot compare violet e...   fraud  
2601                       usualiam fine happy amp well  normal  
4141  leave wif lar ìï wan carry meh heavy da num fa...  normal  
5544                          armand say get as epsilon  normal  


Preparing Data for Training

In [12]:
if not combined_df.empty:
    # Drop rows with missing processed messages or labels
    combined_df = combined_df.dropna(subset=['processed_message', 'label'])

    # Features and labels
    X = combined_df['processed_message']
    y = combined_df['label']

    # Split data into training and testing sets
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"Training data size: {len(X_train)}")
    print(f"Testing data size: {len(X_test)}")
    print("Data split into training and testing sets.")
else:
    print("No data available for training. Check previous cells.")

Training data size: 4457
Testing data size: 1115
Data split into training and testing sets.


Initializing and Training TF-IDF Vectorizer

In [13]:
if 'X_train' in locals() and not X_train.empty:
    vectorizer = TfidfVectorizer(max_features=5000)

    print("Fitting TF-IDF Vectorizer on training data...")
    X_train_vectorized = vectorizer.fit_transform(X_train)
    print("TF-IDF Vectorizer fitted.")
    print(f"Shape of vectorized training data: {X_train_vectorized.shape}")

    # Transform test data
    X_test_vectorized = vectorizer.transform(X_test)
    print(f"Shape of vectorized test data: {X_test_vectorized.shape}")
else:
    print("Training data not available for vectorization. Check previous cells.")

Fitting TF-IDF Vectorizer on training data...
TF-IDF Vectorizer fitted.
Shape of vectorized training data: (4457, 5000)
Shape of vectorized test data: (1115, 5000)


Initializing and Training the Classifier (LinearSVC)

In [14]:
if 'X_train_vectorized' in locals():
    classifier = LinearSVC(dual='auto', random_state=42) # Set dual='auto' to silence FutureWarning

    print("\nTraining LinearSVC Classifier...")
    classifier.fit(X_train_vectorized, y_train)
    print("Classifier training complete!")

    # Make predictions and evaluate
    y_pred = classifier.predict(X_test_vectorized)
    print("\n--- Model Evaluation ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
else:
    print("Vectorized training data not available for classifier training. Check previous cells.")


Training LinearSVC Classifier...
Classifier training complete!

--- Model Evaluation ---
Accuracy: 0.9821
Classification Report:
               precision    recall  f1-score   support

       fraud       1.00      0.87      0.93       149
      normal       0.98      1.00      0.99       966

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



 Saving the Trained Vectorizer and Classifier

In [15]:
model_dir = 'models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
    print(f"Created directory: {model_dir}")

if 'vectorizer' in locals() and 'classifier' in locals():
    vectorizer_path = os.path.join(model_dir, 'tfidf_vectorizer.pkl')
    model_path = os.path.join(model_dir, 'text_classifier_model.pkl')

    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(classifier, model_path)
    print(f"\nModel and Vectorizer saved to '{model_dir}' directory:")
    print(f"- {vectorizer_path}")
    print(f"- {model_path}")
else:
    print("Vectorizer or Classifier not trained. Cannot save models.")

print("\n--- Training and Model Saving Complete! ---")
print("You can now use 'app.py' to load these models and run the Streamlit GUI.")

Created directory: models

Model and Vectorizer saved to 'models' directory:
- models\tfidf_vectorizer.pkl
- models\text_classifier_model.pkl

--- Training and Model Saving Complete! ---
You can now use 'app.py' to load these models and run the Streamlit GUI.
