In [11]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib

# Step 2: Define the preprocess_text Function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = ' '.join(text.split())
    return text

# Step 3: Load Dataset
file_path = "C:/Users/ABHINAV RANA/Downloads/data.csv"   # <-- put correct path here
df = pd.read_csv(file_path)

# Step 4: Preprocess Data
# Clean column names
df.columns = df.columns.str.strip().str.replace('\ufeff', '')

# Rename 'Transcript' to 'transcript' if necessary
if 'Transcript' in df.columns:
    df.rename(columns={'Transcript': 'transcript'}, inplace=True)

# Clean 'label' column
df['label'] = pd.to_numeric(df['label'], errors='coerce')
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

# Preprocess text
df['cleaned_text'] = df['transcript'].apply(preprocess_text)

# Step 5: Feature Extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['cleaned_text'])  # **fit here**

# Target variable
y = df['label']

# Step 6: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 8: Evaluate Model
y_pred = model.predict(X_test)
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Step 9: Save Model and Vectorizer (Important)
joblib.dump(model, "C:/Users/ABHINAV RANA/Downloads/savedlogistic_regression_model.pkl")
joblib.dump(vectorizer, "C:/Users/ABHINAV RANA/Downloads/savedtfidf_vectorizer.pkl")

print("\n✅ Model and Vectorizer saved successfully!")



Model Evaluation:
Accuracy: 0.9378068739770867
Precision: 0.9353233830845771
Recall: 0.749003984063745
F1-Score: 0.831858407079646
Confusion Matrix:
 [[958  13]
 [ 63 188]]

✅ Model and Vectorizer saved successfully!


In [14]:
# Step 1: Import libraries
import speech_recognition as sr
import joblib
import re
import string

# Load the trained model and vectorizer
model = joblib.load("C:/Users/ABHINAV RANA/Downloads/savedlogistic_regression_model.pkl")
vectorizer = joblib.load("C:/Users/ABHINAV RANA/Downloads/savedtfidf_vectorizer.pkl")


# Step 3: Define the preprocessing function
def preprocess_text(text):
    """
    Preprocesses the text: lowercase, remove punctuation, remove extra whitespace.
    """
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = ' '.join(text.split())
    return text

# Step 4: Define function to recognize speech from microphone
def recognize_speech_from_mic():
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("\nListening... Speak now!")
        recognizer.adjust_for_ambient_noise(source)
        try:
            audio = recognizer.listen(source, timeout=5)
            text = recognizer.recognize_google(audio)
            print("Recognized Text:", text)
            return text
        except sr.UnknownValueError:
            print("Could not understand the audio.")
            return None
        except sr.RequestError:
            print("Speech Recognition service is unavailable.")
            return None

# Step 5: Define function to recognize speech from audio file
def recognize_speech_from_audio_file(file_path):
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(file_path) as source:
            print("\nProcessing audio file...")
            audio = recognizer.record(source)
            text = recognizer.recognize_google(audio)
            print("Recognized Text:", text)
            return text
    except Exception as e:
        print("Error processing audio file:", str(e))
        return None

# Step 6: Define function to classify the recognized speech text
def classify_speech_text(text):
    if text:
        cleaned_text = preprocess_text(text)
        text_vector = vectorizer.transform([cleaned_text])  # Use fitted vectorizer
        prediction = model.predict(text_vector)[0]
        
        if prediction == 1:
            print("\n🔴 Fraudulent Call Detected!")
        else:
            print("\n🟢 Genuine Call.")
    else:
        print("\nNo valid text recognized.")

# Step 7: User input to choose between mic or audio file
print("\nChoose Input Method:")
print("1 - Speak into Microphone")
print("2 - Provide an Audio File")
choice = input("Enter choice (1/2): ").strip()

if choice == "1":
    spoken_text = recognize_speech_from_mic()
    classify_speech_text(spoken_text)

elif choice == "2":
    file_path = input("Enter the path to the audio file: ").strip()
    spoken_text = recognize_speech_from_audio_file(file_path)
    classify_speech_text(spoken_text)

else:
    print("Invalid choice! Please select either 1 or 2.")



Choose Input Method:
1 - Speak into Microphone
2 - Provide an Audio File


Enter choice (1/2):  2
Enter the path to the audio file:  C:/Users/ABHINAV RANA/Downloads/NewRecording2.wav



Processing audio file...
Recognized Text: hello I am calling from SBI Bank the KYC of your bank hasn't been completed so your credit card will be blocked here to urgently update the KYC of a credit card

🔴 Fraudulent Call Detected!
