In [None]:
#Importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import os
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [None]:
#Loading the cleaned dataset
df = pd.read_csv(r"C:\Users\gopic\Downloads\1000 leads cleaned.csv")
df.head(3)

Unnamed: 0,Lead Name,Location,Status,Status information,status_cleaned,interest_status,business_executive
0,Raja,hyderabad,Not Converted,"14/8/prema: share me details, available in evn...",Converted,Interested,Unknown
1,Anirudh Reddy,pune,Not Converted,"14/8/prema: cal me tmrw, shared details to ema...",Converted,Interested,Unknown
2,Sapna Dewani,bangalore,Converted,16|AuG|moHan:rnr,Converted,Interested,Unknown


In [4]:
# Keep only rows with a known label and non-empty text

df = df[df['interest_status'].isin(['Interested', 'Not Interested'])]
df = df[~df['Status information'].isna()]
df.head(3)

Unnamed: 0,Lead Name,Location,Status,Status information,status_cleaned,interest_status,business_executive
0,Raja,hyderabad,Not Converted,"14/8/prema: share me details, available in evn...",Converted,Interested,Unknown
1,Anirudh Reddy,pune,Not Converted,"14/8/prema: cal me tmrw, shared details to ema...",Converted,Interested,Unknown
2,Sapna Dewani,bangalore,Converted,16|AuG|moHan:rnr,Converted,Interested,Unknown


In [5]:
#Simple text cleaning function

def clean_text(s):
    s = str(s).lower()  # Convert to lowercase
    #removing dated formats
    s = re.sub(r'\d{1,2}\/\d{1,2}\/\d{2,4}', ' ', s)
    # remove non-letters except spaces
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

df['text_cleaned'] = df['Status information'].astype(str).apply(clean_text)
df[['Status information', 'text_cleaned']].head(3)

Unnamed: 0,Status information,text_cleaned
0,"14/8/prema: share me details, available in evn...",14 8 prema share me details available in evng ...
1,"14/8/prema: cal me tmrw, shared details to ema...",14 8 prema cal me tmrw shared details to email...
2,16|AuG|moHan:rnr,16 aug mohan rnr


In [6]:
x = df['text_cleaned']
y = df['interest_status'].map({'Interested': 1, 'Not Interested': 0})

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# pipeline for TF-IDF and Logistic Regression
from sklearn.dummy import DummyClassifier


if y_train.nunique() < 2:
    print("Warning: training data contains only one class:", y_train.unique())
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', DummyClassifier(strategy='most_frequent'))
    ])
else:
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', LogisticRegression(max_iter=1000, solver='liblinear'))
    ])

pipeline.fit(x_train, y_train)
print("Model trained. Classifier:", pipeline.named_steps['clf'].__class__.__name__)
print("Training label distribution:\n", y_train.value_counts())

Model trained. Classifier: DummyClassifier
Training label distribution:
 interest_status
1    773
Name: count, dtype: int64


In [13]:
#Evaluate

y_pred = pipeline.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00       194

    accuracy                           1.00       194
   macro avg       1.00      1.00      1.00       194
weighted avg       1.00      1.00      1.00       194

Confusion Matrix:
 [[194]]


In [14]:
#save pipeline
with open('nlp_pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)
print("NLP pipeline saved successfully.")

NLP pipeline saved successfully.
