In [1]:
import numpy as np
import pandas as pd
import re
# from tqdm import tqdm
# from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import f1_score, accuracy_score
import joblib


In [2]:
c_fraud_df = pd.read_csv("corrected_names.csv")
c_fraud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17590 entries, 0 to 17589
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Caller Number            17590 non-null  object
 1   Call Time                17590 non-null  object
 2   Call Duration(in s)      17590 non-null  int64 
 3   Call Frequency Per Day   17590 non-null  int64 
 4   Call Frequency Per Week  17590 non-null  int64 
 5   Call Type                17590 non-null  object
 6   Conversation             17590 non-null  object
dtypes: int64(3), object(4)
memory usage: 962.1+ KB


In [3]:
c_fraud_df.dtypes

Caller Number              object
Call Time                  object
Call Duration(in s)         int64
Call Frequency Per Day      int64
Call Frequency Per Week     int64
Call Type                  object
Conversation               object
dtype: object

In [4]:
c_fraud_df.head()

Unnamed: 0,Caller Number,Call Time,Call Duration(in s),Call Frequency Per Day,Call Frequency Per Week,Call Type,Conversation
0,+1-917-555-3286,08:50:33,62,3,25,Normal,"Hope. Since ago travelled, he has forgotten hi..."
1,+1-205-795-8028,17:05:53,78,3,31,Normal,Horrible you eat mass eat until u forgot about...
2,+1-702-412-9422,16:29:51,55,4,7,Normal,Great! I have to run now so tell!
3,+1-915-426-2998,17:36:33,48,9,14,Normal,"Sorry, I'll call later"
4,+1-915-543-9437,20:19:36,262,3,16,Normal,"May wants to work out first, how's 4 sound?"


## Splitting the data

In [5]:
from sklearn.model_selection import train_test_split

def split_data_stratified(X, y, test_size=0.2, eval_size=0.1, random_state=42):
    """Split features and labels into stratified train / eval / test partitions.

    Parameters
    ----------
    X : features, indexable by ``train_test_split``.
    y : labels aligned with ``X``; used as the stratification key.
    test_size : fraction of the full data set reserved for the test split.
    eval_size : fraction of the full data set reserved for the eval split.
    random_state : seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_eval, X_test, y_train, y_eval, y_test)``.
    """
    # First cut: separate the training data from the combined eval+test hold-out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X,
        y,
        test_size=test_size + eval_size,
        random_state=random_state,
        stratify=y,
    )

    # Second cut: divide the hold-out so the test share matches ``test_size``
    # expressed relative to the hold-out rather than to the full data set.
    X_eval, X_test, y_eval, y_test = train_test_split(
        X_holdout,
        y_holdout,
        test_size=test_size / (test_size + eval_size),
        random_state=random_state,
        stratify=y_holdout,
    )

    return X_train, X_eval, X_test, y_train, y_eval, y_test


In [6]:
# Separate the target label ('Call Type') from the feature columns.
X = c_fraud_df.drop(columns=['Call Type'])
y = c_fraud_df['Call Type']

In [7]:
# Split once, then give every partition a fresh 0..n-1 index so that rows in
# each X_* frame line up positionally with the matching y_* series.
X_train, X_eval, X_test, y_train, y_eval, y_test = (
    partition.reset_index(drop=True)
    for partition in split_data_stratified(X, y)
)

In [8]:
X_train.head()

Unnamed: 0,Caller Number,Call Time,Call Duration(in s),Call Frequency Per Day,Call Frequency Per Week,Conversation
0,+1-917-436-9480,12:02:10,1325,5,10,whiskey Brandy Sum In Beer Vodka Scotch Shampa...
1,+1-718-745-1135,12:05:16,1572,12,30,He detected a failed login attempt on your ba...
2,+1-213-329-9734,22:06:51,445,1,26,"Up to you, you want come then come for... But ..."
3,+1-628-329-7445,21:20:38,243,5,17,My Anna is breaking me out. The's looked thou ...
4,+1-417-861-6252,09:32:34,373,4,16,Key are you angry with me. Reply me dr.


In [9]:
X_train.dtypes

Caller Number              object
Call Time                  object
Call Duration(in s)         int64
Call Frequency Per Day      int64
Call Frequency Per Week     int64
Conversation               object
dtype: object

## Oversampling Data to handle data imbalance

In [10]:
a = y_train.eq('Spam').sum()
b = y_train.eq('Scam').sum()
print(a)
print(b)

1206
325


In [11]:
# Randomly over-sample the minority classes of the *training* split only
# (the eval and test splits are left untouched). The dict maps each class to
# the number of samples it should have after resampling; before resampling the
# training split holds 1206 'Spam' and 325 'Scam' rows (printed above).
# NOTE(review): the targets 1757 and 683 are hard-coded and their rationale is
# not visible here — confirm, and consider deriving them from the class counts.
rsa = RandomOverSampler( sampling_strategy={'Spam': 1757,'Scam':683},random_state=42)

# Returns the original rows plus the duplicated minority-class rows.
X_resampled, y_resampled = rsa.fit_resample(X_train, y_train)

In [12]:
X_resampled.head()

Unnamed: 0,Caller Number,Call Time,Call Duration(in s),Call Frequency Per Day,Call Frequency Per Week,Conversation
0,+1-917-436-9480,12:02:10,1325,5,10,whiskey Brandy Sum In Beer Vodka Scotch Shampa...
1,+1-718-745-1135,12:05:16,1572,12,30,He detected a failed login attempt on your ba...
2,+1-213-329-9734,22:06:51,445,1,26,"Up to you, you want come then come for... But ..."
3,+1-628-329-7445,21:20:38,243,5,17,My Anna is breaking me out. The's looked thou ...
4,+1-417-861-6252,09:32:34,373,4,16,Key are you angry with me. Reply me dr.


In [13]:
y_resampled.head()

0    Normal
1      Scam
2    Normal
3    Normal
4    Normal
Name: Call Type, dtype: object

In [14]:
X_resampled.dtypes

Caller Number              object
Call Time                  object
Call Duration(in s)         int64
Call Frequency Per Day      int64
Call Frequency Per Week     int64
Conversation               object
dtype: object

In [15]:
X_resampled['Conversation'].head()

0    whiskey Brandy Sum In Beer Vodka Scotch Shampa...
1     He detected a failed login attempt on your ba...
2    Up to you, you want come then come for... But ...
3    My Anna is breaking me out. The's looked thou ...
4              Key are you angry with me. Reply me dr.
Name: Conversation, dtype: object

In [16]:
X_resampled['Conversation'].apply(type).value_counts()


Conversation
<class 'str'>    13221
Name: count, dtype: int64

In [17]:
X_resampled['Conversation'].isnull().sum()

np.int64(0)

In [18]:
X_eval.head()

Unnamed: 0,Caller Number,Call Time,Call Duration(in s),Call Frequency Per Day,Call Frequency Per Week,Conversation
0,+1-628-877-4923,16:53:24,765,4,25,Purity of friendship between two is not about ...
1,+1-313-528-1651,16:35:33,203,4,6,I know she called me
2,+1-205-693-7634,19:20:07,258,6,10,A just telling at the incident..
3,+1-572-745-1281,21:06:05,166,7,8,He just only have science and fiction books an...
4,+1-572-277-5100,10:16:54,110,4,24,Still i have not checked it da. . .


In [19]:
X_test.head()

Unnamed: 0,Caller Number,Call Time,Call Duration(in s),Call Frequency Per Day,Call Frequency Per Week,Conversation
0,+1-572-358-7183,17:45:56,138,3,15,Life is more strict than teacher... Because Te...
1,+1-917-998-6266,08:25:24,235,7,37,"Sorry, I'll call later In meeting."
2,+1-323-414-2068,08:01:45,350,4,12,HIYA STU WOT U of 2.of of of MUCH TRUBLE of HO...
3,+1-646-804-3708,20:08:16,148,3,11,He hungry buy some food good let... But sum n ...
4,+1-323-378-4448,09:04:55,66,4,30,"Not yet chink..going to room no, i'm in bus.."


## Pipelining


Custom transformers that create the new columns decided on in the preprocessing steps, i.e. Area Code, PhNoLastDig, Hour and the binned hour (TimeOfDay).


In [20]:
class ConvertToDatetime(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses the 'Call Time' column into datetimes.

    Values are parsed with the ``%H:%M:%S`` format; entries that do not match
    are coerced to ``NaT`` instead of raising. A column that already has a
    datetime dtype is left as is. The input frame is never modified.
    """

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose 'Call Time' column is datetime-typed.

        Raises
        ------
        ValueError
            If ``X`` has no 'Call Time' column.
        """
        if 'Call Time' not in X.columns:
            raise ValueError("The input DataFrame must contain a 'Call Time' column.")

        converted = X.copy()
        call_time = converted['Call Time']
        if not pd.api.types.is_datetime64_any_dtype(call_time):
            converted['Call Time'] = pd.to_datetime(
                call_time, format='%H:%M:%S', errors='coerce'
            )
        return converted

In [21]:
class AddAreaCodeDropNumber(BaseEstimator, TransformerMixin):
    """Replace 'Caller Number' with integer 'Area Code' and 'PhNoLastDig' columns.

    Both derived columns are taken from the digits-only form of the phone
    number, counted from the right: the last seven digits become
    'PhNoLastDig' and the three digits before them become 'Area Code'.
    Working from the stripped digits (instead of fixed character offsets in
    the raw string) gives the same result for '+1-917-555-3286' style input
    while no longer depending on the exact prefix and separator layout.
    """

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the two numeric columns added.

        The 'Caller Number' column is dropped from the returned frame; the
        input frame is not modified.

        Raises
        ------
        KeyError
            If ``X`` has no 'Caller Number' column.
        ValueError
            If a number has too few digits to yield both fields.
        """
        X = X.copy()

        # Strip everything that is not a digit ('+', '-', spaces, ...).
        digits = X['Caller Number'].str.replace(r'[^0-9]', '', regex=True)

        # Slice from the right so an optional country-code prefix is ignored.
        X['Area Code'] = digits.str[-10:-7].astype(int)
        X['PhNoLastDig'] = digits.str[-7:].astype(int)

        return X.drop(columns=['Caller Number'], errors='ignore')


In [22]:
class AddHourDropTime(BaseEstimator, TransformerMixin):
    """Stateless transformer that swaps 'Call Time' for an integer 'Hour' column.

    Expects 'Call Time' to already be datetime-typed (the ``.dt`` accessor is
    used on it).
    """

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with 'Hour' added and 'Call Time' removed."""
        with_hour = X.copy()
        with_hour['Hour'] = with_hour['Call Time'].dt.hour.astype(int)
        return with_hour.drop(columns='Call Time', errors='ignore')

In [23]:
class TimeOfDayTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that bins the 'Hour' column into a 'TimeOfDay' category.

    Bins are left-closed / right-open (``right=False``). The final edge is 24
    so that hour 23 lands in the '20-23' bin; with a final edge of 23 it would
    fall outside every bin and become NaN.
    """

    # Left-closed bin edges covering every hour of the day, 0 through 23.
    _BIN_EDGES = [0, 7, 9, 12, 15, 17, 20, 24]
    _BIN_LABELS = ['0-7', '7-9', '9-12', '12-15', '15-17', '17-20', '20-23']

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a categorical 'TimeOfDay' column added.

        Raises
        ------
        ValueError
            If ``X`` has no 'Hour' column.
        """
        if 'Hour' not in X.columns:
            raise ValueError("The input DataFrame must contain an 'Hour' column.")

        # Work on a copy so the caller's frame is not mutated, matching the
        # other transformers in this notebook.
        X = X.copy()
        X['TimeOfDay'] = pd.cut(
            X['Hour'],
            bins=self._BIN_EDGES,
            labels=self._BIN_LABELS,
            right=False,
        )

        return X

In [24]:
class SqueezeTextColumn(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns a single-column DataFrame into a Series.

    A Series is passed through unchanged.
    """

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return ``X`` as a pandas Series.

        Raises
        ------
        TypeError
            If ``X`` is neither a Series nor a DataFrame.
        ValueError
            If ``X`` is a DataFrame with a column count other than one.
        """
        if isinstance(X, pd.Series):
            return X

        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input must be a pandas Series or single-column DataFrame.")

        if X.shape[1] != 1:
            raise ValueError("Input must be a single-column DataFrame for squeezing.")

        return X.squeeze(axis=1)

In [25]:
class StemmingTransformer(BaseEstimator, TransformerMixin):
    """Normalise text: keep letters only, lowercase, drop stop words, stem.

    The Porter stemmer and the English stop-word set are created once in the
    constructor, so the NLTK 'stopwords' corpus must be available before an
    instance is created.
    """

    def __init__(self):
        self.port_stem = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a pandas Series holding the stemmed form of each document.

        Parameters
        ----------
        X : pandas.Series or list of str

        Raises
        ------
        ValueError
            If ``X`` is neither a Series nor a list.
        """
        if isinstance(X, list):
            # A plain list passes validation but has no ``.apply``; wrap it so
            # both accepted input types follow the same code path.
            X = pd.Series(X, dtype=object)
        elif not isinstance(X, pd.Series):
            raise ValueError("Input must be a pandas Series or a list of strings.")

        return X.apply(self._stem_content)

    def _stem_content(self, content):
        """Clean and stem a single document, returning a space-joined string."""
        # Replace every non-letter with a space, then lowercase and tokenise
        # on whitespace.
        tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
        return ' '.join(
            self.port_stem.stem(word)
            for word in tokens
            if word not in self.stop_words
        )

In [None]:
class FitTransformPipeline(BaseEstimator, ClassifierMixin):
    """Bundle a preprocessing transformer and a classifier behind one estimator.

    Parameters
    ----------
    preprocessor : object exposing ``fit_transform`` and ``transform``.
    model : object exposing ``fit``, ``predict`` and ``predict_proba``.

    Both objects are stored as given and are fitted in place by ``fit``.
    """

    def __init__(self, preprocessor, model):
        self.preprocessor = preprocessor
        self.model = model

    def fit(self, X, y):
        """Fit the preprocessor on ``X``, then fit the model on its output."""
        X_transformed = self.preprocessor.fit_transform(X, y)
        self.model.fit(X_transformed, y)
        return self

    def predict(self, X):
        """Preprocess ``X`` with the fitted preprocessor and predict labels."""
        return self.model.predict(self.preprocessor.transform(X))

    def predict_proba(self, X):
        """Preprocess ``X`` and return the model's class probabilities."""
        return self.model.predict_proba(self.preprocessor.transform(X))

    def fit_transform(self, X, y=None):
        """Fit only the preprocessor and return the transformed features.

        The model is not fitted by this method. ``y`` is optional because it
        is simply forwarded to the preprocessor.
        """
        return self.preprocessor.fit_transform(X, y)

Pipeline Construction

In [27]:
# Derives the numeric phone-number columns and drops the raw 'Caller Number'.
AddAreaCodeCol = Pipeline([('Add_AreaCode_Column', AddAreaCodeDropNumber())])

In [28]:
# Parses 'Call Time' to datetime, then replaces it with an integer 'Hour'.
AddHourCol = Pipeline(
    [
        ('ToDateTime', ConvertToDatetime()),
        ('Add_Hour_Column', AddHourDropTime()),
    ]
)

In [29]:
# Adds the categorical 'TimeOfDay' column derived from 'Hour'.
AddBinnedTimeCol = Pipeline([('Add_time_binned', TimeOfDayTransformer())])

In [30]:
# Order matters: 'Hour' must exist before it can be binned.
time_pipeline = Pipeline(
    [
        ('hour', AddHourCol),               # 'Call Time' -> 'Hour'
        ('binned_time', AddBinnedTimeCol),  # 'Hour' -> 'TimeOfDay'
    ]
)

In [31]:
# Numeric columns: fill missing values with the column median, then standardise.
numerical_transformer = Pipeline(
    [
        ('Imputation_median', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [32]:
# Categorical columns: fill missing values with an empty string, then one-hot
# encode. handle_unknown='ignore' makes a category that was not present when
# the encoder was fitted encode as all zeros at prediction time instead of
# raising an error.
categorical_transformer = Pipeline(steps=[
    ('Text_imputation', SimpleImputer(fill_value='', strategy='constant')),
    ('OneHot', OneHotEncoder(handle_unknown='ignore'))
])

In [33]:
import nltk

# The stop-word corpus has to be present before StemmingTransformer() is
# instantiated below, because its constructor loads the English stop words.
nltk.download('stopwords')

# Text column: DataFrame -> Series -> cleaned/stemmed text -> TF-IDF matrix.
text_transformer = Pipeline(
    [
        ('squeeze', SqueezeTextColumn()),
        ('stemming', StemmingTransformer()),
        ('vectorizer', TfidfVectorizer()),
    ]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
# Column groups routed to the numeric, categorical and text sub-pipelines.
numerical_cols = [
    'Call Duration(in s)',
    'Call Frequency Per Day',
    'Call Frequency Per Week',
    'Area Code',
    'PhNoLastDig',
    'Hour',
]
categorical_cols = ['TimeOfDay']
text_cols = ['Conversation']

Main Pipeline

In [35]:
# Feature-engineering stage: adds 'Area Code' and 'PhNoLastDig' (from the
# phone number) and 'Hour' and 'TimeOfDay' (from the call time).
add_data = Pipeline(
    [
        ('AreaCode', AddAreaCodeCol),
        ('Time', time_pipeline),
    ]
)

In [36]:
# Route each column group to its own sub-pipeline; the outputs are
# concatenated into a single feature matrix.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('text', text_transformer, text_cols),
    ]
)

In [37]:
# Full data preparation: engineer the new columns, then encode and scale.
data_pipeline = Pipeline(
    [
        ('appendandremovecol', add_data),
        ('preprocessor', preprocessor),
    ]
)


In [38]:
# Seed the forest so that training (and therefore the reported scores and the
# saved model) are reproducible across runs; 42 matches the seed used for the
# data split and the over-sampler.
final_pipeline = FitTransformPipeline(
    preprocessor=data_pipeline,
    model=RandomForestClassifier(random_state=42),
)

In [39]:
X_preprocessed = data_pipeline.fit_transform(X_resampled)
X_preprocessed.shape

(13221, 5175)

In [40]:
# numerical_data = numerical_transformer.fit_transform(X_resampled[numerical_cols])
# print("Numerical data shape:", numerical_data.shape)

In [41]:
# categorical_data = categorical_transformer.fit_transform(X_resampled[categorical_cols])
# print("Categorical data shape:", categorical_data.shape)

In [42]:
set_config(display="diagram")

In [43]:
final_pipeline.fit(X_resampled,y_resampled)

In [44]:
y_pred_train = final_pipeline.predict(X_resampled)

In [45]:
f1_train = f1_score(y_resampled, y_pred_train,average='macro')
accuracy_train = accuracy_score(y_resampled, y_pred_train)

In [46]:
print(f"F1 Scores of Training Data: {f1_train:.4f}")
print(f"Accuracy Scores of Training Data: {accuracy_train:.4f}")

F1 Scores of Training Data: 1.0000
Accuracy Scores of Training Data: 1.0000


In [47]:
y_pred_eval = final_pipeline.predict(X_eval)

In [48]:
f1_eval = f1_score(y_eval, y_pred_eval,average='macro')
accuracy_eval = accuracy_score(y_eval, y_pred_eval)

In [49]:
print(f"F1 Scores of Eval data: {f1_eval:.4f}")
print(f"Accuracy Scores of Eval data: {accuracy_eval:.4f}")

F1 Scores of Eval data: 0.9624
Accuracy Scores of Eval data: 0.9892


In [50]:
X_test.head()

Unnamed: 0,Caller Number,Call Time,Call Duration(in s),Call Frequency Per Day,Call Frequency Per Week,Conversation
0,+1-572-358-7183,17:45:56,138,3,15,Life is more strict than teacher... Because Te...
1,+1-917-998-6266,08:25:24,235,7,37,"Sorry, I'll call later In meeting."
2,+1-323-414-2068,08:01:45,350,4,12,HIYA STU WOT U of 2.of of of MUCH TRUBLE of HO...
3,+1-646-804-3708,20:08:16,148,3,11,He hungry buy some food good let... But sum n ...
4,+1-323-378-4448,09:04:55,66,4,30,"Not yet chink..going to room no, i'm in bus.."


In [51]:
y_pred_test = final_pipeline.predict(X_test)

In [52]:
f1_test = f1_score(y_test, y_pred_test,average='macro')
accuracy_test = accuracy_score(y_test, y_pred_test)

In [53]:
print(f"F1 Scores of Test Data: {f1_test:.4f}")
print(f"Accuracy Scores of Test Data: {accuracy_test:.4f}")

F1 Scores of Test Data: 0.9394
Accuracy Scores of Test Data: 0.9872


In [54]:
# Build the single-row frame from a plain Python list. Passing the mixed
# string/int values through np.array would coerce every value to a string, so
# the numeric columns would no longer have the integer dtype seen in training.
features = [['+1-213-543-7293', '16:05:12', 415, 4, 20, 'Or then ü bring war... Ilya later i come n c war... May ü neve set properly ü got da help sheet if ü...']]
column_names = ['Caller Number', 'Call Time', 'Call Duration(in s)', 'Call Frequency Per Day', 'Call Frequency Per Week', 'Conversation']
final_features = pd.DataFrame(features, columns=column_names)
final_pipeline.predict(final_features)

array(['Normal'], dtype=object)

In [55]:
# Persist the whole fitted object (preprocessing steps + classifier) to disk.
# NOTE(review): the pipeline contains transformer classes defined in this
# notebook; loading the file elsewhere presumably requires those class
# definitions to be available — confirm before deploying.
joblib.dump(final_pipeline, 'model_pipeline.joblib')
print("Model saved successfully.")

Model saved successfully.


In [56]:
# Reload the pipeline saved above to check that it round-trips.
# joblib files are pickle-based: only load files from a trusted source.
loaded_pipeline = joblib.load('model_pipeline.joblib')
print("Model loaded successfully.")

Model loaded successfully.


In [57]:
# Inference input holds the feature columns only. The ground-truth label is
# deliberately not part of the frame: it is not a model input and must not be
# handed to the predictor.
features_new = pd.DataFrame(
    [['+1-213-362-2258', '18:14:35', 96, 10, 52,
      'Santa Calling! Would your little ones like a call from Santa Was eve? All 09058094583 to book your time.']],
    columns=['Caller Number', 'Call Time', 'Call Duration(in s)',
             'Call Frequency Per Day', 'Call Frequency Per Week',
             'Conversation']
)
predictions = loaded_pipeline.predict(features_new)
print("Predictions:", predictions)

Predictions: ['Spam']


In [58]:
# One hand-written call record, keyed by column name, scored with the
# reloaded pipeline.
features_1 = pd.DataFrame(
    {
        'Caller Number': ['+1-718-693-2776'],
        'Call Time': ['09:25:37'],
        'Call Duration(in s)': [1090],
        'Call Frequency Per Day': [12],
        'Call Frequency Per Week': [30],
        'Conversation': [
            "hello sir, i am from si bank, sir i have call you for your bank account purification do you want verify your account. sir then tell me your zakhar card number and pan card and your debit/credit card number."
        ],
    }
)
predictions_1 = loaded_pipeline.predict(features_1)
print("Predictions:", predictions_1)

Predictions: ['Scam']
