In [442]:
import numpy as np
import pandas as pd
import re
# from tqdm import tqdm
# from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer


In [443]:
# Load the call records from CSV and print a summary of columns, non-null
# counts and dtypes.
c_fraud_df = pd.read_csv("corrected_names.csv")
c_fraud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17590 entries, 0 to 17589
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Caller Number            17590 non-null  object
 1   Call Time                17590 non-null  object
 2   Call Duration(in s)      17590 non-null  int64 
 3   Call Frequency Per Day   17590 non-null  int64 
 4   Call Frequency Per Week  17590 non-null  int64 
 5   Call Type                17590 non-null  object
 6   Conversation             17590 non-null  object
dtypes: int64(3), object(4)
memory usage: 962.1+ KB


In [444]:
# Parse 'Call Time' from HH:MM:SS strings into datetime64. The format carries
# no date, so every value gets the placeholder date 1900-01-01 (see the
# head() output of the next cell); only the time-of-day part is meaningful.
c_fraud_df['Call Time'] = pd.to_datetime(c_fraud_df['Call Time'], format='%H:%M:%S')
c_fraud_df.dtypes

Caller Number                      object
Call Time                  datetime64[ns]
Call Duration(in s)                 int64
Call Frequency Per Day              int64
Call Frequency Per Week             int64
Call Type                          object
Conversation                       object
dtype: object

In [445]:
# Preview the first rows after the 'Call Time' conversion.
c_fraud_df.head()

Unnamed: 0,Caller Number,Call Time,Call Duration(in s),Call Frequency Per Day,Call Frequency Per Week,Call Type,Conversation
0,+1-917-555-3286,1900-01-01 08:50:33,62,3,25,Normal,"Hope. Since ago travelled, he has forgotten hi..."
1,+1-205-795-8028,1900-01-01 17:05:53,78,3,31,Normal,Horrible you eat mass eat until u forgot about...
2,+1-702-412-9422,1900-01-01 16:29:51,55,4,7,Normal,Great! I have to run now so tell!
3,+1-915-426-2998,1900-01-01 17:36:33,48,9,14,Normal,"Sorry, I'll call later"
4,+1-915-543-9437,1900-01-01 20:19:36,262,3,16,Normal,"May wants to work out first, how's 4 sound?"


## Splitting the data

In [446]:
from sklearn.model_selection import train_test_split

def split_data_stratified(X, y, test_size=0.2, eval_size=0.1, random_state=42):
    """Split features and labels into stratified train / eval / test parts.

    The data is first divided into a training part and a hold-out part whose
    relative size is ``test_size + eval_size``. The hold-out part is then
    divided again so that eval and test each receive their requested share of
    the full dataset. Both splits stratify on the labels and use the same
    ``random_state``.

    Parameters
    ----------
    X : features to split.
    y : labels aligned with ``X``; also used as the stratification key.
    test_size : float, fraction of all rows assigned to the test part.
    eval_size : float, fraction of all rows assigned to the eval part.
    random_state : int, seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_eval, X_test, y_train, y_eval, y_test)``
    """
    holdout_fraction = test_size + eval_size

    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X,
        y,
        test_size=holdout_fraction,
        random_state=random_state,
        stratify=y,
    )

    # Inside the hold-out part, the test share is test_size / (test_size + eval_size).
    X_eval, X_test, y_eval, y_test = train_test_split(
        X_holdout,
        y_holdout,
        test_size=test_size / holdout_fraction,
        random_state=random_state,
        stratify=y_holdout,
    )

    return X_train, X_eval, X_test, y_train, y_eval, y_test


In [447]:
# Separate the target column ('Call Type') from the feature columns.
# `columns=` already identifies what to drop, so `axis=1` is redundant here.
y = c_fraud_df['Call Type']
X = c_fraud_df.drop(columns='Call Type', axis=1)

In [448]:
X_train, X_eval, X_test, y_train, y_eval, y_test = split_data_stratified(X, y)

# train_test_split keeps the (shuffled) original row labels. Give every
# partition -- including the eval part -- a fresh 0..n-1 index so features and
# labels of each partition line up positionally and consistently.
X_train = X_train.reset_index(drop=True)
X_eval = X_eval.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

y_train = y_train.reset_index(drop=True)
y_eval = y_eval.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [449]:
# Preview the training features (the 'Call Type' target is no longer a column).
X_train.head()

Unnamed: 0,Caller Number,Call Time,Call Duration(in s),Call Frequency Per Day,Call Frequency Per Week,Conversation
0,+1-917-436-9480,1900-01-01 12:02:10,1325,5,10,whiskey Brandy Sum In Beer Vodka Scotch Shampa...
1,+1-718-745-1135,1900-01-01 12:05:16,1572,12,30,He detected a failed login attempt on your ba...
2,+1-213-329-9734,1900-01-01 22:06:51,445,1,26,"Up to you, you want come then come for... But ..."
3,+1-628-329-7445,1900-01-01 21:20:38,243,5,17,My Anna is breaking me out. The's looked thou ...
4,+1-417-861-6252,1900-01-01 09:32:34,373,4,16,Key are you angry with me. Reply me dr.


In [450]:
# Check the column dtypes of the training features.
X_train.dtypes

Caller Number                      object
Call Time                  datetime64[ns]
Call Duration(in s)                 int64
Call Frequency Per Day              int64
Call Frequency Per Week             int64
Conversation                       object
dtype: object

## Oversampling Data to handle data imbalance

In [451]:
# Count the 'Spam' (a) and 'Scam' (b) rows in the training labels.
a = y_train.eq('Spam').sum()
b = y_train.eq('Scam').sum()
print(a)
print(b)

1206
325


In [452]:
# Randomly duplicate 'Spam' and 'Scam' rows up to the fixed target counts in
# sampling_strategy (with the training counts printed in the previous cell:
# 'Spam' 1206 -> 1757, 'Scam' 325 -> 683). random_state fixes which rows are
# duplicated.
rsa = RandomOverSampler( sampling_strategy={'Spam': 1757,'Scam':683},random_state=42)

# Only the training partition is resampled.
X_resampled, y_resampled = rsa.fit_resample(X_train, y_train)

In [453]:
# Preview the oversampled training features.
X_resampled.head()

Unnamed: 0,Caller Number,Call Time,Call Duration(in s),Call Frequency Per Day,Call Frequency Per Week,Conversation
0,+1-917-436-9480,1900-01-01 12:02:10,1325,5,10,whiskey Brandy Sum In Beer Vodka Scotch Shampa...
1,+1-718-745-1135,1900-01-01 12:05:16,1572,12,30,He detected a failed login attempt on your ba...
2,+1-213-329-9734,1900-01-01 22:06:51,445,1,26,"Up to you, you want come then come for... But ..."
3,+1-628-329-7445,1900-01-01 21:20:38,243,5,17,My Anna is breaking me out. The's looked thou ...
4,+1-417-861-6252,1900-01-01 09:32:34,373,4,16,Key are you angry with me. Reply me dr.


In [454]:
# Preview the oversampled training labels.
y_resampled.head()

0    Normal
1      Scam
2    Normal
3    Normal
4    Normal
Name: Call Type, dtype: object

## Pipelining


Transformers that create the new columns decided on in the preprocessing steps, i.e. Area Code, Hour, and binned Hour values


In [455]:
class AddAreaCodeDropNumber(BaseEstimator, TransformerMixin):
    """Replace the 'Caller Number' column with two integer features.

    Added columns
    -------------
    'Area Code'   : the three digits immediately before the last seven digits
                    of the number.
    'PhNoLastDig' : the last seven digits of the number.

    Both are derived from the digit-only form of the number, so the result
    does not depend on the exact prefix or separators used
    (e.g. '+1-917-555-3286' and '917-555-3286' both give 917 / 5553286).

    Raises
    ------
    ValueError
        From the integer conversion, if a number has too few digits to
        provide an area code or is missing.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the two features added and
        'Caller Number' removed."""
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()

        # Keep digits only, e.g. '+1-917-555-3286' -> '19175553286'.
        digits = X['Caller Number'].str.replace(r'[^0-9]', '', regex=True)

        # Slice relative to the END of the digit string rather than at fixed
        # character offsets of the raw text, which only worked for numbers
        # starting with exactly '+1-'.
        X['Area Code'] = digits.str[-10:-7].astype(int)
        X['PhNoLastDig'] = digits.str[-7:].astype(int)

        return X.drop(columns=['Caller Number'])


In [456]:
class AddHourDropTime(BaseEstimator, TransformerMixin):
    """Derive an integer 'Hour' feature from 'Call Time', then drop 'Call Time'."""

    def fit(self, X, y=None):
        """No fitting is required; the transformer is returned unchanged."""
        return self

    def transform(self, X):
        """Return a new DataFrame with 'Hour' added and 'Call Time' removed.

        'Call Time' must be a datetime-like column (the ``.dt`` accessor is used).
        The input frame itself is not modified.
        """
        hour_of_call = X['Call Time'].dt.hour.astype(int)
        with_hour = X.assign(Hour=hour_of_call)
        return with_hour.drop(columns='Call Time')

In [457]:
class TimeOfDayTransformer(BaseEstimator, TransformerMixin):
    """Add a categorical 'TimeOfDay' column that bins the 'Hour' column.

    Bins are left-closed / right-open: [0, 7), [7, 9), [9, 12), [12, 15),
    [15, 17), [17, 20) and [20, 24). The labels are unchanged from the
    original implementation; the last bin, labelled '20-23', covers hours
    20 through 23 inclusive.
    """

    # The final edge is 24, not 23: with right=False an upper edge of 23
    # excludes hour 23 itself, which pd.cut would then turn into NaN.
    _BIN_EDGES = [0, 7, 9, 12, 15, 17, 20, 24]
    _BIN_LABELS = ['0-7', '7-9', '9-12', '12-15', '15-17', '17-20', '20-23']

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the 'TimeOfDay' category column added.

        Raises
        ------
        ValueError
            If ``X`` has no 'Hour' column.
        """
        if 'Hour' not in X.columns:
            raise ValueError("The input DataFrame must contain an 'Hour' column.")

        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()

        X['TimeOfDay'] = pd.cut(
            X['Hour'],
            bins=self._BIN_EDGES,
            labels=self._BIN_LABELS,
            right=False,
        )

        return X

Split the dataset's columns into numerical, categorical, and text groups so that a separate pipeline can be built for each

In [458]:
# Partition the raw feature frame X by dtype. Each result is a DataFrame
# holding a subset of X's columns, not a list of column names.
# NOTE(review): the selection is made on X as it is before the
# feature-engineering transformers run, so columns they add (e.g. 'TimeOfDay')
# are not included; the datetime 'Call Time' column matches none of the three
# selectors, and the raw X shows no category/bool columns in its dtypes.
numerical_columns_df = X.select_dtypes(include=['number'])
categorical_columns_df = X.select_dtypes(include=['category', 'bool'])
text_columns_df = X.select_dtypes(include=['object'])

In [459]:
# Numeric features: fill missing values (NaN) with the column median, then
# standardize with StandardScaler.
numerical_transformer = Pipeline(steps=[
    ('Imputation_median',SimpleImputer(missing_values=np.nan,strategy='median')),
    ('scaler', StandardScaler())
])

In [460]:
# Categorical features: replace missing values with the constant '' and then
# one-hot encode the result.
# NOTE(review): OneHotEncoder() is left at its defaults; confirm that
# categories unseen during fit cannot appear at transform time.
categorical_transformer = Pipeline(steps=[
    ('Text_imputation', SimpleImputer(fill_value='', strategy='constant')),
    ('OneHot',OneHotEncoder())
])

In [461]:
# Free-text features: a single TF-IDF vectorization step with default settings.
text_transformer = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer())  # Convert text data into TF-IDF features
])

In [None]:
# Feature-engineering steps, applied in order:
#   'Caller Number' -> 'Area Code' / 'PhNoLastDig',
#   'Call Time'     -> 'Hour',
#   'Hour'          -> binned 'TimeOfDay'.
# The order matters: TimeOfDayTransformer raises if the 'Hour' column created
# by the preceding step is missing.
final_pipeline = Pipeline(steps=[
    ('Add_AreaCode_Column', AddAreaCodeDropNumber()),
    ('Add_Hour_Column', AddHourDropTime()),
    ('Add_time_binned', TimeOfDayTransformer()),
])

In [463]:
# ColumnTransformer selects columns by name / index / mask / callable; a
# DataFrame is not a valid column specification, so pass the column NAMES of
# the dtype-partitioned frames instead of the frames themselves.
#
# TfidfVectorizer expects a 1-D sequence of documents, so its column must be
# given as a single scalar name (a list would hand it a 2-D frame).
# 'Conversation' is the free-text column of the dataset.
#
# NOTE(review): the numeric/categorical names come from the raw feature frame,
# and ColumnTransformer drops unlisted columns by default -- confirm whether
# engineered columns such as 'Area Code', 'Hour' and 'TimeOfDay' should be
# listed here as well.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns_df.columns.tolist()),
        ('cat', categorical_transformer, categorical_columns_df.columns.tolist()),
        ('text', text_transformer, 'Conversation'),
    ]
)

In [464]:
# X_transformed = final_pipeline.fit_transform(X_resampled)
# X_transformed_df = pd.DataFrame(X_transformed)
# X_transformed_df.head()

In [465]:
# X_transformed.dtypes

In [466]:
# Display scikit-learn estimators and pipelines as diagrams in notebook output.
set_config(display="diagram")