In [1]:
# from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [3]:
# read and preprocess the data set
def process_data(sms_data_str):
    """
    convert `sms_data_str` into two parallel numpy arrays (messages, labels)

    Each record is one line of the form ``ham<sep>text`` or ``spam<sep>text``,
    where ``<sep>`` is a single separator character (one character is skipped
    after the tag, whatever it is).

    Parameters
    ----------
    sms_data_str : str
        full text of the data set, one record per line

    Returns
    -------
    data_records : numpy.ndarray of str
        message text of every recognized record, in input order
    data_label : numpy.ndarray of str
        'legitimate' for ``ham`` records and 'spam' for ``spam`` records

    Notes
    -----
    - the last line is kept even when the text does not end with a newline
    - blank lines and lines that start with neither tag are skipped, so the
      returned arrays never contain a missing message or an unknown label
    - empty input yields two empty arrays instead of raising
    """
    tag_to_label = {'ham': 'legitimate', 'spam': 'spam'}
    data_arr = []

    for line in sms_data_str.split('\n'):
        for tag, label in tag_to_label.items():
            if line.startswith(tag):
                # skip the tag plus the one separator character that follows it
                data_arr.append([label, line[len(tag) + 1:]])
                break

    # reshape keeps the (n, 2) layout even when no record was recognized,
    # so the column slicing below is always valid
    data_arr = np.array(data_arr, dtype=str).reshape(-1, 2)
    data_label = data_arr[:, 0]
    data_records = data_arr[:, 1]

    return data_records, data_label


# extract feature from SMS
def tfidf_vectorizer(records):
    """
    turn the raw texts in `records` into a dense TF-IDF matrix

    Tokens are runs of ASCII letters, lower-cased before counting; the
    rows are left un-normalized (``norm=None``).

    Returns the dense matrix together with the feature (token) names
    reported by the fitted vectorizer.
    """
    tfidf_model = TfidfVectorizer(lowercase=True, token_pattern=r'\b[A-Za-z]+\b', norm=None)
    tfidf_matrix = tfidf_model.fit_transform(records)

    dense_matrix = tfidf_matrix.toarray()
    vocabulary = tfidf_model.get_feature_names_out()
    return dense_matrix, vocabulary


# decrease the dimension
def feature_extraction(X, n_components=5, random_state=None):
    """
    reduce `X` to `n_components` columns with PCA (no whitening)

    Parameters
    ----------
    X : array-like
        feature matrix passed straight to ``PCA.fit_transform``
    n_components : int, default 5
        number of principal components to keep
    random_state : int, RandomState instance or None, default None
        forwarded to ``PCA``; pass an int to get the same projection on
        every run when a randomized SVD solver is used. ``None`` keeps the
        previous, unseeded behavior.

    Returns
    -------
    the transformed data returned by ``PCA.fit_transform``
    """
    reduction_pca = PCA(
        n_components=n_components,
        whiten=False,
        random_state=random_state,
    )
    return reduction_pca.fit_transform(X)

# select the feature 
def feature_selection(df_records, labels, n_components=5, random_state=None):
    """
    keep the `n_components` features with the highest mutual information

    Parameters
    ----------
    df_records : array-like or DataFrame
        feature matrix passed to ``SelectKBest.fit_transform``
    labels : array-like
        class label of every row of `df_records`
    n_components : int, default 5
        number of features to keep (the ``k`` of ``SelectKBest``)
    random_state : int, RandomState instance or None, default None
        forwarded to ``mutual_info_classif``; pass an int so the same
        features are selected on every run. ``None`` keeps the previous,
        unseeded behavior.

    Returns
    -------
    selected_record_features
        `df_records` restricted to the selected features
    feature names
        output of the fitted selector's ``get_feature_names_out()``
    """
    from functools import partial

    # bind the seed into the scoring function; SelectKBest calls it as f(X, y)
    score_func = partial(mutual_info_classif, random_state=random_state)
    feature_selection_model = SelectKBest(score_func, k=n_components)
    ## make a selection over the best features
    selected_record_features = feature_selection_model.fit_transform(df_records, labels)

    return selected_record_features, feature_selection_model.get_feature_names_out()

In [4]:
# read data
# The encoding is pinned so decoding does not depend on the platform locale.
# NOTE(review): assumes the SMSSpamCollection file is UTF-8 encoded - confirm.
with open('SMSSpamCollection', encoding='utf-8') as sms_file:
    sms_data_str = sms_file.read()

In [5]:
# parse the raw text, then turn the messages into TF-IDF features
records, labels = process_data(sms_data_str)
records_vectorized, feature_names = tfidf_vectorizer(records)

## encode labels as integers: 0 for 'legitimate', 1 for anything else
labels = (labels != 'legitimate').astype(int)

## reduce the dimension of the TF-IDF matrix
records_dim_reduced = feature_extraction(records_vectorized)

In [6]:
# feature extraction: preview the first five rows of the dimension-reduced data
records_dim_reduced[:5]

array([[-1.85636052,  0.2854707 , -1.18447927,  0.82078356,  0.71619143],
       [-2.7839942 ,  0.52156408, -1.74242901,  0.50236524, -0.73609322],
       [ 0.48307956, -0.03166082,  2.01336683, -6.53598307,  1.00753833],
       [-1.83558037,  1.14050545, -3.93211865, -0.18479677, -1.99163756],
       [ 0.27694587, -0.77394624,  0.11487932,  1.32002639, -0.75353337]])

In [10]:
# wrap the TF-IDF matrix in a DataFrame whose columns are the feature names
records_vectorized = pd.DataFrame(data=records_vectorized, columns=feature_names)

# select the important features (the data set has many features; only a few are kept)
# NOTE(review): mutual_info_classif is randomized unless its random_state is fixed,
# so the selected features may differ between runs - confirm against a re-run
records_selection, feature_name_selection = feature_selection(
    df_records=records_vectorized,
    labels=labels,
)

In [17]:
## feature selection result, shown as a labelled table for readability
pd.DataFrame(data=records_selection, columns=feature_name_selection).head(n=5)

Unnamed: 0,call,free,i,txt,www
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,4.187968,0.0,4.51406,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.992194,0.0,0.0


In [9]:
## TODO: build a fuzzy rule-based model for (records, label)
