In [1]:
# from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [3]:
# read and pre process the data set
def process_data(sms_data_str):
    """
    Split the raw SMS corpus text into parallel arrays of messages and labels.

    Each record is one line. A line whose first three characters are ``ham``
    is labelled ``'legitimate'`` and its message starts at character 4; a line
    whose first three characters are ``spa`` is labelled ``'spam'`` and its
    message starts at character 5 (i.e. the tag plus one separator character
    is stripped).

    Parameters
    ----------
    sms_data_str : str
        Entire contents of the corpus, one record per line.

    Returns
    -------
    data_records : numpy.ndarray
        1-D string array with the message text of every record.
    data_label : numpy.ndarray
        1-D string array of the same length containing ``'legitimate'``,
        ``'spam'`` or ``'N/A'`` (tag not recognised).

    Notes
    -----
    * Blank lines are skipped, so the result does not depend on whether the
      text ends with a newline (the last record is never dropped).
    * A line with an unrecognised tag is kept with label ``'N/A'`` and the
      whole line as its message, so downstream text processing never
      receives ``None``.
    * Empty input yields two empty arrays instead of raising.
    """
    # first three characters of a line -> (label, index where the message starts)
    tag_rules = {
        'ham': ('legitimate', 4),
        'spa': ('spam', 5),
    }

    samples = []
    labels = []
    for line in sms_data_str.split('\n'):
        if not line.strip():
            # covers the empty piece after a trailing newline and stray blank lines
            continue
        label, start = tag_rules.get(line[:3], ('N/A', 0))
        labels.append(label)
        samples.append(line[start:])

    data_records = np.array(samples, dtype=str)
    data_label = np.array(labels, dtype=str)

    return data_records, data_label
# extract feature from SMS
def tfidf_vectorizer(records):
    """
    Fit a TF-IDF model on `records` and return the dense feature matrix.

    Tokens are runs of ASCII letters only (see `token_pattern`), text is
    lower-cased before tokenising, and rows are left un-normalised
    (`norm=None`).

    Returns
    -------
    tuple
        ``(matrix, feature_names)`` where ``matrix`` is the dense array
        produced from the fitted transform and ``feature_names`` is the
        value of ``get_feature_names_out()`` on the fitted vectorizer.
    """
    vectorizer_options = {
        'lowercase': True,
        'token_pattern': r'\b[A-Za-z]+\b',
        'norm': None,
    }
    tfidf_model = TfidfVectorizer(**vectorizer_options)

    sparse_matrix = tfidf_model.fit_transform(records)
    dense_matrix = sparse_matrix.toarray()
    vocabulary = tfidf_model.get_feature_names_out()

    return dense_matrix, vocabulary


# reduce the dimensionality
def feature_extraction(X, n_components=5, random_state=0):
    """
    Project `X` onto its first `n_components` principal components.

    Parameters
    ----------
    X : array-like
        Feature matrix passed straight to ``PCA.fit_transform``.
    n_components : int, default 5
        Number of principal components to keep.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``PCA``. It only matters when PCA picks a
        stochastic SVD solver; fixing it makes the projection reproducible
        across kernel restarts. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    The transformed data returned by ``PCA.fit_transform``.
    """
    reduction_pca = PCA(
        n_components=n_components,
        whiten=False,
        random_state=random_state,
    )
    return reduction_pca.fit_transform(X)

# select the feature 
def feature_selection(df_records, labels, n_components=5, random_state=0):
    """
    Keep the `n_components` columns of `df_records` that score highest on
    mutual information with `labels`.

    Parameters
    ----------
    df_records : array-like or DataFrame
        Feature matrix handed to ``SelectKBest.fit_transform``.
    labels : array-like
        Target values used for scoring.
    n_components : int, default 5
        Number of features to keep (passed as ``k`` to ``SelectKBest``).
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``mutual_info_classif``. Fixing it keeps the set of
        selected features stable from run to run; pass ``None`` for unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(selected_features, selected_feature_names)`` where the names come
        from ``get_feature_names_out()`` on the fitted selector.
    """
    def _seeded_mutual_info(X, y):
        # SelectKBest calls score_func(X, y); bind the seed here.
        return mutual_info_classif(X, y, random_state=random_state)

    feature_selection_model = SelectKBest(_seeded_mutual_info, k=n_components)
    ## make a selection over the best features
    selected_record_features = feature_selection_model.fit_transform(df_records, labels)

    return selected_record_features, feature_selection_model.get_feature_names_out()

In [4]:
# read data
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
sms_data_str = None
with open('SMSSpamCollection', encoding='utf-8') as file:
    sms_data_str = file.read()

In [5]:
# extract features
records, labels = process_data(sms_data_str)

## drop records whose tag could not be parsed ('N/A'); otherwise they would be
## silently encoded as spam by the binary encoding below
known_mask = labels != 'N/A'
records, labels = records[known_mask], labels[known_mask]

records_vectorized, feature_names = tfidf_vectorizer(records)

## binary label encoding: 0 = legitimate, 1 = spam
labels = (labels == 'spam').astype(int)

## reducing dimension
records_dim_reduced = feature_extraction(records_vectorized)

In [6]:
# feature extraction: preview the first five rows of the PCA-reduced matrix
records_dim_reduced[:5]

array([[-1.85632519,  0.28319004, -1.18534515,  0.83016254,  0.7252797 ],
       [-2.78398453,  0.52105391, -1.74256448,  0.50329886, -0.73155797],
       [ 0.48303433, -0.03795902,  2.00931372, -6.52083596,  0.96973512],
       [-1.83556457,  1.1391571 , -3.93225827, -0.18171667, -1.97877552],
       [ 0.27686514, -0.77516961,  0.11315671,  1.31308522, -0.80324805]])

In [None]:
# Standard deviation and mean of each row of the reduced matrix, computed with
# vectorised axis reductions instead of a Python loop over rows.
info = pd.DataFrame({
    "std": np.std(records_dim_reduced, axis=1),
    "mean": np.mean(records_dim_reduced, axis=1),
})

info


In [None]:
# Summary statistics of the `std` and `mean` columns of `info`.
info.describe()

In [None]:
def isosceles_triangular_fuzzy(x, m, s):
    """
    Isosceles triangular membership function centred on `m` with half-width `s`.

    The membership is 1 at ``x == m``, falls linearly to 0 at ``m - s`` and
    ``m + s``, and is 0 outside that interval, i.e. ``max(1 - |x - m| / s, 0)``.

    Works element-wise, so `x` may be a scalar or a NumPy array.
    `s` must be non-zero (it is used as a divisor).
    """
    offset = (x - m) / s
    # np.minimum / np.maximum are the element-wise forms; np.min / np.max would
    # interpret their second positional argument as an axis.
    return np.maximum(np.minimum(1 + offset, 1 - offset), 0)
def trapezoids_of_Malzawie_fuzzy(x, m, s):
    """
    Saturating ramp membership function: ``(x - m) / s`` clamped to [0, 1].

    For positive `s` the membership is 0 for ``x <= m``, rises linearly and
    stays at 1 for ``x >= m + s``.

    Works element-wise, so `x` may be a scalar or a NumPy array.
    `s` must be non-zero (it is used as a divisor).
    """
    # Element-wise clamp; np.min / np.max would treat the constants 1 and 0 as
    # axis arguments rather than as bounds.
    return np.maximum(np.minimum((x - m) / s, 1), 0)
def gaussian_fuzzy(x, m, s):
    """
    Gaussian membership function: ``exp(-0.5 * ((x - m) / s) ** 2)``.

    The value is 1 at ``x == m`` and decays symmetrically on both sides;
    `s` scales the width and is used as a divisor.
    """
    standardized = (x - m) / s
    exponent = -0.5 * standardized ** 2
    return np.exp(exponent)
def sigmoid_fuzzy(x, m, s):
    """
    Sigmoid membership function: ``1 / (1 + exp((x - m) / s))``.

    The value is 0.5 at ``x == m``. With a positive `s` the curve decreases
    as `x` grows; a negative `s` flips it. `s` is used as a divisor.
    """
    scaled = (x - m) / s
    denominator = 1 + np.exp(scaled)
    return 1 / denominator
# def Fc(R,numberOfrule):
    
    
# def Fneg(Ruls,numberOfrule):
#     for i in range(numberOfrule):
#         pass

def gR(xP, muA):
    """
    Multiply together the pairwise products ``muA[i] * xP[i]``.

    The number of factors is ``len(xP)``; `muA` is indexed at the same
    positions. Returns 1 when `xP` is empty.
    """
    product = 1
    for position, x_value in enumerate(xP):
        product *= muA[position] * x_value
    return product

def gC(LOFL_xP, LOFL_muA, numberOfrule):
    """
    Sum ``gR(LOFL_xP[i], LOFL_muA[i])`` over every position of `LOFL_xP`.

    `numberOfrule` is accepted but not used by the computation.
    Returns 0 when `LOFL_xP` is empty.
    """
    total = 0
    for position, xP in enumerate(LOFL_xP):
        total += gR(xP, LOFL_muA[position])
    return total


def Fc(LOFL_xP, LOFL_muA, numberOfrule):
    """
    Accumulate ``gR`` over the paired entries of `LOFL_xP` and `LOFL_muA`.

    One term is added per entry of `LOFL_xP`; `numberOfrule` is part of the
    signature but does not influence the result. An empty `LOFL_xP` gives 0.
    """
    accumulated = 0
    rule_count = len(LOFL_xP)
    for rule_idx in range(rule_count):
        rule_score = gR(LOFL_xP[rule_idx], LOFL_muA[rule_idx])
        accumulated += rule_score
    return accumulated

def Fneg(LOFL_xP, LOFL_muA, number_another_rule):   # r = number_another_rule
    """
    Return ``Fc(LOFL_xP, LOFL_muA, number_another_rule)`` scaled by
    ``1 / number_another_rule``.

    Parameters
    ----------
    LOFL_xP, LOFL_muA
        Forwarded unchanged to ``Fc``.
    number_another_rule : number
        The divisor ``r``; must be non-zero, otherwise ``ZeroDivisionError``
        is raised.
    """
    # The second parameter previously repeated the name LOFL_xP (a syntax
    # error) and the body used the undefined names `r` and `LOFL_muA`.
    return (1 / number_another_rule) * Fc(LOFL_xP, LOFL_muA, number_another_rule)
    


In [None]:
# Wrap the TF-IDF matrix in a DataFrame so each column carries its feature name.
records_vectorized = pd.DataFrame(data=records_vectorized, columns=feature_names)

# Select the most important features: the data set has many features, and
# feature_selection keeps only a few of them (5 by default).
records_selection, feature_name_selection = feature_selection(
    df_records=records_vectorized,
    labels=labels,
)

In [None]:
## for better visualization
# feature selection: show the first rows of the selected features, with the
# selected feature names as column headers
pd.DataFrame(records_selection, columns=feature_name_selection).head()

In [None]:
## TODO: build a fuzzy rule-based model for (records, label)
