In [1]:
# from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [3]:
# read and pre process the data set
def process_data(sms_data_str):
    """
    Split the raw SMS corpus text into a message array and a label array.

    Every non-empty line is treated as one record. A record is classified by
    its first three characters:

    * ``'ham'`` -> label ``'legitimate'``, message is the text after the
      4-character prefix (tag plus one separator character);
    * ``'spa'`` -> label ``'spam'``, message is the text after the
      5-character prefix;
    * anything else -> label ``'N/A'`` and the whole line is kept as the
      message, so downstream text processing always receives a string.

    Parameters
    ----------
    sms_data_str : str
        The whole corpus as one string, one record per line ('\\n' separated).
        A trailing newline is optional.

    Returns
    -------
    data_records : numpy.ndarray of str
        Message text of each record, in input order.
    data_label : numpy.ndarray of str
        Label of each record, aligned with ``data_records``.
        Both arrays are empty when the input holds no records.
    """
    # first three characters -> (label, number of leading characters to drop)
    prefix_rules = {
        'ham': ('legitimate', 4),
        'spa': ('spam', 5),
    }

    samples = []
    labels = []

    # Split on '\n' only (not str.splitlines) so that unusual control
    # characters inside a message cannot break one record into several.
    for line in sms_data_str.split('\n'):
        if not line:
            # Blank lines (including the piece after a trailing newline)
            # carry no record.
            continue
        label, offset = prefix_rules.get(line[:3], ('N/A', 0))
        labels.append(label)
        samples.append(line[offset:])

    # dtype=str keeps the result a string array even when there are no records.
    data_records = np.array(samples, dtype=str)
    data_label = np.array(labels, dtype=str)

    return data_records, data_label
# extract feature from SMS
def tfidf_vectorizer(records):
    """
    Fit a TF-IDF model on ``records`` and return the dense feature matrix.

    Text is lower-cased, tokens are runs of ASCII letters delimited by word
    boundaries, and ``norm=None`` is passed to the vectorizer.

    Parameters
    ----------
    records : iterable of str
        The documents (SMS messages) to vectorize.

    Returns
    -------
    tuple
        ``(matrix, feature_names)`` where ``matrix`` is the fitted TF-IDF
        output converted with ``toarray()`` and ``feature_names`` is the
        result of ``get_feature_names_out()``.
    """
    tfidf_options = {
        'lowercase': True,
        'token_pattern': r'\b[A-Za-z]+\b',
        'norm': None,
    }
    tfidf_model = TfidfVectorizer(**tfidf_options)

    tfidf_matrix = tfidf_model.fit_transform(records)
    dense_matrix = tfidf_matrix.toarray()
    vocabulary = tfidf_model.get_feature_names_out()

    return dense_matrix, vocabulary


# reduce the dimensionality with PCA
def feature_extraction(X, n_components=5, random_state=0):
    """
    Project ``X`` onto its leading principal components with PCA.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    n_components : int, default 5
        Number of principal components to keep.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``PCA``. PCA may use a randomized SVD solver for
        large inputs, so a fixed seed makes the reduced matrix reproducible
        across kernel restarts. Pass ``None`` for an unseeded run.

    Returns
    -------
    array
        The output of ``PCA.fit_transform(X)`` (no whitening applied).
    """
    reduction_pca = PCA(
        n_components=n_components,
        whiten=False,
        random_state=random_state,
    )
    return reduction_pca.fit_transform(X)

# select the feature 
def feature_selection(df_records, labels, n_components=5, random_state=0):
    """
    Keep the ``n_components`` features that score highest on mutual information.

    Parameters
    ----------
    df_records : DataFrame or array-like
        Feature matrix, one row per sample. Passing a DataFrame lets the
        returned names be the column names.
    labels : array-like
        Class label of each row.
    n_components : int, default 5
        Number of features to keep (passed to ``SelectKBest`` as ``k``).
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``mutual_info_classif``. Its estimate involves
        random noise, so without a seed the selected features can change
        from run to run. Pass ``None`` for an unseeded run.

    Returns
    -------
    tuple
        ``(selected_features, selected_feature_names)`` from
        ``SelectKBest.fit_transform`` and ``get_feature_names_out``.
    """
    def seeded_mutual_info(X, y):
        # Same scorer as before, with the seed pinned for reproducibility.
        return mutual_info_classif(X, y, random_state=random_state)

    feature_selection_model = SelectKBest(seeded_mutual_info, k=n_components)
    ## make a selection over the best features
    selected_record_features = feature_selection_model.fit_transform(df_records, labels)

    return selected_record_features, feature_selection_model.get_feature_names_out()

In [4]:
# read data 
# Reset first so a failed read cannot leave a corpus from an earlier run behind.
sms_data_str = None
# The encoding is explicit so the result does not depend on the platform's
# default text encoding (assumes the corpus file is UTF-8 — TODO confirm).
with open('SMSSpamCollection', encoding='utf-8') as file:
    sms_data_str = file.read()

In [5]:
# Parse the raw corpus, then turn every message into a TF-IDF vector.
records, labels = process_data(sms_data_str)
records_vectorized, feature_names = tfidf_vectorizer(records)

## Binary-encode the labels: 0 for 'legitimate', 1 for every other label.
labels = np.where(labels == 'legitimate', 0, 1)

## Reduce the TF-IDF matrix to a handful of PCA components.
records_dim_reduced = feature_extraction(records_vectorized)

In [6]:
# feature extraction: preview the first five rows of the PCA-reduced records
records_dim_reduced[:5]

array([[-1.85805084,  0.27615314, -1.18714228,  0.81603176,  0.72641897],
       [-2.78411056,  0.50266646, -1.74897742,  0.50690102, -0.74297103],
       [ 0.48739438, -0.01476036,  2.01796402, -6.52155914,  0.99101557],
       [-1.83687688,  1.102353  , -3.93619492, -0.18240574, -2.0030695 ],
       [ 0.27471883, -0.7762637 ,  0.09608273,  1.30213264, -0.79091957]])

In [10]:
# Per-record standard deviation and mean across the reduced components,
# computed with row-wise (axis=1) reductions instead of a Python loop.
info = pd.DataFrame({
    "std": np.std(records_dim_reduced, axis=1),
    "mean": np.mean(records_dim_reduced, axis=1),
})

info


Unnamed: 0,std,mean
0,1.079871,-0.245318
1,1.283061,-0.853298
2,3.032311,-0.607989
3,1.716217,-1.371239
4,0.775391,0.021150
...,...,...
5569,3.049917,-1.540793
5570,0.943773,-0.115412
5571,0.624204,0.074031
5572,1.821245,0.725202


In [12]:
# Summary statistics of the "std" and "mean" columns of `info`.
info.describe()

Unnamed: 0,std,mean
count,5574.0,5574.0
mean,1.521393,3.360645e-16
std,1.464705,1.055984
min,0.164556,-5.446162
25%,0.884471,-0.3999221
50%,1.145208,-0.1647461
75%,1.734915,0.1724217
max,34.237967,34.27104


In [14]:
def isosceles_triangular_fuzzy(x, m, s):
    """
    Isosceles-triangle membership function.

    The triangle peaks at ``x == m`` with membership 1 and falls linearly to
    0 at ``m - s`` and ``m + s``; outside that interval the membership is 0.
    (``m`` is read as the centre and ``s`` as the half-width, matching how
    the other membership functions in this cell use ``m`` and ``s``.)

    Parameters
    ----------
    x : float or numpy.ndarray
        Input value(s).
    m : float
        Centre of the triangle.
    s : float
        Half-width of the base; expected to be positive and non-zero.

    Returns
    -------
    float or numpy.ndarray
        Membership degree(s) in [0, 1], same shape as ``x``.
    """
    # np.maximum is element-wise; np.max(a, 0) would treat 0 as an axis.
    return np.maximum(1 - np.abs(x - m) / s, 0)
def trapezoids_of_Malzawie_fuzzy(x, m, s):
    """
    Ramp-shaped (right-angled trapezoid) membership function.

    The membership is 0 for ``x <= m``, rises linearly as ``(x - m) / s``
    and saturates at 1 for ``x >= m + s``.

    Parameters
    ----------
    x : float or numpy.ndarray
        Input value(s).
    m : float
        Point where the ramp starts to rise.
    s : float
        Width of the rising edge; expected to be positive and non-zero.

    Returns
    -------
    float or numpy.ndarray
        Membership degree(s) in [0, 1], same shape as ``x``.
    """
    # Element-wise clamp: np.min(a, 1) / np.max(a, 0) would interpret the
    # second argument as an axis instead of a bound.
    return np.maximum(np.minimum((x - m) / s, 1), 0)
def gaussian_fuzzy(x, m, s):
    """
    Gaussian membership function ``exp(-0.5 * ((x - m) / s) ** 2)``.

    Equals 1 at ``x == m`` and decays symmetrically on both sides; ``s``
    scales the distance from ``m`` and must be non-zero.
    """
    scaled_offset = (x - m) / s
    exponent = -0.5 * scaled_offset ** 2
    return np.exp(exponent)
def sigmoid_fuzzy(x, m, s):
    """
    Sigmoid membership function ``1 / (1 + exp((x - m) / s))``.

    The value is 0.5 at ``x == m``. For positive ``s`` it falls from 1
    toward 0 as ``x`` grows; a negative ``s`` reverses the direction.
    ``s`` must be non-zero.
    """
    scaled_offset = (x - m) / s
    denominator = 1 + np.exp(scaled_offset)
    return 1 / denominator
def Fc(R, numberOfrule):
    """
    Placeholder for a rule-based computation that has not been written yet.

    The original definition had no body, which made the whole cell a syntax
    error. Until the logic is supplied, calling this function fails loudly
    instead of silently returning nothing.

    Parameters
    ----------
    R : unspecified
        Presumably the rule set to evaluate — TODO confirm.
    numberOfrule : unspecified
        Presumably the number of rules in ``R`` — TODO confirm.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("Fc is not implemented yet")

def Fneg(Ruls, numberOfrule):
    """
    Stub: steps through ``range(numberOfrule)`` without doing any work.

    ``Ruls`` is not used yet and the function returns ``None``.
    """
    for _rule_index in range(numberOfrule):
        # TODO: per-rule logic has not been written yet.
        continue
    return None


In [10]:
# Wrap the TF-IDF matrix in a DataFrame so each column is labelled with its token.
records_vectorized = pd.DataFrame(data=records_vectorized, columns=feature_names)

# Select the most informative features. The data set has many features and the
# mutual-information scoring is randomised, so the chosen columns can vary
# between runs unless the scorer is seeded.
records_selection, feature_name_selection = feature_selection(records_vectorized, labels)

In [17]:
## for better visualization: label the selected columns with their feature names
# feature selection result (first rows only)
pd.DataFrame(records_selection, columns=feature_name_selection).head()

Unnamed: 0,call,free,i,txt,www
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,4.187968,0.0,4.51406,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.992194,0.0,0.0


In [9]:
## TODO: build a fuzzy rule-based model for (records, label)
