In [21]:
import pandas as pd 
import numpy as np
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import Counter
from collections import defaultdict

In [22]:
# Load the labelled message corpus; later cells read its 'Category' and
# 'Message' columns.
data = pd.read_csv(filepath_or_buffer='spam.csv')

## Data Cleaning and Preprocessing

In [23]:
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one message for vectorisation.

    Steps, in order: lowercase the text, delete every ASCII punctuation
    character listed in ``string.punctuation``, then drop any
    whitespace-separated token found in ``ENGLISH_STOP_WORDS``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    lowered = text.lower()
    # str.translate is used instead of re.sub(f'[{string.punctuation}]', ...):
    # inside a regex character class the unescaped "\]" pair in
    # string.punctuation is read as an escaped "]", so backslashes were
    # silently left in the text.
    without_punctuation = lowered.translate(_PUNCTUATION_TABLE)
    kept_words = [
        word for word in without_punctuation.split()
        if word not in ENGLISH_STOP_WORDS
    ]
    return ' '.join(kept_words)

# Define the remove stop words function
def remove_stop_words(text):
    """Return ``text`` with every token found in ENGLISH_STOP_WORDS removed.

    The input is split on whitespace, tokens are compared as-is (no case
    folding happens here), and the survivors are joined with single spaces.
    """
    kept = []
    for word in text.split():
        if word in ENGLISH_STOP_WORDS:
            continue
        kept.append(word)
    return ' '.join(kept)

# Clean every message, overwriting the 'Message' column.
# clean_text already filters out ENGLISH_STOP_WORDS, so chaining
# .apply(remove_stop_words) afterwards returned the same strings; that
# redundant second pass over the column is omitted.
data['Message'] = data['Message'].apply(clean_text)

print(data.head())


  Category                                            Message
0      ham  jurong point crazy available bugis n great wor...
1      ham                            ok lar joking wif u oni
2     spam  free entry 2 wkly comp win fa cup final tkts 2...
3      ham                        u dun say early hor u c say
4      ham                      nah dont think goes usf lives


## TF-IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Features are the cleaned message strings; the target is 1 where the
# category equals 'spam' and 0 for every other value.
X = data['Message']
y = data['Category'].eq('spam').astype('int64')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary (capped at 3000 terms) and IDF weights from the
# training messages only, then reuse that fitted vectorizer on the test messages.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [28]:
# Vocabulary terms, in the same order as the columns of the TF-IDF matrix.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Densify the training matrix so each row is a message and each column a term.
tfidf_df = pd.DataFrame(
    data=X_train_tfidf.toarray(),
    columns=feature_names,
)

# Preview the first few rows.
print(tfidf_df.head())

   020603  0207  02073162414  020903  0578  071104  07123456789  07734396839  \
0     0.0   0.0          0.0     0.0   0.0     0.0          0.0          0.0   
1     0.0   0.0          0.0     0.0   0.0     0.0          0.0          0.0   
2     0.0   0.0          0.0     0.0   0.0     0.0          0.0          0.0   
3     0.0   0.0          0.0     0.0   0.0     0.0          0.0          0.0   
4     0.0   0.0          0.0     0.0   0.0     0.0          0.0          0.0   

   0776xxxxxxx  07786200117  ...  youve   yr  yrs  yummy  yun  yuo  yup  zed  \
0          0.0          0.0  ...    0.0  0.0  0.0    0.0  0.0  0.0  0.0  0.0   
1          0.0          0.0  ...    0.0  0.0  0.0    0.0  0.0  0.0  0.0  0.0   
2          0.0          0.0  ...    0.0  0.0  0.0    0.0  0.0  0.0  0.0  0.0   
3          0.0          0.0  ...    0.0  0.0  0.0    0.0  0.0  0.0  0.0  0.0   
4          0.0          0.0  ...    0.0  0.0  0.0    0.0  0.0  0.0  0.0  0.0   

   zoe  üll  
0  0.0  0.0  
1  0.0  0.

In [30]:
# Inspect the first training row: keep only the terms with a positive
# TF-IDF weight and list them from highest to lowest weight.
document_vector = tfidf_df.iloc[0]
non_zero_features = document_vector.loc[document_vector > 0]
print(
    non_zero_features.sort_values(ascending=False)
)

held       0.389274
87239      0.365328
cup        0.349554
weekly     0.293788
world      0.284566
end        0.281332
100        0.275464
service    0.261294
win        0.252072
reply      0.219999
stop       0.219999
send       0.204650
Name: 0, dtype: float64
