In [1]:
import pandas as pd
import numpy as np 
import spacy

In [2]:
df = pd.read_csv('smsspamcollection.tsv', sep='\t')

In [3]:
df.head(5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

In [9]:
X_train_vec = count_vect.fit_transform(X_train)

In [10]:
X_train_vec.shape

(3733, 7082)

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_vec)
X_train_tfidf.shape

(3733, 7082)

In [17]:
print(X_train_tfidf)

  (0, 7069)	0.6019702680143677
  (0, 4415)	0.35852876712053044
  (0, 1736)	0.7135046738275388
  (1, 7048)	0.06220835924135395
  (1, 6330)	0.12175375951774331
  (1, 6250)	0.18699157878309497
  (1, 6247)	0.12959378252338027
  (1, 6219)	0.07434593703652681
  (1, 5791)	0.236698937005449
  (1, 5512)	0.15094660409003707
  (1, 5468)	0.236698937005449
  (1, 5443)	0.22545044092015623
  (1, 5437)	0.22545044092015623
  (1, 5436)	0.17424313877010147
  (1, 5243)	0.22545044092015623
  (1, 4519)	0.1982400748683877
  (1, 4518)	0.12552667891184655
  (1, 4513)	0.09521739789951615
  (1, 4489)	0.236698937005449
  (1, 4470)	0.0931011308331512
  (1, 4270)	0.08916256276356054
  (1, 4018)	0.08517903405827136
  (1, 3726)	0.20194453011537272
  (1, 3501)	0.2062210098516256
  (1, 3416)	0.08402534579064598
  :	:
  (3728, 2090)	0.220194057121089
  (3728, 1440)	0.19745041735266144
  (3728, 1080)	0.2074000350927176
  (3729, 5795)	0.5474874445685368
  (3729, 3794)	0.4832600441125685
  (3729, 3674)	0.5570538854722596
 

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
tfidf_vector = TfidfVectorizer()

In [20]:
vectorized_train = tfidf_vector.fit_transform(X_train)

In [23]:
print(vectorized_train)

  (0, 1736)	0.7135046738275388
  (0, 4415)	0.35852876712053044
  (0, 7069)	0.6019702680143677
  (1, 4519)	0.19824007486838768
  (1, 2472)	0.2174695059369183
  (1, 3726)	0.2019445301153727
  (1, 6219)	0.0743459370365268
  (1, 4018)	0.08517903405827133
  (1, 4518)	0.12552667891184652
  (1, 3416)	0.08402534579064597
  (1, 5436)	0.17424313877010142
  (1, 3116)	0.13652411260636216
  (1, 4489)	0.23669893700544894
  (1, 3501)	0.20622100985162556
  (1, 1797)	0.15290569110510804
  (1, 849)	0.15094660409003705
  (1, 1835)	0.1625121573959421
  (1, 4513)	0.09521739789951614
  (1, 5243)	0.2254504409201562
  (1, 938)	0.23669893700544894
  (1, 4470)	0.09310113083315118
  (1, 6250)	0.1869915787830949
  (1, 957)	0.09696276613143018
  (1, 7048)	0.062208359241353935
  (1, 3280)	0.10494370111466803
  :	:
  (3728, 2926)	0.1855922660561245
  (3728, 7048)	0.10087994978401515
  (3728, 3280)	0.17018155482163447
  (3729, 3674)	0.5570538854722596
  (3729, 3794)	0.4832600441125685
  (3729, 5795)	0.547487444568536

In [24]:
from sklearn.svm import LinearSVC

In [25]:
model = LinearSVC()

In [26]:
model.fit(vectorized_train,y_train)



In [43]:
X_train.shape

(3733,)

In [34]:
vectorized_X_test = tfidf_vector.fit_transform(X_test)

In [41]:
# print(vectorized_X_test)
vectorized_X_test.shape

(1839, 4695)

In [44]:
# prediction = model.predict(vectorized_X_test)

In [45]:
from sklearn.pipeline import Pipeline

In [46]:
sets = [('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC())]

In [47]:
pipeline = Pipeline(sets)

In [48]:
pipeline.fit(X_train,y_train)



In [49]:
prediction = pipeline.predict(X_test)

In [50]:
from sklearn.metrics import classification_report, accuracy_score

In [52]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

    accuracy                           0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [53]:
#  Check if we can achieve without pipelince

In [54]:
X_test_count = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_count)

In [58]:
pred = model.predict(X_test_tfidf)

In [59]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

    accuracy                           0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [60]:
X_test_tfidf_vect = tfidf_vector.transform(X_test)

In [62]:
X_test_tfidf_vect.shape

(1839, 4695)

In [63]:
x = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# Create a TF-IDF vectorizer and fit-transform the training data
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Display the shape of the TF-IDF matrix
print("Shape of X_train_tfidf:", X_train_tfidf.shape)

# Transform the test data using the same TF-IDF vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create and train the Linear Support Vector Classifier
model = LinearSVC()
model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test set: {accuracy:.2f}")

Shape of X_train_tfidf: (3733, 7082)
Accuracy on the test set: 0.99




In [66]:
pipeline.predict(['Congratulations you have won a big bonus, to claim please click on the link'])

array(['spam'], dtype=object)