In [78]:
import pandas as pd
import re
import string
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [82]:
df = pd.read_csv('/spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [83]:
df.columns

Index(['Category', 'Message'], dtype='object')

In [84]:
df.info

<bound method DataFrame.info of      Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

In [85]:
df.isna().sum()

Category    0
Message     0
dtype: int64

In [86]:
df['Category'] = df['Category'].apply(lambda x:1 if x == 'spam' else 0)
df

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [87]:
def remove_hyperlink(text):
  return re.sub(r"http\S+","",text)

def to_lower(text):
  result = text.lower()
  return result

def remove_punctuation(text):
  result = text.translate(str.maketrans(dict.fromkeys(string.punctuation)))
  return result

def remove_whitespace(text):
  result = text.strip()
  return result

def replace_newline(text):
  return text.replace('\n',"")

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.casefold() not in stopwords]
    return ' '.join(filtered_words)

def clean(sentence):
  pipe = [remove_hyperlink,
          to_lower,
          remove_punctuation,
          remove_whitespace,
          replace_newline,
          remove_stopwords
          ]

  for i in pipe:
    sentence = i(sentence)
  return sentence



In [88]:
df

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [89]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Category'], test_size=0.2, random_state=42)

# Create a word count vectorizer
vectorizer = CountVectorizer()

# Fit and transform the training data
X_train_count = vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_count = vectorizer.transform(X_test)

# Train a logistic regression classifier
classifier = LogisticRegression()
classifier.fit(X_train_count, y_train)

# Make predictions on the testing data
y_pred = classifier.predict(X_test_count)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9865470852017937


In [90]:
X_test_count

<1115x7701 sparse matrix of type '<class 'numpy.int64'>'
	with 13741 stored elements in Compressed Sparse Row format>

In [91]:
new_email = input('Enter an email: ')

Enter an email: Hi Data Pros,  ​SodaGPT -- The first generative AI for data quality has LAUNCHED!  Introducing SodaGPT, the no-code approach to self-serve data quality testing. SodaGPT lets any user translate natural English into production-ready SodaCL data quality checks. Make sure your data is valid, fresh, and fit for purpose just by asking nicely.


In [92]:
new_data = clean(new_email)
data_count = vectorizer.transform([new_data])
pred = classifier.predict(data_count)
if pred == 1:
    print("The email is spam.")
else:
    print("The email is not spam.")

The email is not spam.


In [77]:
data_count

<1x7701 sparse matrix of type '<class 'numpy.int64'>'
	with 28 stored elements in Compressed Sparse Row format>