In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the email dataset (CSV) straight from the OmkarPathak/Playing-with-datasets
# GitHub repository; this cell needs network access.
df = pd.read_csv("https://raw.githubusercontent.com/OmkarPathak/Playing-with-datasets/master/Email%20Spam%20Filtering/emails.csv")

In [4]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
df.shape

(5728, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [7]:
df["spam"].value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [8]:
df["text"][4]

"Subject: do not have money , get software cds from here !  software compatibility . . . . ain ' t it great ?  grow old along with me the best is yet to be .  all tradgedies are finish ' d by death . all comedies are ended by marriage ."

In [9]:
def remove_first_words(sentence, num_words=1):
    """Return ``sentence`` without its first ``num_words`` whitespace-separated tokens.

    The remaining tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed. If the sentence has ``num_words``
    tokens or fewer, an empty string is returned.
    """
    tokens = sentence.split()
    return ' '.join(tokens[num_words:]) if len(tokens) > num_words else ""

# Drop the first whitespace-separated token of every email (num_words defaults to 1).
df['modified_text'] = df['text'].apply(remove_first_words)

In [41]:
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [10]:
df["modified_text"][4]

"do not have money , get software cds from here ! software compatibility . . . . ain ' t it great ? grow old along with me the best is yet to be . all tradgedies are finish ' d by death . all comedies are ended by marriage ."

In [12]:
del df["text"]

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def preprocess_sentence(sentence):
    """Return ``sentence`` with English stop words removed and the rest lemmatized.

    The sentence is tokenized with ``word_tokenize``. Tokens whose lower-cased
    form appears in ``stopwords.words("english")`` are discarded, every
    remaining token is passed through ``WordNetLemmatizer.lemmatize``, and the
    results are joined with single spaces.
    """
    tokens = word_tokenize(sentence)
    wordnet = WordNetLemmatizer()
    english_stops = set(stopwords.words("english"))

    kept = []
    for token in tokens:
        # Membership is tested on the lower-cased token; the token itself is
        # lemmatized in its original casing.
        if token.lower() in english_stops:
            continue
        kept.append(wordnet.lemmatize(token))

    return ' '.join(kept)

# NOTE(review): this applies remove_first_words (not preprocess_sentence, which is
# defined above but never called here), so a second leading token is dropped and no
# stop-word removal or lemmatization takes place — confirm this is intended.
df['modified1_text'] = df['modified_text'].apply(remove_first_words)

In [15]:
df

Unnamed: 0,spam,modified_text,modified1_text
0,1,naturally irresistible your corporate identity...,irresistible your corporate identity lt is rea...
1,1,the stock trading gunslinger fanny is merrill ...,stock trading gunslinger fanny is merrill but ...
2,1,unbelievable new homes made easy im wanting to...,new homes made easy im wanting to show you thi...
3,1,4 color printing special request additional in...,color printing special request additional info...
4,1,"do not have money , get software cds from here...","not have money , get software cds from here ! ..."
...,...,...,...
5723,0,re : research and development charges to gpg h...,: research and development charges to gpg here...
5724,0,"re : receipts from visit jim , thanks again fo...",": receipts from visit jim , thanks again for t..."
5725,0,re : enron case study update wow ! all on the ...,: enron case study update wow ! all on the sam...
5726,0,"re : interest david , please , call shirley cr...",": interest david , please , call shirley crens..."


In [16]:
del df["modified_text"]

In [17]:
df["modified1_text"][5722]

': vacation vince : i just found out that it is friday , april 7 and not friday , march 31 st that i want to take for vacation . is this alright ? thanks ! shirley vince j kaminski 03 / 08 / 2000 06 : 18 pm to : shirley crenshaw / hou / ect @ ect cc : subject : re : vacation shirley , no problem . vince shirley crenshaw 03 / 08 / 2000 03 : 56 pm to : vince j kaminski / hou / ect @ ect cc : kevin g moore / hou / ect @ ect , william smith / corp / enron @ enron subject : vacation vince : i would like to take the following days as vacation : wednesday , march 15 th friday , march 31 st . please let me know if this is ok with you . thanks ! shirley'

In [18]:
import pandas as pd

def calculate_text_features(text):
    """Compute simple length statistics for ``text``.

    Returns a ``pd.Series`` indexed by ``char_count`` (length of the raw
    string), ``word_count`` (number of whitespace-separated tokens) and
    ``avg_word_length`` (mean token length, or 0 when there are no tokens).
    """
    tokens = text.split()
    n_chars = len(text)
    n_tokens = len(tokens)

    # Guard the division so empty / whitespace-only text yields 0.
    if n_tokens > 0:
        mean_token_len = sum(map(len, tokens)) / n_tokens
    else:
        mean_token_len = 0

    feature_names = ['char_count', 'word_count', 'avg_word_length']
    return pd.Series([n_chars, n_tokens, mean_token_len], index=feature_names)

# Compute one Series of length features per email, then stack them row-wise
# into a DataFrame (one row per email, one column per feature).
data = list(map(calculate_text_features, df["modified1_text"]))
df1 = pd.DataFrame(data)

print(df1)


      char_count  word_count  avg_word_length
0         1447.0       322.0         3.496894
1          581.0        87.0         5.689655
2          417.0        85.0         3.917647
3          482.0        96.0         4.031250
4          220.0        50.0         3.420000
...          ...         ...              ...
5723      1151.0       295.0         2.905085
5724      1121.0       242.0         3.636364
5725      2043.0       513.0         2.984405
5726      1012.0       274.0         2.697080
5727      2269.0       442.0         4.135747

[5728 rows x 3 columns]


In [19]:
# Place the per-email length features next to the existing columns (column-wise concat).
new_df = pd.concat([df , df1] , axis = 1)

In [20]:
new_df.head()

Unnamed: 0,spam,modified1_text,char_count,word_count,avg_word_length
0,1,irresistible your corporate identity lt is rea...,1447.0,322.0,3.496894
1,1,stock trading gunslinger fanny is merrill but ...,581.0,87.0,5.689655
2,1,new homes made easy im wanting to show you thi...,417.0,85.0,3.917647
3,1,color printing special request additional info...,482.0,96.0,4.03125
4,1,"not have money , get software cds from here ! ...",220.0,50.0,3.42


In [21]:
import pandas as pd

def count_spam_symbols(text):
    """Count occurrences of a fixed set of punctuation symbols in ``text``.

    Returns a ``pd.Series`` indexed by the symbols
    ``! $ % * ? + #`` (in that order) whose values are the number of times
    each symbol appears in ``text``.
    """
    tallies = {}
    for mark in ('!', '$', '%', '*', '?', '+', '#'):
        tallies[mark] = text.count(mark)
    return pd.Series(tallies)

# Cast the text column to str so count_spam_symbols can call str.count on every value.
new_df["modified1_text"] = new_df["modified1_text"].astype(str)  # Ensure the column is treated as strings

# One row of symbol counts per email; the column labels are the symbols themselves.
symbol_count_df = new_df["modified1_text"].apply(count_spam_symbols)
# NOTE(review): new_df is reassigned with the joined columns, so running this cell a
# second time joins onto a frame that already has these column names — verify re-runs.
new_df = new_df.join(symbol_count_df)

In [22]:
new_df.head()

Unnamed: 0,spam,modified1_text,char_count,word_count,avg_word_length,!,$,%,*,?,+,#
0,1,irresistible your corporate identity lt is rea...,1447.0,322.0,3.496894,0,0,1,0,0,0,0
1,1,stock trading gunslinger fanny is merrill but ...,581.0,87.0,5.689655,0,0,0,0,0,0,0
2,1,new homes made easy im wanting to show you thi...,417.0,85.0,3.917647,0,1,0,0,0,0,0
3,1,color printing special request additional info...,482.0,96.0,4.03125,2,0,0,0,0,0,0
4,1,"not have money , get software cds from here ! ...",220.0,50.0,3.42,1,0,0,0,1,0,0


In [23]:
import pandas as pd

def count_common_spam_words(text, common_spam_words):
    """Count how often each entry of ``common_spam_words`` occurs in ``text``.

    ``text`` is lower-cased and split on whitespace once. A single-word entry
    is counted as the number of tokens equal to it. A multi-word entry (for
    example ``'act now'``) is counted as the number of positions at which its
    words appear as consecutive tokens; a plain per-token count could never
    match such an entry and would always report 0.

    Entries are compared as given (they are not lower-cased), so they should
    already be lower-case.

    Parameters
    ----------
    text : str
        The text to scan.
    common_spam_words : iterable of str
        Words or space-separated phrases to count.

    Returns
    -------
    pd.Series
        Counts indexed by the entries of ``common_spam_words``.
    """
    # Tokenize once instead of once per entry.
    tokens = text.lower().split()

    def occurrences(term):
        parts = term.split()
        width = len(parts)
        if width <= 1:
            # Single token (or empty entry): plain token count.
            return tokens.count(term)
        # Phrase: slide a window of `width` tokens across the text.
        return sum(
            1
            for start in range(len(tokens) - width + 1)
            if tokens[start:start + width] == parts
        )

    word_count = {word: occurrences(word) for word in common_spam_words}
    return pd.Series(word_count)

# Terms to count per email; each entry becomes one feature column named after the entry.
common_spam_words = [
    'free', 'urgent', 'limited', 'offer', 'discount', 'click', 'money',
    'win', 'prize', 'congratulations', 'exclusive', 'amazing', 'guaranteed',
    'opportunity', 'act now', 'buy', 'sale', 'cash', 'earn', 'income',
    'credit', 'loan', 'investment', 'million', 'billion', 'order', 'online',
    'deal', 'save', 'best price', 'cheap', 'satisfaction', 'hidden', 'risk-free',
    'promise', 'unlimited', 'extra', 'bonus', 'gift', 'trial', 'you have been selected'
]

# Cast to str again (the symbol-count cell already did this) so .lower()/.split() can be called.
new_df["modified1_text"] = new_df["modified1_text"].astype(str)  # Ensure the column is treated as strings

# Count every listed term per email and append the counts to new_df as new columns.
word_count_df = new_df["modified1_text"].apply(count_common_spam_words, common_spam_words=common_spam_words)
new_df = new_df.join(word_count_df)

print(new_df)


      spam                                     modified1_text  char_count  \
0        1  irresistible your corporate identity lt is rea...      1447.0   
1        1  stock trading gunslinger fanny is merrill but ...       581.0   
2        1  new homes made easy im wanting to show you thi...       417.0   
3        1  color printing special request additional info...       482.0   
4        1  not have money , get software cds from here ! ...       220.0   
...    ...                                                ...         ...   
5723     0  : research and development charges to gpg here...      1151.0   
5724     0  : receipts from visit jim , thanks again for t...      1121.0   
5725     0  : enron case study update wow ! all on the sam...      2043.0   
5726     0  : interest david , please , call shirley crens...      1012.0   
5727     0  : aurora 5 . 2 update aurora version 5 . 2 - t...      2269.0   

      word_count  avg_word_length  !  $  %  *  ?  ...  satisfaction  hidden

In [24]:
new_df.head()

Unnamed: 0,spam,modified1_text,char_count,word_count,avg_word_length,!,$,%,*,?,...,satisfaction,hidden,risk-free,promise,unlimited,extra,bonus,gift,trial,you have been selected
0,1,irresistible your corporate identity lt is rea...,1447.0,322.0,3.496894,0,0,1,0,0,...,1,0,0,2,1,1,0,0,0,0
1,1,stock trading gunslinger fanny is merrill but ...,581.0,87.0,5.689655,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,new homes made easy im wanting to show you thi...,417.0,85.0,3.917647,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,color printing special request additional info...,482.0,96.0,4.03125,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,"not have money , get software cds from here ! ...",220.0,50.0,3.42,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [26]:
# Cap the TF-IDF vocabulary at 5000 terms.
vectorizer = TfidfVectorizer(max_features = 5000)

# Fit and transform the text data using TF-IDF
# NOTE(review): the vectorizer is fit on every row, before the train/test split made
# later in this notebook, so held-out emails contribute to the vocabulary and IDF
# weights — consider fitting on the training split only.
tfidf_matrix = vectorizer.fit_transform(new_df['modified1_text'])

# Convert the TF-IDF matrix to a DataFrame
# (toarray() turns the matrix into a dense array; one column per vocabulary term)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Concatenate the TF-IDF DataFrame with the existing DataFrame
# NOTE(review): TF-IDF term names may coincide with columns already in new_df
# (e.g. the spam-word count columns), producing duplicate labels — TODO confirm.
new_df = pd.concat([new_df, tfidf_df], axis=1)

new_df

Unnamed: 0,spam,modified1_text,char_count,word_count,avg_word_length,!,$,%,*,?,...,zadorozhny,ze,zero,zhang,zhendong,zimin,zip,ziplip,zipter,zone
0,1,irresistible your corporate identity lt is rea...,1447.0,322.0,3.496894,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,stock trading gunslinger fanny is merrill but ...,581.0,87.0,5.689655,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,new homes made easy im wanting to show you thi...,417.0,85.0,3.917647,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,color printing special request additional info...,482.0,96.0,4.031250,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,"not have money , get software cds from here ! ...",220.0,50.0,3.420000,1,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5723,0,: research and development charges to gpg here...,1151.0,295.0,2.905085,1,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5724,0,": receipts from visit jim , thanks again for t...",1121.0,242.0,3.636364,0,0,0,0,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5725,0,: enron case study update wow ! all on the sam...,2043.0,513.0,2.984405,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5726,0,": interest david , please , call shirley crens...",1012.0,274.0,2.697080,0,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
del new_df["modified1_text"]

In [30]:
new_df.head()

Unnamed: 0,spam,char_count,word_count,avg_word_length,!,$,%,*,?,+,...,zadorozhny,ze,zero,zhang,zhendong,zimin,zip,ziplip,zipter,zone
0,1,1447.0,322.0,3.496894,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,581.0,87.0,5.689655,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,417.0,85.0,3.917647,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,482.0,96.0,4.03125,2,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,220.0,50.0,3.42,1,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
X = new_df.iloc[:,1:]

In [32]:
X

Unnamed: 0,char_count,word_count,avg_word_length,!,$,%,*,?,+,#,...,zadorozhny,ze,zero,zhang,zhendong,zimin,zip,ziplip,zipter,zone
0,1447.0,322.0,3.496894,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,581.0,87.0,5.689655,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,417.0,85.0,3.917647,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,482.0,96.0,4.031250,2,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,220.0,50.0,3.420000,1,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5723,1151.0,295.0,2.905085,1,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5724,1121.0,242.0,3.636364,0,0,0,0,7,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5725,2043.0,513.0,2.984405,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5726,1012.0,274.0,2.697080,0,0,0,0,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
y = new_df.iloc[:,0]

In [34]:
y

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: spam, Length: 5728, dtype: int64

In [38]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train and evaluate a Multinomial Naive Bayes classifier on the feature matrix X
# and the label vector y defined in the cells above.

# Split the data into training and testing sets
# (20% of the rows are held out; random_state pins the split)
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.2, random_state=42)

# Create a Multinomial Naive Bayes classifier
clf = MultinomialNB()

# Train the classifier on the training data
# (the return value of fit is bound to `model`, which a later cell uses for prediction)
model = clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.9205933682373473
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95       856
           1       0.84      0.84      0.84       290

    accuracy                           0.92      1146
   macro avg       0.89      0.90      0.90      1146
weighted avg       0.92      0.92      0.92      1146



In [39]:
# Recompute and print the test accuracy from the same y_test / y_pred as the
# training cell above (this repeats that cell's accuracy calculation).
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9205933682373473


In [49]:
# Feature values of the first row (every column after the first) as a plain
# Python list. The name `l` is kept because the prediction cell below reads it.
l = new_df.iloc[0, 1:].tolist()

In [54]:
# Predict the label of the first row. A one-row DataFrame slice (rather than a
# bare list of values) keeps the column names the model was fitted with.
model.predict(new_df.iloc[[0], 1:])



array([1])