In [12]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
import gensim
nltk.download('punkt')
nltk.download('stopwords')
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix,hstack
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ProBook\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ProBook\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
data = os.path.abspath(os.path.join('.','data', 'spam_discord_data.csv'))

In [14]:
df= pd.read_csv(data)

In [15]:
df.head()

Unnamed: 0,Channel,AuthorID,Author,Date,Content,Attachments,Reactions,Spam_or_Scam
0,Support,30a9375c-708c-4c77-99d4-48a2af6d1316,nreeves,2024-02-15 16:17:46,Walk I paper suddenly still stop.,,8,0
1,Announcements,12e7307c-df80-4557-97c4-9a9ca871a238,dmiller,2024-01-25 03:45:05,Voice television free building house step lose...,,6,0
2,General,aa9b28af-3f66-4043-9158-54e2d8ac7b6c,jonathan76,2024-02-26 18:26:26,Movie fast cold reach field girl forward best ...,http://www.arroyo-smith.biz/,7,0
3,Support,33421fc7-17f6-45c0-a23d-39889e268b98,ygarner,2024-01-28 15:17:03,Color themselves.,,5,1
4,Support,e0a02c63-63b6-4db5-a2d7-8b80b19398ec,lawrencelong,2024-03-01 00:48:48,Heavy recognize sea trip fill safe former ques...,,9,0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Channel       10000 non-null  object
 1   AuthorID      10000 non-null  object
 2   Author        10000 non-null  object
 3   Date          10000 non-null  object
 4   Content       10000 non-null  object
 5   Attachments   5052 non-null   object
 6   Reactions     10000 non-null  int64 
 7   Spam_or_Scam  10000 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 625.1+ KB


In [17]:
df = df[['Content','Spam_or_Scam']]
df.head()

Unnamed: 0,Content,Spam_or_Scam
0,Walk I paper suddenly still stop.,0
1,Voice television free building house step lose...,0
2,Movie fast cold reach field girl forward best ...,0
3,Color themselves.,1
4,Heavy recognize sea trip fill safe former ques...,0


### Data Cleaning,Preprocessing and Feature Engineering

In [18]:
import re 
class TextPreprocessor():
  

  def extract_email_ids(self,doc):
    '''This functions extract the email ids and domain names in the email adderss and returns a list of preprocessed email ids'''
    list_of_preproessed_emails = []
    list_of_emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',doc)
    doc = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'," ",doc)
    for txt in list_of_emails:
      email = re.split("[.]",re.split("@", txt)[1])
      y=email.copy()
      for i in email:
        if i=="com" or len(i)<=2:
          y.remove(i)
      email = ' '.join([str(i) for i in y])
      email = email.lower()
      list_of_preproessed_emails.append(email)
    list_of_preproessed_emails = " ".join(list_of_preproessed_emails)
    return list_of_preproessed_emails

  def text_lowercase(self,doc):
    ''' This function converts the text to lower case'''
    return doc.lower()
  def remove_digits(self, doc):
    '''This function removes all the numbers'''
    return re.sub('\d', '', doc)

  def remove_underscores(self, doc):
    '''This function removes all the underscores'''
    return re.sub(r'_', '', doc)

  def remove_excess_whitespace(self, doc):
    '''This function removes excess white spaces'''
    return re.sub('\s+', ' ', doc)

  def remove_special_characters(self, doc):
    '''This function removes all the special characters'''
    return re.sub('\W', ' ', doc)

  def remove_within_brackets(self, doc):
    '''This function removes all the content within brackets'''
    text = re.sub(r'', '', doc)
    text = re.sub(r'<[^()]*>', '', text)
    return text
  
  def expand_words(self, phrase):
    '''This function expands the short form words '''
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

  def remove_short_and_long_words(self, doc):

    '''This function removes all the short(<2 letters) and long(>15 letters) words '''
    words = doc.split()
    word_list = []
    for word in words:
      if len(word) > 2 and len(word) < 15 :
        word_list.append(word)
    return ' '.join(word_list)

  def text_lematizer(self,doc):
    '''This function lematize the words to its root words'''
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    doc = nlp(doc)
    return " ".join([token.lemma_ for token in doc])

  def clean_document(self,doc):
    '''This function cleans the documents'''
    doc = self.text_lowercase(doc)
    ids = self.extract_email_ids(doc)
    doc = self.remove_within_brackets(doc)
    doc = self.expand_words(doc)
    doc = self.remove_underscores(doc)
    doc = self.remove_special_characters(doc)
    doc = self.remove_digits(doc)
    doc = self.remove_excess_whitespace(doc)
    doc = self.remove_short_and_long_words(doc)
    doc = self.text_lematizer(doc)
    doc = ids+doc
    return doc

  

In [20]:
col = ["Word Count","Average Word length","Topic 1","Topic 2"]

In [21]:
col.extend(["tfidf_d"+str(i) for i in range(0,3286)])#column name for pandas dataframe
col.extend(["avg_wv_d"+str(i) for i in range(0,300)])
col.extend(["tfidf_w2v_d"+str(i) for i in range(0,300)])

In [22]:
tp = TextPreprocessor()
processed_text = []
for i in tqdm(df['Content']):
  processed_text.append(tp.clean_document(i))

100%|██████████| 10000/10000 [2:52:20<00:00,  1.03s/it]    


In [23]:
df['clean_text'] = processed_text

In [24]:
df.to_csv('clean_data.csv')

In [25]:
df = pd.read_csv('clean_data.csv')

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,Content,Spam_or_Scam,clean_text
0,0,Walk I paper suddenly still stop.,0,walk paper suddenly still stop
1,1,Voice television free building house step lose...,0,voice television free building house step lose...
2,2,Movie fast cold reach field girl forward best ...,0,movie fast cold reach field girl forward good ...
3,3,Color themselves.,1,color themselves
4,4,Heavy recognize sea trip fill safe former ques...,0,heavy recognize sea trip fill safe former ques...
