In [5]:
import pandas as pd
import numpy as np

import re
import nltk

import warnings
warnings.filterwarnings("ignore")

In [6]:
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [3]:
df=pd.read_csv("/content/blogs.csv")
text_col = "Data"

In [4]:
df

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [8]:
stop_words=set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [11]:
def preprocess_text(text):
  text = str(text).lower()
  text = re.sub(r"http\S+|www\S+|https\S+", "", text)
  text = re.sub(r"[^a-z\s]", "", text)

  # Tokenization
  tokens = word_tokenize(text)

  # Remove stopwords
  tokens = [word for word in tokens if word not in stop_words]
  # Stemming
  stemmed = [stemmer.stem(word) for word in tokens]

  #Lemmatize
  lemmatized = [lemmatizer.lemmatize(word) for word in tokens]

  return {
        "clean_text": " ".join(tokens),
        "stemmed": " ".join(stemmed),
        "lemmatized": " ".join(lemmatized)
    }


In [13]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [14]:
df_preprocessed =  df[text_col].apply(preprocess_text)

In [16]:
df_out = pd.DataFrame(list(df_preprocessed))

In [17]:
df_final = pd.concat([df, df_out], axis=1)

In [18]:
df_final

Unnamed: 0,Data,Labels,clean_text,stemmed,lemmatized
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism,path cantaloupesrvcscmuedumagnesiumclubcccmued...,path cantaloupesrvcscmuedumagnesiumclubcccmued...,path cantaloupesrvcscmuedumagnesiumclubcccmued...
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism,newsgroups altatheism path cantaloupesrvcscmue...,newsgroup altath path cantaloupesrvcscmueducra...,newsgroups altatheism path cantaloupesrvcscmue...
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism,path cantaloupesrvcscmuedudasnewsharvardedunoc...,path cantaloupesrvcscmuedudasnewsharvardedunoc...,path cantaloupesrvcscmuedudasnewsharvardedunoc...
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism,path cantaloupesrvcscmuedumagnesiumclubcccmued...,path cantaloupesrvcscmuedumagnesiumclubcccmued...,path cantaloupesrvcscmuedumagnesiumclubcccmued...
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism,xref cantaloupesrvcscmuedu altatheism talkreli...,xref cantaloupesrvcscmuedu altath talkreligion...,xref cantaloupesrvcscmuedu altatheism talkreli...
...,...,...,...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc,xref cantaloupesrvcscmuedu talkabortion altath...,xref cantaloupesrvcscmuedu talkabort altath ta...,xref cantaloupesrvcscmuedu talkabortion altath...
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc,xref cantaloupesrvcscmuedu talkreligionmisc ta...,xref cantaloupesrvcscmuedu talkreligionmisc ta...,xref cantaloupesrvcscmuedu talkreligionmisc ta...
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc,xref cantaloupesrvcscmuedu talkorigins talkrel...,xref cantaloupesrvcscmuedu talkorigin talkreli...,xref cantaloupesrvcscmuedu talkorigins talkrel...
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc,xref cantaloupesrvcscmuedu talkreligionmisc al...,xref cantaloupesrvcscmuedu talkreligionmisc al...,xref cantaloupesrvcscmuedu talkreligionmisc al...


In [19]:
df_final.to_csv("preprocessed_blog.csv", index=False)

In [21]:
df_final.head()

Unnamed: 0,Data,Labels,clean_text,stemmed,lemmatized
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism,path cantaloupesrvcscmuedumagnesiumclubcccmued...,path cantaloupesrvcscmuedumagnesiumclubcccmued...,path cantaloupesrvcscmuedumagnesiumclubcccmued...
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism,newsgroups altatheism path cantaloupesrvcscmue...,newsgroup altath path cantaloupesrvcscmueducra...,newsgroups altatheism path cantaloupesrvcscmue...
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism,path cantaloupesrvcscmuedudasnewsharvardedunoc...,path cantaloupesrvcscmuedudasnewsharvardedunoc...,path cantaloupesrvcscmuedudasnewsharvardedunoc...
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism,path cantaloupesrvcscmuedumagnesiumclubcccmued...,path cantaloupesrvcscmuedumagnesiumclubcccmued...,path cantaloupesrvcscmuedumagnesiumclubcccmued...
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism,xref cantaloupesrvcscmuedu altatheism talkreli...,xref cantaloupesrvcscmuedu altath talkreligion...,xref cantaloupesrvcscmuedu altatheism talkreli...
