In [None]:
# Mount Google Drive inside the Colab runtime so the data stored there can be read.

from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# Required library packages

import pandas as pd
import numpy as np
import nltk

# Download only the NLTK resources the preprocessing cells use (the English
# stopword list and WordNet for the lemmatizer) instead of the whole 'all'
# collection, which is very large and floods the cell output with log lines.
# 'omw-1.4' is included because some NLTK releases need it to load WordNet.
# NOTE(review): add further resource ids here if other cells need them.
for resource in ("stopwords", "wordnet", "omw-1.4"):
  nltk.download(resource, quiet=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

In [None]:
# Function to clean the data using regular expressions

import re
import string

def text_clean(text):
  """Normalise one raw text string before stopword removal and lemmatization.

  The steps run in this order:
    1. lowercase the text
    2. strip HTML tags (``<...>``)
    3. drop URLs (any whitespace-delimited token containing ``http:``/``https:``)
    4. drop square-bracketed asides, e.g. ``[cheering]``
    5. strip ASCII punctuation, then curly quotes and the ellipsis character
    6. turn line breaks into spaces

  URLs are removed *before* punctuation: the URL pattern needs the ``:`` after
  the scheme, and stripping punctuation first would delete it so the URL would
  never match. HTML tags are removed before URLs so that a tag such as
  ``<a href="https://...">label</a>`` does not take its label with it.

  Args:
    text: the raw text as a ``str``.

  Returns:
    The cleaned, lowercased string. Runs of whitespace are not collapsed.
  """
  text = text.lower()                                                  # Make the text lowercase
  text = re.sub(r'<.*?>', '', text)                                    # Remove HTML tags
  text = re.sub(r"\S*https?:\S*", '', text)                            # Remove URLs while the ':' is still present
  text = re.sub(r'\[.*?\]', '', text)                                  # Remove text inside square brackets, e.g. [cheering]
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)      # Remove ASCII punctuation marks
  text = re.sub('[‘’“”…]', '', text)                                   # Remove curly quotes and the ellipsis character
  text = re.sub(r'[\r\n]+', ' ', text)                                 # Line breaks become spaces so adjacent words stay separate
  return text

In [None]:
# Load the scraped articles for the chosen file from the mounted Drive folder.

file_name = "demonetisation_ani.csv"
data_dir = "/content/drive/MyDrive/BTP/BTP_data_scraped/"
df = pd.read_csv(data_dir + file_name)

In [None]:
# Preview the first 5 rows of the raw data as loaded from the CSV
# (displayed as the cell's output).

df.head()

Unnamed: 0,web-scraper-order,web-scraper-start-url,news,news-href,title,date,content
0,1618680046-2661,https://aninews.in/search/?query=demonetisatio...,"BJP condemns 'misconceptions' about GST, demon...",https://aninews.in/news/national/general-news/...,"BJP condemns 'misconceptions' about GST, demon...","Oct 20, 2017 10:35","Chennai (Tamil Nadu) [India], Oct 20 (ANI): Bh..."
1,1618679558-2496,https://aninews.in/search/?query=demonetisatio...,Concealing facts from Parliamentary Committee ...,https://aninews.in/news/national/politics/conc...,Concealing facts from Parliamentary Committee ...,"Aug 23, 2017 11:20","New Delhi [India], Jan. 19 (ANI): Taking on Re..."
2,1618680352-2765,https://aninews.in/search/?query=demonetisatio...,"Demonetisation, a success story of India: PM Modi",https://aninews.in/news/national/general-news/...,"Demonetisation, a success story of India: PM Modi","Jan 21, 2018 21:51","New Delhi [India], Jan. 21 (ANI): Prime Minist..."
3,1618678341-2089,https://aninews.in/search/?query=demonetisatio...,Govt. forms sub-committee of CMs to tackle dem...,https://aninews.in/news/national/politics/govt...,Govt. forms sub-committee of CMs to tackle dem...,"Aug 23, 2017 11:20","New Delhi [India], Nov. 28 (ANI): In the wake ..."
4,1618678336-2087,https://aninews.in/search/?query=demonetisatio...,'It's not only one party's monopoly to end cor...,https://aninews.in/news/national/asia/039it039...,'It's not only one party's monopoly to end cor...,"Aug 23, 2017 11:20","New Delhi [India], Nov. 28 (ANI): In what may ..."


In [None]:

# Clean the data using the text_clean function written above.
clean = lambda x : text_clean(x)

# New clean data.
# Empty CSV cells are read by pandas as NaN; blank them before the string cast
# so they do not become the literal token "nan" and survive into the corpus.
df["content"] = df["content"].fillna("").astype(str)
data_clean = pd.DataFrame(df["content"].apply(clean))

df["clean_data"] = data_clean["content"]


# Remove the English stopwords from the cleaned text.
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop_lookup = set(stop)  # set membership is O(1) per word; `stop` stays a list for any other users
df["preprocess_data"] = df["clean_data"].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

# Lemmatize every remaining word.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a POS tag here, so NLTK's default POS applies.
lemm = lambda x : lemmatizer.lemmatize(x)

df["lemmatize"] = df["preprocess_data"].apply(
    lambda x: ' '.join(lemm(word) for word in x.split())
)


In [None]:
# Preview the first 5 rows after preprocessing; the frame now also carries the
# clean_data, preprocess_data and lemmatize columns added by the cell above.

df.head()

Unnamed: 0,web-scraper-order,web-scraper-start-url,news,news-href,title,date,content,clean_data,preprocess_data,lemmatize
0,1618471765-329,https://indianexpress.com/page/39/?s=%22+Uri+%...,"China in touch with India, Pakistan to bring d...",https://indianexpress.com/article/india/india-...,"China in touch with India, Pakistan to bring d...","Updated: September 29, 2016 6:24:35 pm","<img loading=""lazy"" class=""wp-image-3056192 si...",as for the tension between pakistan and india...,tension pakistan india recently chinese side c...,tension pakistan india recently chinese side c...
1,1618471024-199,https://indianexpress.com/page/48/?s=%22+Uri+%...,J&K: BSF apprehends suspected Pak terrorist tr...,https://indianexpress.com/article/india/india-...,J&K: BSF apprehends suspected Pak terrorist tr...,"Updated: September 24, 2016 1:48:59 am","<img loading=""lazy"" class=""wp-image-3046250 si...",abdul qayoom got entangled in the barbed wire...,abdul qayoom got entangled barbed wire fence a...,abdul qayoom got entangled barbed wire fence a...
2,1618472064-371,https://indianexpress.com/page/37/?s=%22+Uri+%...,"Surgical strike a right move but late, says BS...",https://indianexpress.com/article/india/india-...,"Surgical strike a right move but late, says BS...","October 1, 2016 4:19:42 am","<img loading=""lazy"" class=""wp-image-3026372 si...",by demolishing terror camps in loc the army h...,demolishing terror camps loc army fulfilled pr...,demolishing terror camp loc army fulfilled pro...
3,1618472697-484,https://indianexpress.com/page/29/?s=%22+Uri+%...,Shaan feels the Indo-Pak artist exchange has b...,https://indianexpress.com/article/entertainmen...,Shaan feels the Indo-Pak artist exchange has b...,"Updated: October 17, 2016 12:32:21 pm","<img loading=""lazy"" class=""size-full wp-image-...",shaan on indopak artist exchange they have be...,shaan indopak artist exchange banned respect d...,shaan indopak artist exchange banned respect d...
4,1618474456-828,https://indianexpress.com/page/6/?s=%22+Uri+%2...,Uri trailer: Vicky Kaushal leads the pack in t...,https://indianexpress.com/article/entertainmen...,Uri trailer: Vicky Kaushal leads the pack in t...,"Updated: December 5, 2018 11:07:41 am","<img loading=""lazy"" class=""wp-image-5479302 si...",uri trailer vicky kaushal starrer uri will re...,uri trailer vicky kaushal starrer uri release ...,uri trailer vicky kaushal starrer uri release ...


In [None]:
# Export the processed data as a CSV file named after the input file.
# The relative path means the file is written to the current working directory.

# index=False: do not write the row index as an extra, unnamed first column;
# otherwise every export/re-read cycle adds another "Unnamed: 0"-style column.
df.to_csv(file_name, index=False)