In [1]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the raw (not yet preprocessed) articles and preview the first rows.
raw_csv_path = r"C:\Users\abhis\Desktop\Celebal\Sourav\NLP Assignment\dataset\not_preprocessed_data.csv"
df = pd.read_csv(raw_csv_path)
df.head()


Unnamed: 0,article_body,category
0,"Long COVID community, which is an open and gro...",Partnership
1,Government test prep platform Adda247 on Octob...,Funding
2,Private equity and venture capital investments...,Merger/Acquisition
3,Digital book-keeping startup Khatabook said on...,Funding
4,Events are always important and exciting to or...,Research


In [3]:
# Print a summary of the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7068 entries, 0 to 7067
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_body  7068 non-null   object
 1   category      7068 non-null   object
dtypes: object(2)
memory usage: 110.6+ KB


In [4]:
# Number of articles per category, most frequent first.
df['category'].value_counts()


Partnership           1587
IPO                   1413
Merger/Acquisition     990
Finance                989
Conference News        892
Funding                728
Research               469
Name: category, dtype: int64

In [5]:
# Text preprocessing: replace every non-letter with a space, lower-case,
# drop English stopwords, lemmatize, then cap the length of the cleaned text
# (the capped text is later fed to an LSTM model).
#
# Requires the NLTK 'stopwords' and 'wordnet' corpora to be installed
# (nltk.download('stopwords'); nltk.download('wordnet')).

MAX_TEXT_CHARS = 2000  # upper bound on the cleaned text length, in characters

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _truncate_on_word_boundary(text, limit=MAX_TEXT_CHARS):
    """Return `text` cut to at most `limit` characters without splitting a word.

    A plain `text[:limit]` slice can end in the middle of a word and leave a
    meaningless fragment as the last token; here that fragment is dropped.
    A single word longer than `limit` is still hard-cut at `limit`.
    """
    if len(text) <= limit:
        return text
    clipped = text[:limit]
    # A non-space right after the cut means the slice ended inside a word.
    if text[limit] != ' ':
        clipped = clipped.rsplit(' ', 1)[0]
    return clipped.rstrip()


def _clean_text(raw):
    """Clean one article body and return the space-joined lemmas, length-capped."""
    # Keep letters only (punctuation, digits and symbols become spaces), then lower-case.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw).lower()
    # Stopwords are filtered before lemmatization, on the lower-cased tokens.
    kept_words = [word for word in letters_only.split() if word not in stop_words]
    lemmas = [lemmatizer.lemmatize(word) for word in kept_words]
    return _truncate_on_word_boundary(' '.join(lemmas))


# Single pass over the column instead of one pass per cleaning step.
df['text'] = df['article_body'].apply(_clean_text)
df.head()

Unnamed: 0,article_body,category,text
0,"Long COVID community, which is an open and gro...",Partnership,long covid community open growing community pe...
1,Government test prep platform Adda247 on Octob...,Funding,government test prep platform adda october sai...
2,Private equity and venture capital investments...,Merger/Acquisition,private equity venture capital investment diwa...
3,Digital book-keeping startup Khatabook said on...,Funding,digital book keeping startup khatabook said au...
4,Events are always important and exciting to or...,Research,event always important exciting organize kind ...


In [6]:
# Character length of every cleaned text, computed once.
text_lengths = df['text'].map(len)

# shortest text
print(text_lengths.min())

# longest text
print(text_lengths.max())


55
2000


In [7]:
# Reduce class imbalance by keeping at most MAX_PER_CATEGORY rows per category.
# Categories with fewer rows keep all of them, so the result is only
# approximately balanced. head() takes the first rows of each group in their
# existing order; it does not sample randomly.
MAX_PER_CATEGORY = 600

# .copy() detaches the subset from the original frame, so assigning to its
# columns later does not trigger pandas' SettingWithCopyWarning.
df = df.groupby('category').head(MAX_PER_CATEGORY).copy()
df['category'].value_counts()

Partnership           600
Funding               600
Merger/Acquisition    600
Conference News       600
Finance               600
IPO                   600
Research              469
Name: category, dtype: int64

In [8]:
# Count missing values per column.
df.isna().sum()

article_body    0
category        0
text            0
dtype: int64

In [9]:
# Encode the category labels as integer class ids.
category_to_id = {'IPO': 0, 'Partnership': 1, 'Merger/Acquisition': 2,
                  'Finance': 3, 'Conference News': 4, 'Funding': 5, "Research": 6}

# Series.map turns every value that is not a key of the dict into NaN, so check
# first and fail loudly. This also catches re-running the cell after the labels
# have already been replaced by their ids.
unmapped = set(df['category'].unique()) - set(category_to_id)
if unmapped:
    raise ValueError(
        "No id defined for category value(s): {}".format(sorted(unmapped, key=str)))

# assign() returns a new frame instead of writing into a frame that may be a
# filtered subset of another one (which would raise SettingWithCopyWarning).
df = df.assign(category=df['category'].map(category_to_id))
df.head()


Unnamed: 0,article_body,category,text
0,"Long COVID community, which is an open and gro...",1,long covid community open growing community pe...
1,Government test prep platform Adda247 on Octob...,5,government test prep platform adda october sai...
2,Private equity and venture capital investments...,2,private equity venture capital investment diwa...
3,Digital book-keeping startup Khatabook said on...,5,digital book keeping startup khatabook said au...
4,Events are always important and exciting to or...,6,event always important exciting organize kind ...


In [10]:
# Keep only the model inputs: the cleaned text and its category.
preprocessed_csv_path = r"C:\Users\abhis\Desktop\Celebal\Sourav\NLP Assignment\dataset\final_preprocessed_data.csv"
new_df = df[['text', 'category']]

# Write the result without the index column.
new_df.to_csv(preprocessed_csv_path, index=False)

# Read the file back and preview what was stored.
new_df = pd.read_csv(preprocessed_csv_path)
new_df.head()


Unnamed: 0,text,category
0,long covid community open growing community pe...,1
1,government test prep platform adda october sai...,5
2,private equity venture capital investment diwa...,2
3,digital book keeping startup khatabook said au...,5
4,event always important exciting organize kind ...,6
