In [1]:
import pandas as pd
import numpy as np


In [2]:
import glob
# merge multiple csv files into one

def merge_csv_files(input_pattern='dataset/news_categories/*.csv',
                    output_path='dataset/final_dataset.csv'):
    """Concatenate every CSV file matching ``input_pattern`` into one CSV file.

    Parameters
    ----------
    input_pattern : str
        Glob pattern selecting the CSV files to merge.
    output_path : str
        Path of the merged CSV; it is written without the index column.

    Raises
    ------
    ValueError
        If no file matches ``input_pattern``.
    """
    # glob.glob returns matches in arbitrary, OS-dependent order; sorting makes
    # the row order of the merged file identical on every run and platform
    csv_files = sorted(glob.glob(input_pattern))
    print('Found {} csv files'.format(len(csv_files)))
    print(csv_files)
    if not csv_files:
        # fail with a clear message instead of pd.concat's "No objects to concatenate"
        raise ValueError('No csv files found matching {!r}'.format(input_pattern))
    # read all csv files
    csv_data = [pd.read_csv(file) for file in csv_files]
    # merge all csv files into one, renumbering the rows 0..n-1
    merged_csv = pd.concat(csv_data, ignore_index=True)
    # save merged csv file
    merged_csv.to_csv(output_path, index=False)
    print('Merged csv file saved successfully!')


# build dataset/final_dataset.csv from the per-category csv files
merge_csv_files()


Found 8 csv files
['dataset/news_categories\\conference_news.csv', 'dataset/news_categories\\finance_news.csv', 'dataset/news_categories\\funding_news.csv', 'dataset/news_categories\\ipo_news.csv', 'dataset/news_categories\\lawsuit_news.csv', 'dataset/news_categories\\merger_news.csv', 'dataset/news_categories\\partnership_news.csv', 'dataset/news_categories\\research_news.csv']
Merged csv file saved successfully!


In [3]:
# load the merged dataset written by merge_csv_files() and preview the first rows
df = pd.read_csv('dataset/final_dataset.csv')
df.head()

Unnamed: 0,title,date,article_body,link,category,summary,company_name,funds
0,Relyance AI Selected as Top 10 Cybersecurity F...,03/28/2023 - 01:37 PM,"SAN FRANCISCO--(BUSINESS WIRE)--Relyance AI, a...",https://www.businesswire.com/news/home/2023032...,Conference News,"SAN FRANCISCO--(BUSINESS WIRE)--Relyance AI, a...","['AI', 'the RSA Conference']",[]
1,CORRECTING and REPLACING Enterprise Connect 20...,03/28/2023 - 01:31 PM,Bandwidth and Theta Lake Tie for Overall Best ...,https://www.businesswire.com/news/home/2023032...,Conference News,Bandwidth and Theta Lake Tie for Overall Best ...,[],[]
2,Henry Schein Announces Presence at the 2023 As...,03/28/2023 - 01:00 PM,"MELVILLE, N.Y.--(BUSINESS WIRE)--Henry Schein,...",https://www.businesswire.com/news/home/2023032...,Conference News,"MELVILLE, N.Y.--(BUSINESS WIRE)--Henry Schein,...","['MELVILLE', 'N.Y.--(BUSINESS WIRE)--Henry Sch...",[]
3,Multinationals consider the CIIe as a springbo...,03/28/2023 - 12:49 PM,An exhibitor (right) presents products to visi...,https://www.businesswire.com/news/home/2023032...,Conference News,An exhibitor (right) presents products to visi...,[],[]
4,"J-Squared Unveils the FALC-20, their AI-Infere...",03/28/2023 - 12:00 PM,TORONTO--(BUSINESS WIRE)--J-Squared and Blaize...,https://www.businesswire.com/news/home/2023032...,Conference News,TORONTO--(BUSINESS WIRE)--J-Squared and Blaize...,"['FALC', 'ISCwest']",[]


In [4]:
# column dtypes and non-null counts of the merged dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8548 entries, 0 to 8547
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         8548 non-null   object
 1   date          8548 non-null   object
 2   article_body  8527 non-null   object
 3   link          8548 non-null   object
 4   category      8548 non-null   object
 5   summary       8524 non-null   object
 6   company_name  8548 non-null   object
 7   funds         8527 non-null   object
dtypes: object(8)
memory usage: 534.4+ KB


In [5]:
# number of articles per news category
df['category'].value_counts()

Partnership           1943
IPO                   1450
Conference News       1186
Finance                990
Merger/Acquisition     990
Funding                750
Lawsuit News           722
Research               517
Name: category, dtype: int64

In [6]:
# count the missing values in each column
df.isna().sum()

title            0
date             0
article_body    21
link             0
category         0
summary         24
company_name     0
funds           21
dtype: int64

In [7]:
# remove every row that has at least one missing value, then confirm none remain
df = df.dropna()
df.isna().sum()

title           0
date            0
article_body    0
link            0
category        0
summary         0
company_name    0
funds           0
dtype: int64

In [8]:
# check for duplicates -- report the count explicitly, because a bare expression
# that is not the last line of a cell is evaluated and silently discarded
duplicate_count = df.duplicated().sum()
print('Found {} duplicate rows'.format(duplicate_count))
# drop duplicates (reassign instead of mutating the frame in place)
df = df.drop_duplicates()

In [9]:
# re-check row count and non-null counts after dropping missing values and duplicates
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8490 entries, 0 to 8547
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         8490 non-null   object
 1   date          8490 non-null   object
 2   article_body  8490 non-null   object
 3   link          8490 non-null   object
 4   category      8490 non-null   object
 5   summary       8490 non-null   object
 6   company_name  8490 non-null   object
 7   funds         8490 non-null   object
dtypes: object(8)
memory usage: 597.0+ KB


In [10]:
# retain just the article text and its category label
df = df.loc[:, ['article_body', 'category']]
df.head()

Unnamed: 0,article_body,category
0,"SAN FRANCISCO--(BUSINESS WIRE)--Relyance AI, a...",Conference News
1,Bandwidth and Theta Lake Tie for Overall Best ...,Conference News
2,"MELVILLE, N.Y.--(BUSINESS WIRE)--Henry Schein,...",Conference News
3,An exhibitor (right) presents products to visi...,Conference News
4,TORONTO--(BUSINESS WIRE)--J-Squared and Blaize...,Conference News


In [11]:
# randomize the dataset
# A fixed random_state makes the shuffle reproducible. The later per-category
# groupby(...).head(600) selection depends on this row order, so without a seed
# every run exports a different dataset.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,article_body,category
0,"Long COVID community, which is an open and gro...",Partnership
1,Government test prep platform Adda247 on Octob...,Funding
2,Private equity and venture capital investments...,Merger/Acquisition
3,Digital book-keeping startup Khatabook said on...,Funding
4,Events are always important and exciting to or...,Research


In [12]:
# category distribution after dropping missing values and duplicates
df['category'].value_counts()

Partnership           1916
IPO                   1450
Conference News       1183
Merger/Acquisition     990
Finance                989
Funding                729
Lawsuit News           722
Research               511
Name: category, dtype: int64

In [190]:
# keep only the articles whose article_body has more than 20 whitespace-separated words
body_word_counts = df['article_body'].map(lambda body: len(body.split()))
df = df[body_word_counts > 20]

# save the not-yet-preprocessed text, leaving out the 'Lawsuit News' category
kt = df[df['category'] != 'Lawsuit News']
kt.to_csv('dataset/not_preprocessed_data.csv', index=False)

In [163]:
# perform text preprocessing on the dataset
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# fetch the NLTK resources used below: the stopword list and the WordNet data
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# one shared stemmer and one shared lemmatizer for all articles
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# English stopwords as a set, built once up front. Evaluating
# stopwords.words('english') inside the per-word filter rebuilt the list and
# scanned it linearly for every single token of every article.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
# matches any character that is not an ASCII letter
NON_ALPHABETIC = re.compile('[^a-zA-Z]')


# function to perform text preprocessing
def text_preprocessing(text):
    """Normalise one article body into a string of stemmed tokens.

    Steps, in order: replace every non-alphabetic character with a space,
    lowercase, split on whitespace, drop English stopwords, lemmatize each
    remaining word with ``lemmatizer`` and then stem it with ``stemmer``.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    words = NON_ALPHABETIC.sub(' ', text).lower().split()
    # lemmatize first, then stem: same per-word result as the two separate passes
    return ' '.join(
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in words
        if word not in ENGLISH_STOPWORDS
    )

# run every article body through text_preprocessing and preview the result
df = df.assign(article_body=df['article_body'].map(text_preprocessing))
df.head()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,article_body,category
0,reuter u intern trade commiss said tuesday wou...,IPO
1,driffl global digit game marketplac announc no...,Funding
2,unit state canada said friday would work toget...,IPO
3,boston busi wire strand therapeut programm mrn...,Conference News
4,million initi equiti commit invest prime logis...,Partnership


In [164]:
# save preprocessed dataset (index=False keeps the row index out of the file)
df.to_csv('dataset/preprocessed_final_dataset.csv', index=False)

In [165]:
# reload the preprocessed dataset from dataset/preprocessed_final_dataset.csv
df = pd.read_csv('dataset/preprocessed_final_dataset.csv')

df.head()

Unnamed: 0,article_body,category
0,reuter u intern trade commiss said tuesday wou...,IPO
1,driffl global digit game marketplac announc no...,Funding
2,unit state canada said friday would work toget...,IPO
3,boston busi wire strand therapeut programm mrn...,Conference News
4,million initi equiti commit invest prime logis...,Partnership


In [166]:
def _count_words(body):
    """Number of whitespace-separated tokens in ``body`` (coerced to str first)."""
    return len(str(body).split())

# word count of each article
df['word_count'] = df['article_body'].map(_count_words)
df.head()

Unnamed: 0,article_body,category,word_count
0,reuter u intern trade commiss said tuesday wou...,IPO,22
1,driffl global digit game marketplac announc no...,Funding,222
2,unit state canada said friday would work toget...,IPO,20
3,boston busi wire strand therapeut programm mrn...,Conference News,29
4,million initi equiti commit invest prime logis...,Partnership,24


In [169]:
# keep only the rows whose word count is greater than 20
enough_words = df['word_count'] > 20
kf = df.loc[enough_words]
kf.head()

Unnamed: 0,article_body,category,word_count
0,reuter u intern trade commiss said tuesday wou...,IPO,22
1,driffl global digit game marketplac announc no...,Funding,222
3,boston busi wire strand therapeut programm mrn...,Conference News,29
4,million initi equiti commit invest prime logis...,Partnership,24
7,busi wire indiain power address prime minist d...,Research,383


In [170]:
# category distribution after the word-count filter
kf['category'].value_counts()

Partnership           1368
IPO                   1217
Merger/Acquisition     990
Finance                989
Conference News        835
Funding                728
Lawsuit News           690
Research               444
Name: category, dtype: int64

In [171]:
# cap every category at its first 600 rows; a category with fewer rows keeps all of them
kt = kf.groupby('category').head(600)
kt.category.value_counts()

IPO                   600
Funding               600
Conference News       600
Partnership           600
Lawsuit News          600
Merger/Acquisition    600
Finance               600
Research              444
Name: category, dtype: int64

In [174]:
# preview the per-category capped dataset
kt.head()

Unnamed: 0,article_body,category
0,reuter u intern trade commiss said tuesday wou...,IPO
1,driffl global digit game marketplac announc no...,Funding
3,boston busi wire strand therapeut programm mrn...,Conference News
4,million initi equiti commit invest prime logis...,Partnership
7,busi wire indiain power address prime minist d...,Research


In [177]:

# Drop the helper column by reassignment instead of inplace=True: kt is a
# subset taken from kf, and errors='ignore' lets this cell be re-run without a
# KeyError once word_count is already gone.
kt = kt.drop(columns='word_count', errors='ignore')

# the exported dataset leaves out the 'Lawsuit News' category
kt = kt[kt.category != 'Lawsuit News']
kt.to_csv('dataset/600_data.csv', index=False)