In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('complete_data.csv')
df.head()

Unnamed: 0,article_body,category
0,"Long COVID community, which is an open and gro...",Partnership
1,Government test prep platform Adda247 on Octob...,Funding
2,Private equity and venture capital investments...,Merger/Acquisition
3,Digital book-keeping startup Khatabook said on...,Funding
4,Events are always important and exciting to or...,Research


In [2]:
# check if data is balance or unbalanced
counts = df['category'].value_counts()
print(counts)

Partnership           1587
IPO                   1413
Merger/Acquisition     990
Finance                989
Conference News        892
Funding                728
Research               469
Name: category, dtype: int64


In [3]:
# balance data
df = df.groupby('category').head(700)
df.category.value_counts()

Partnership           700
Funding               700
Merger/Acquisition    700
Conference News       700
Finance               700
IPO                   700
Research              469
Name: category, dtype: int64

In [4]:
# droping research category because of less data points
df = df.drop(df[df['category'] == 'Research'].index)


In [5]:
df.category.value_counts()

Partnership           700
Funding               700
Merger/Acquisition    700
Conference News       700
Finance               700
IPO                   700
Name: category, dtype: int64

In [6]:
# use map function to map the category to numerical values
df['category'] = df['category'].map({'Funding': 0, 'Partnership': 1, 'Merger/Acquisition': 2, 'Finance': 3, 'Conference News': 4, 'IPO': 5, "Research": 6})
df.head()

Unnamed: 0,article_body,category
0,"Long COVID community, which is an open and gro...",1
1,Government test prep platform Adda247 on Octob...,0
2,Private equity and venture capital investments...,2
3,Digital book-keeping startup Khatabook said on...,0
5,it easier for everyone to experience the world...,1


In [7]:
#tokenization

import re
import spacy
import string
nlp = spacy.load("en_core_web_sm")

def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    # remove punctuation
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]') 
    # convert to lower case
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in nlp.tokenizer(nopunct)]



In [8]:
#count number of occurences of each word
from collections import Counter
counts = Counter()
for index, row in df.iterrows():
    counts.update(tokenize(row['article_body']))

In [10]:
#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 32488
num_words after: 19504


In [11]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [12]:
# To encode word into index using vocab2index
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [13]:
df['encoded'] = df['article_body'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df.head()

  df['encoded'] = df['article_body'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))


Unnamed: 0,article_body,category,encoded
0,"Long COVID community, which is an open and gro...",1,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 4, 12, 13, 1..."
1,Government test prep platform Adda247 on Octob...,0,"[[37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, ..."
2,Private equity and venture capital investments...,2,"[[283, 284, 10, 285, 54, 286, 287, 18, 288, 28..."
3,Digital book-keeping startup Khatabook said on...,0,"[[309, 310, 311, 141, 312, 45, 42, 313, 314, 1..."
5,it easier for everyone to experience the world...,1,"[[46, 395, 12, 396, 100, 397, 18, 398, 5, 18, ..."


In [14]:
df.to_csv('processed_data.csv', index=False)
df.head()

Unnamed: 0,article_body,category,encoded
0,"Long COVID community, which is an open and gro...",1,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 4, 12, 13, 1..."
1,Government test prep platform Adda247 on Octob...,0,"[[37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, ..."
2,Private equity and venture capital investments...,2,"[[283, 284, 10, 285, 54, 286, 287, 18, 288, 28..."
3,Digital book-keeping startup Khatabook said on...,0,"[[309, 310, 311, 141, 312, 45, 42, 313, 314, 1..."
5,it easier for everyone to experience the world...,1,"[[46, 395, 12, 396, 100, 397, 18, 398, 5, 18, ..."
