In [1]:
import ast
import re
from datetime import datetime
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
def preprocess(text, stop_words=None):
    """Normalize raw text into a lowercase, stop-word-free string.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased and split on whitespace, and stop words are removed.

    Args:
        text (str): Raw input text.
        stop_words (iterable of str, optional): Words to drop. They are
            compared against the lowercased tokens, so they should be
            lowercase. Defaults to NLTK's English stop-word list (the NLTK
            ``stopwords`` corpus must be installed for the default to load).

    Returns:
        str: The remaining words joined by single spaces, with no leading or
        trailing whitespace; ``""`` when no words remain.

    Raises:
        TypeError: If ``text`` is not a string (raised by ``re.sub``).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once per call; rebuilding it for every word
    # reloads the whole stop-word list on each membership test.
    stop_set = set(stop_words)

    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    # join() avoids the stray leading space that manual concatenation leaves.
    return " ".join(token for token in tokens if token not in stop_set)

# Short samples preprocessing

In [3]:
# Load the raw short-text samples (one message per row) from CSV.
raw_short = pd.read_csv(filepath_or_buffer="data/raw/spam-text-message.csv")

In [4]:
# Add the placeholder output columns at positions 2 and 3, filled with the
# "Empty" sentinel. The existence check keeps this cell re-runnable:
# DataFrame.insert raises ValueError when the column already exists.
for position, column in ((2, "text"), (3, "label")):
    if column not in raw_short.columns:
        raw_short.insert(position, column, "Empty")

In [5]:
# Clean every message in one vectorized pass. Whole-column assignment replaces
# the chained `frame[col][row] = value` writes, which trigger
# SettingWithCopy warnings and do not update the frame at all under pandas
# Copy-on-Write. Missing messages are skipped (left as NaN) instead of
# crashing inside preprocess().
raw_short["text"] = raw_short["Message"].map(preprocess, na_action="ignore")

# Encode the category as an integer label: ham -> 0, spam -> 1.
# Any other category becomes a missing value (rather than keeping the "Empty"
# string sentinel) so that dropna() can detect it; the nullable Int64 dtype
# keeps the labels integral even when some are missing.
raw_short["label"] = (
    raw_short["Category"].map({"ham": 0, "spam": 1}).astype("Int64")
)

In [15]:
# Keep only the model-ready columns and drop incomplete rows.
# dropna() returns a new frame, so its result must be assigned; calling it
# bare only displays the filtered frame and leaves the saved data unfiltered.
processed_short = raw_short[["text", "label"]].dropna()
processed_short

Unnamed: 0,text,label
0,go jurong point crazy available bugis n great...,0
1,ok lar joking wif u oni,0
2,free entry wkly comp win fa cup final tkts st...,1
3,u dun say early hor u c already say,0
4,nah think goes usf lives around though,0
...,...,...
5567,nd time tried contact u u pound prize claim e...,1
5568,b going esplanade fr home,0
5569,pity mood suggestions,0
5570,guy bitching acted like interested buying som...,0


In [16]:
# Write the processed short samples without the index column. The output
# directory is created on demand because to_csv fails when it does not exist.
short_out_path = Path("data/processed/processed_short.csv")
short_out_path.parent.mkdir(parents=True, exist_ok=True)
processed_short.to_csv(short_out_path, index=False)

# Medium samples preprocessing

In [3]:
# Load the raw medium-length samples (one review per row) from CSV.
raw_medium = pd.read_csv(filepath_or_buffer="data/raw/IMDB Dataset.csv")

In [4]:
# Add the placeholder output columns at positions 2 and 3, filled with the
# "Empty" sentinel. The existence check keeps this cell re-runnable:
# DataFrame.insert raises ValueError when the column already exists.
for position, column in ((2, "text"), (3, "label")):
    if column not in raw_medium.columns:
        raw_medium.insert(position, column, "Empty")

In [8]:
# Clean every review in one vectorized pass. Whole-column assignment replaces
# the chained `frame[col][row] = value` writes, which trigger
# SettingWithCopy warnings and do not update the frame at all under pandas
# Copy-on-Write. Missing reviews are skipped (left as NaN) instead of
# crashing inside preprocess().
raw_medium["text"] = raw_medium["review"].map(preprocess, na_action="ignore")

# Encode the sentiment as an integer label: negative -> 0, positive -> 1.
# Any other value becomes a missing value (rather than keeping the "Empty"
# string sentinel) so that dropna() can detect it; the nullable Int64 dtype
# keeps the labels integral even when some are missing.
raw_medium["label"] = (
    raw_medium["sentiment"].map({"negative": 0, "positive": 1}).astype("Int64")
)

In [10]:
# Keep only the model-ready columns and drop incomplete rows.
# dropna() returns a new frame, so its result must be assigned; calling it
# bare only displays the filtered frame and leaves the saved data unfiltered.
processed_medium = raw_medium[["text", "label"]].dropna()
processed_medium

Unnamed: 0,text,label
0,one reviewers mentioned watching oz episode h...,1
1,wonderful little production br br filming tec...,1
2,thought wonderful way spend time hot summer w...,1
3,basically family little boy jake thinks zombi...,0
4,petter mattei love time money visually stunni...,1
...,...,...
49995,thought movie right good job creative origina...,1
49996,bad plot bad dialogue bad acting idiotic dire...,0
49997,catholic taught parochial elementary schools ...,0
49998,going disagree previous comment side maltin o...,0


In [11]:
# Write the processed medium samples without the index column. The output
# directory is created on demand because to_csv fails when it does not exist.
medium_out_path = Path("data/processed/processed_medium.csv")
medium_out_path.parent.mkdir(parents=True, exist_ok=True)
processed_medium.to_csv(medium_out_path, index=False)

# Long samples preprocessing

In [3]:
# (category sub-folder, number of files, label) for each class of long samples.
# Files are named 001.txt, 002.txt, ... inside data/raw/bbc/<category>/.
# NOTE(review): the original cell assigned label 1 to BOTH categories, which
# yields a single-class dataset. Tech is set to 0 here, mirroring the 0/1
# encoding of the other sections -- confirm the intended encoding.
bbc_categories = [
    ("sport", 511, 1),
    ("tech", 401, 0),
]

rows_list = []
for category, file_count, label in bbc_categories:
    category_dir = Path("data/raw/bbc") / category
    for file_no in range(1, file_count + 1):
        txt_path = category_dir / (str(file_no).zfill(3) + ".txt")
        # Decode explicitly so the result does not depend on the platform's
        # default encoding. Undecodable bytes are replaced instead of raising;
        # preprocess() turns every non-ASCII-letter character into a space
        # anyway, so the replacement does not affect the cleaned text.
        txt = txt_path.read_text(encoding="utf-8", errors="replace")
        rows_list.append({"text": preprocess(txt), "label": label})

# Build the frame once from the collected records.
processed_long = pd.DataFrame(rows_list, columns=["text", "label"])

In [5]:
# Write the processed long samples without the index column. The output
# directory is created on demand because to_csv fails when it does not exist.
long_out_path = Path("data/processed/processed_long.csv")
long_out_path.parent.mkdir(parents=True, exist_ok=True)
processed_long.to_csv(long_out_path, index=False)