<a href="https://colab.research.google.com/github/callezenwaka/machine-learning/blob/main/CNN_Fundamentals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

In [17]:
# import dependencies
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn

In [18]:
# notebook-wide seed; passed to set_seeds() below so every RNG starts from the same state
SEED = 1234

In [19]:
# define the set_seeds function
def set_seeds(seed=1234):
  """Seed every random number generator this notebook relies on.

  Args:
    seed: integer seed handed to NumPy, Python's `random`, and PyTorch
      (CPU generator plus the CUDA generators).
  """
  seeders = (
    np.random.seed,
    random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,  # multi-GPU
  )
  # apply the same seed to each library, in a fixed order
  for seed_fn in seeders:
    seed_fn(seed)

In [20]:
# seed every RNG with the notebook-wide SEED for reproducibility
set_seeds(SEED)

In [21]:
# choose the compute device: GPU when one is available and `cuda` is enabled, CPU otherwise
cuda = True
if torch.cuda.is_available() and cuda:
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# make new tensors default to float32 on the selected device
# NOTE(review): torch.set_default_tensor_type is deprecated in recent PyTorch
# releases — confirm the target version before migrating to
# torch.set_default_dtype / torch.set_default_device.
torch.set_default_tensor_type(
  "torch.cuda.FloatTensor" if device.type == "cuda" else "torch.FloatTensor"
)
print("device: ", device)

device:  cuda


# Load Data

In [22]:
# download the news-headline dataset, shuffle the rows and renumber the index
url = "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/news.csv"
df = (
  pd.read_csv(url, header=0)  # load
  .sample(frac=1)  # shuffle
  .reset_index(drop=True)
)
df.head()

Unnamed: 0,title,category
0,Sharon Accepts Plan to Reduce Gaza Army Operat...,World
1,Internet Key Battleground in Wildlife Crime Fight,Sci/Tech
2,July Durable Good Orders Rise 1.7 Percent,Business
3,Growing Signs of a Slowing on Wall Street,Business
4,The New Faces of Reality TV,World


In [23]:
# keep only the first 10,000 rows (the full set is too large to fit in colab's limited memory)
df = df[:10_000]
print(df.shape[0])

10000


# Preprocessing

First, we'll clean up our input data with operations such as lowercasing the text, removing stop (filler) words, and filtering with regular expressions.

In [24]:
# import dependencies
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [25]:
# download the stopword corpus, then build the stemmer and the English stopword list
nltk.download('stopwords')
porter = PorterStemmer()
STOPWORDS = stopwords.words('english')
print(STOPWORDS[:5])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we']


In [26]:
def preprocess(text, stopwords=STOPWORDS):
  """Clean a piece of text for modelling.

  Steps, in order: lowercase, remove stopwords, remove parenthesised spans,
  replace every run of non-alphanumeric characters with a single space, trim.

  Args:
    text: raw string to clean.
    stopwords: iterable of words to drop. Each word is matched literally and
      only as a whole word, against the lowercased text. Defaults to STOPWORDS.

  Returns:
    The cleaned, lowercased string with words separated by single spaces.
  """

  # lower
  text = text.lower()

  # remove stopwords; escape each one so regex metacharacters inside a stopword
  # are matched literally instead of corrupting (or breaking) the pattern
  escaped = [re.escape(word) for word in stopwords if word]
  if escaped:
    # guard: an empty alternation would match at every word boundary and
    # swallow the whitespace between words
    pattern = re.compile(r'\b(?:' + r'|'.join(escaped) + r')\b\s*')
    text = pattern.sub('', text)

  # remove text in parentheses
  text = re.sub(r'\([^)]*\)', '', text)

  # spacing and filters
  text = re.sub(r"([-;.,!?<=>])", r"\1 ", text) # add a space after listed punctuation
  text = re.sub('[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric chars
  text = re.sub(' +', ' ', text) # remove multiple spaces
  text = text.strip()

  return text

In [27]:
# try the cleaner on a sample headline
text = "Great week for the NYSE"
preprocess(text)

'great week nyse'

In [28]:
# clean every title on a copy of the frame, then show one before/after pair
preprocessed_df = df.copy()
preprocessed_df["title"] = preprocessed_df["title"].apply(preprocess)
print(f"{df.title.values[0]}\n\n{preprocessed_df.title.values[0]}")

Sharon Accepts Plan to Reduce Gaza Army Operation, Haaretz Says

sharon accepts plan reduce gaza army operation haaretz says


# Split Data

In [29]:
import collections
from sklearn.model_selection import train_test_split

In [30]:
# split proportions, as fractions of the dataset
TRAIN_SIZE = 0.7  # passed to train_val_test_split when the splits are created
VAL_SIZE = 0.15  # NOTE(review): not referenced in the visible code; the val/test split is a fixed 50/50 of the remainder
TEST_SIZE = 0.15  # NOTE(review): not referenced in the visible code either

In [31]:
def train_val_test_split(X, y, train_size):
  """Split a dataset into stratified train, validation and test subsets.

  Args:
    X: inputs to split.
    y: labels aligned with `X`; both splits are stratified on them.
    train_size: size of the training subset, forwarded unchanged to
      `train_test_split` as its `train_size` argument.

  Returns:
    Tuple `(X_train, X_val, X_test, y_train, y_val, y_test)`. Whatever is left
    after the training subset is divided evenly between validation and test.
  """
  # use the caller's `train_size` rather than the notebook-level TRAIN_SIZE constant
  X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size, stratify=y)
  # halve the held-out remainder into validation and test
  X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
  return X_train, X_val, X_test, y_train, y_val, y_test

In [33]:
# inputs are the cleaned titles, targets are the category labels
X, y = preprocessed_df["title"].values, preprocessed_df["category"].values

In [34]:
# build the train/val/test splits and report their shapes plus one example
(X_train, X_val, X_test,
 y_train, y_val, y_test) = train_val_test_split(X, y, TRAIN_SIZE)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print(f"Sample point: {X_train[0]} → {y_train[0]}")

X_train: (7000,), y_train: (7000,)
X_val: (1500,), y_val: (1500,)
X_test: (1500,), y_test: (1500,)
Sample point: lost flu paydays → Business


# Label Encoding

In [35]:
import itertools

In [None]:
class LabelEncoder(object):
  """Label encoder for tag labels."""

  def __init__(self, class_to_index={}):
    self.class_to_index = class_to_index
    self.index_to_class = {v: k for k, v in self.class_to_index.items()}
    self.class