In [14]:
!nvidia-smi


Sat Aug 24 09:24:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Download Data from Kaggle (Quora Insincere Comments Classification)

In [None]:
# Downloading Data from Kaggle

In [None]:
!kaggle competitions download -c quora-insincere-questions-classification


quora-insincere-questions-classification.zip: Skipping, found more recently modified local copy (use --force to force download)


# New Section

In [10]:
from google.colab import files
files.upload()  # Upload kaggle.json

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [11]:
!pwd

/content


In [12]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '/content'


In [13]:
!kaggle competitions download -c quora-insincere-questions-classification -f train.csv
!kaggle competitions download -c quora-insincere-questions-classification -f sample_submission.csv
!kaggle competitions download -c quora-insincere-questions-classification -f test.csv


Downloading train.csv.zip to /content
 75% 41.0M/54.9M [00:00<00:00, 52.1MB/s]
100% 54.9M/54.9M [00:00<00:00, 59.9MB/s]
Downloading sample_submission.csv.zip to /content
100% 4.09M/4.09M [00:00<00:00, 35.1MB/s]
100% 4.09M/4.09M [00:00<00:00, 35.0MB/s]
Downloading test.csv.zip to /content
 32% 5.00M/15.8M [00:00<00:00, 23.9MB/s]
100% 15.8M/15.8M [00:00<00:00, 60.4MB/s]


In [7]:
train_fname = './train.csv.zip'
test_fname = './test.csv.zip'
sub_fname = './sample_submission.csv.zip'

In [8]:
# from google.colab import drive
# drive.mount('/content/drive')

In [15]:
import pandas as pd

In [16]:
raw_df = pd.read_csv(train_fname)
test_df = pd.read_csv(test_fname)
sub_df = pd.read_csv(sub_fname)


In [17]:
raw_df.sample(10)

Unnamed: 0,qid,question_text,target
1279012,faa95274dd7640bab971,Have you ever been in Bursa?,0
318913,3e8004130e1351c811e4,"What does ""social camouflage"" mean in people w...",0
212670,2997b50ebdd7ee59f281,What is the cheapest and fastest way to get a ...,0
271432,3521538385a7da220560,Can a young person remain healthy if they stop...,0
1146615,e0a909c7d911b27a4498,Is it legal for a news channel to deny someone...,0
853974,a75198d4bb82d5f9e542,Is there a separate course of biology in first...,0
761830,9541e9e55b529657ba40,How hard is it to get a job at Apple? I'm 20 a...,0
100926,13c53d6401d5c772100c,What is the basic importing process for food i...,0
986458,c141144ea8106007c0d0,"Is Nedry's ""Ah ah ah you didn't say the magic ...",0
1278455,fa8d57a41009b86440b2,What is the best microcontroller to record EMG...,0


In [18]:
raw_df.shape


(1306122, 3)

In [19]:
SAMPLE_SIZE = 100_000
sample_df = raw_df.sample(SAMPLE_SIZE)

In [20]:
sample_df

Unnamed: 0,qid,question_text,target
406150,4f951d069cc696fb0ee0,What course shall I do after 12th BPC to becom...,0
928647,b5fe0cd17f2f3d4e08b4,If you cut something then why you get two part...,0
755387,94008cf131a30ff69123,"What are the different types of printers, and ...",0
410820,5082393762d65b35bd56,Was another 2002 genocide in waiting if Modi l...,1
1015852,c70f50e2468a0a42e443,What are the benefits and draw backs of sittin...,0
...,...,...,...
264003,33aa554804abedf33618,"If a beginner were to prepare for the IIO, wha...",0
13059,0292322af42ba9ebfb5a,I am doing my bachelors in English language an...,0
106718,14e6dc812351914e3807,What are the types of superchargers?,0
75225,0eba36b8fde5c2dc5393,"Is there a non-linear, continuous, differentia...",0


In [21]:
sample_df.target.value_counts(normalize=True) # Class imbalance in this problem

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
0,0.93814
1,0.06186


## Prepare Data for training

1.   Convert text to tf idf vectors
2.   Convert Vectors to pytorch tensors
3. Create Pytorch Data Loaders



In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# w = term freq * log(N/df)

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [23]:
vectorizer = TfidfVectorizer()

In [24]:
word_tokenize('this is , something new')

['this', 'is', ',', 'something', 'new']

In [25]:
stemmer = SnowballStemmer(language = 'english')

In [26]:
stemmer.stem('going') # sends the data to the root form

'go'

In [27]:
def tokenize(text):
  return [stemmer.stem(token) for token in word_tokenize(text)]

In [28]:
tokenize('This is an amazing amazing sofa I am sitting on ')

['this', 'is', 'an', 'amaz', 'amaz', 'sofa', 'i', 'am', 'sit', 'on']

In [29]:
english_stopwords = stopwords.words('english')

In [30]:
", ".join(english_stopwords)

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [31]:
vectorizer = TfidfVectorizer(tokenizer = tokenize , stop_words=english_stopwords, max_features = 1000)

In [32]:
%%time
vectorizer.fit(sample_df.question_text)



CPU times: user 29.6 s, sys: 112 ms, total: 29.7 s
Wall time: 34.2 s


In [33]:
vectorizer.get_feature_names_out()[:100]

array(['!', '$', '%', '&', "'", "''", "'m", "'s", '(', ')', ',', '-', '.',
       '1', '10', '100', '12', '12th', '15', '2', '20', '2017', '2018',
       '3', '4', '5', '6', '7', '8', ':', '?', '[', ']', '``', 'abl',
       'abroad', 'abus', 'accept', 'access', 'accomplish', 'accord',
       'account', 'achiev', 'acid', 'act', 'action', 'activ', 'actor',
       'actual', 'ad', 'add', 'address', 'admiss', 'adult', 'advanc',
       'advantag', 'advic', 'affect', 'africa', 'african', 'age', 'ago',
       'air', 'allow', 'almost', 'alon', 'alreadi', 'also', 'altern',
       'alway', 'amazon', 'america', 'american', 'amount', 'analysi',
       'android', 'ani', 'anim', 'anoth', 'answer', 'anxieti', 'anyon',
       'anyth', 'apart', 'app', 'appear', 'appl', 'appli', 'applic',
       'approach', 'arab', 'area', 'arm', 'armi', 'around', 'art',
       'asian', 'ask', 'associ', 'atheist'], dtype=object)

In [34]:
%%time
inputs = vectorizer.transform(sample_df.question_text)

CPU times: user 28 s, sys: 57.5 ms, total: 28 s
Wall time: 29.3 s


In [35]:
inputs.shape

(100000, 1000)

In [36]:
inputs[0].toarray()[:10]

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.56566962, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.07711897, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [37]:
test_inputs = vectorizer.transform(test_df.question_text)

In [38]:
targets = sample_df.target

# Split the Training and Validation Set

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs, targets, test_size = 0.3)

In [41]:
train_inputs.shape

(70000, 1000)

In [42]:
val_targets.shape

(30000,)

# Convert to Pytorch Tensors

In [43]:
import torch


In [44]:
train_input_tensors = torch.tensor(train_inputs.toarray()).float()
val_input_tensors = torch.tensor(val_inputs.toarray()).float()

In [45]:
train_target_tensors = torch.tensor(train_targets.values).float()
val_target_tensors = torch.tensor(val_targets.values).float()

In [46]:
# test_input_tensors = torch.tensor(test_inputs.values).float()

In [47]:
# CREATE A PYTORCH DATA LOADER
from torch.utils.data import TensorDataset, DataLoader


In [48]:
train_ds = TensorDataset(train_input_tensors, train_target_tensors)
val_ds = TensorDataset(val_input_tensors, val_target_tensors)
# test_ds = TensorDataset(test_input_tensors)


In [49]:
BATCH_SIZE = 128

In [50]:
train_dl = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = True)
val_dl = DataLoader(val_ds, batch_size = BATCH_SIZE)


In [51]:
for batch in train_dl:
  batch_inputs, batch_targets = batch
  print(batch_inputs.shape)
  print(batch_targets.shape)
  break

torch.Size([128, 1000])
torch.Size([128])


In [52]:
import torch.nn as nn
import torch.nn.functional as F

In [53]:
class QuoraNet(nn.Module):
  def __init__(self):
    super().__init__()
    self.layer1 = nn.Linear(1000, 512)
    self.layer2 = nn.Linear(512, 256)
    self.layer3 = nn.Linear(256, 128)
    self.layer4 = nn.Linear(128, 1)

  def forward(self, inputs):
    out = self.layer1(inputs)
    out = F.relu(out)
    out = self.layer2(out)
    out = F.relu(out)
    out = self.layer3(out)
    out = F.relu(out)
    out = self.layer4(out)
    out = F.sigmoid(out)
    return out

In [62]:
model = QuoraNet()

In [55]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [56]:
for batch in train_dl:
  bi, bt = batch
  print(bi.shape)
  print(bt.shape)
  bo = model(bi)
  print(bo.shape)

  # Convert outputs to probabilities
  probs = torch.sigmoid(bo[:, 0])
  print(probs.shape)

  # Convert proba to predictions
  preds = (probs > 0.5).int()
  print('predictions', preds[:10])
  print('targets', bt[:10])

  # Check the metrics
  print('accuracy', accuracy_score(bt, preds))
  print('f1', f1_score(bt, preds))
  print('precision', precision_score(bt, preds))
  print('recall', recall_score(bt, preds))
  break

torch.Size([128, 1000])
torch.Size([128])
torch.Size([128, 1])
torch.Size([128])
predictions tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.int32)
targets tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
accuracy 0.046875
f1 0.08955223880597014
precision 0.046875
recall 1.0


In [57]:
# Eval the model performance
def evaluate(model, dl):
  losses , accs , f1s = [], [], []
  for batch in dl:
    inputs, targets = batch

    # pass inputs to the model
    outputs = model(inputs)

    probs = torch.sigmoid(outputs[:,0])
    loss = F.binary_cross_entropy(probs, targets)

    preds = (probs > 0.5).int()

    acc = accuracy_score(targets, preds)
    f1 = f1_score(targets, preds)

    losses.append(loss.item())
    accs.append(acc)
    f1s.append(f1)

  return (torch.mean(torch.tensor(losses)),
          torch.mean(torch.tensor(accs)),
          torch.mean(torch.tensor(f1s)))

In [58]:
evaluate(model ,train_dl)

(tensor(0.9362),
 tensor(0.0618, dtype=torch.float64),
 tensor(0.1156, dtype=torch.float64))

In [59]:
evaluate(model, val_dl)

(tensor(0.9361),
 tensor(0.0619, dtype=torch.float64),
 tensor(0.1159, dtype=torch.float64))

In [60]:
# Train the model batch by batch
def fit(epochs, lr, model, train_dl, val_dl):
  history = []
  opt = torch.optim.Adam(model.parameters(), lr = lr, weight_decay = 1e-5)

  for epoch in range(epochs):
    for batch in train_dl:
      inputs, targets = batch
      outputs = model(inputs)
      probs = torch.sigmoid(outputs[:,0])
      loss = F.binary_cross_entropy(probs, targets, weight = torch.tensor([20]))
      loss.backward()
      opt.step()
      opt.zero_grad()
      loss, acc, f1 = evaluate(model, val_dl)
    print(f'Epoch: {epoch}, Loss: {loss}, Acc: {acc}, F1: {f1}')
    history.append((loss, acc, f1))
  return history

In [63]:
history = []

In [None]:
history += fit(1, 0.001, model , train_dl, val_dl)

In [72]:
history += fit(1, 0.001, model , train_dl, val_dl)

Epoch: 0, Loss: 0.6931471228599548, Acc: 0.5936059397163121, F1: 0.0953499858548162


In [None]:
losses = [item[0] for item in history]
acc = [item[1] for item in history]
f1 = [item[2] for item in history]



In [None]:
import matplotlib.pyplot as plt
plt.title('Losses')
plt.plot(losses)


In [61]:
plt.title('Accuracies')
plt.plot(acc)

NameError: name 'plt' is not defined

In [None]:
plt.title('F1 Scores')
plt.plot(f1)

In [None]:
def predict(df):
  inputs = vectorizer.transform(df.question_text)
  input_tensors = torch.tensor(inputs.toarray()).float()
  outputs = model(input_tensors)
  probs = torch.sigmoid(outputs[:,0])
  preds = (probs > 0.5).int()
  return preds

In [None]:
def predict_single_text(text):
  inputs = vectorizer.transform([text])
  input_tensors = torch.tensor(inputs.toarray()).float()
  outputs = model(input_tensors)
  probs = torch.sigmoid(outputs[:,0])
  preds = (probs > 0.5).int()
  return preds

In [None]:
prediction = predict_single_text('what is the meaning of life').detach().numpy()

In [None]:
def make_preds(dl):
  all_preds = []
  for batch in dl:
    inputs, targets = batch
    outputs = model(inputs)
    probs = torch.sigmoid(outputs[:,0])
    preds = (probs > 0.5).int()
    all_preds.append(list(preds.detach().numpy()))
    all_preds = np.concatenate(all_preds)
  return all_preds


In [None]:
test_preds = make_preds(test_dl)
sub_df.prediction = test_preds
sub_df.to_csv('submission.csv', index = False)