**Download and Load**

In [None]:

!pip install kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d bittlingmayer/amazonreviews


Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to /content
 99% 490M/493M [00:24<00:00, 26.3MB/s]
100% 493M/493M [00:24<00:00, 20.8MB/s]


In [None]:
# Unpack the downloaded archive into the current directory
# (it contains test.ft.txt.bz2 and train.ft.txt.bz2)
!unzip amazonreviews.zip

Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [None]:
!mkdir dataset
!bunzip2 -c /content/test.ft.txt.bz2 > /content/dataset/test.ft.txt
!bunzip2 -c /content/train.ft.txt.bz2 > /content/dataset/train.ft.txt

In [None]:
!rm test.ft.txt.bz2
!rm train.ft.txt.bz2
!rm amazonreviews.zip

In [None]:
import gensim.downloader as api

In [None]:
# Print one line per model in the gensim-data catalogue:
# name, record count (-1 when the catalogue has none) and the first 40 characters of its description.
info = api.info()
for model_name in sorted(info['models']):
    model_info = info['models'][model_name]
    record_count = model_info.get('num_records', -1)
    short_description = model_info['description'][:40] + "..."
    print("%s (%d records): %s" % (model_name, record_count, short_description))

__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors ...
conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state...
fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipe...
glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets,...
glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2...
word2vec-google-news-300 (3000000 records): Pre-trai

In [None]:
# Load the 'glove-twitter-50' pretrained vectors from the gensim catalogue listed above.
# The Dataset class defined later uses this object as its default word-vector model.
glove_model = api.load('glove-twitter-50')



In [None]:
import re
import nltk
# NLTK resources needed by the preprocessing below: sentence/word tokenizer models,
# WordNet for the lemmatizer, and the English stopword list.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of 'punkt';
# without it sent_tokenize/word_tokenize raise LookupError. On older releases this
# call just reports the package as unknown and returns False.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
import string
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

**Load and preprocess data**

In [None]:
# Plain-text train/test files written by the bunzip2 cell above.
# Despite the `_dir` suffix these are file paths, not directories.
train_dir = '/content/dataset/train.ft.txt'
test_dir = '/content/dataset/test.ft.txt'

In [None]:
class TextDataGenerator:
    """Iterate over a labelled text file, producing one ``(label, text)`` tuple per record.

    Each record is expected on its own line in the form ``__label__<digits> <text>``.
    The label ``2`` is mapped to ``1``; every other label is mapped to ``0``.
    Lines that do not match this form (blank or malformed lines) are skipped.

    When the end of the file is reached the file is rewound before ``StopIteration``
    is raised, so the same object can be iterated again from the start.

    Parameters
    ----------
    data_dir : str
        Path of the text file to read (opened as UTF-8).
    """

    def __init__(self, data_dir):
        self.path = data_dir
        self.pattern = r'^(__label__\d+)\s+(.*)$'
        # The handle stays open for the lifetime of the object; call close() when done.
        self.file = open(self.path, mode='r', encoding='utf-8')

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(label, text)`` tuple, skipping lines that do not match.

        Raises
        ------
        StopIteration
            When the file is exhausted (after rewinding it to the beginning).
        """
        while True:
            line = self.file.readline()
            if not line:
                self.file.seek(0)  # upon exhausting the file, start from the beginning again
                raise StopIteration

            match = re.match(self.pattern, line)
            if match is None:
                # Previously this fell through to `return` with `label`/`text` unbound.
                continue

            raw_label = match.group(1).split('_')[-1]  # strip the "__label__" prefix
            label = 1 if raw_label == '2' else 0  # convert label to 0 or 1
            text = match.group(2).strip()  # remove leading/trailing whitespace
            return (label, text)

    def close(self):
        """Close the underlying file handle; safe to call more than once."""
        handle = getattr(self, 'file', None)
        if handle and not handle.closed:
            handle.close()

    def __del__(self):
        # Best-effort cleanup if the caller never called close().
        if hasattr(self, 'file') and self.file:
            self.file.close()


In [None]:
class Dataset:
    """Batch iterator that turns labelled text records into one embedding vector each.

    Every call to ``next()`` pulls up to ``batch`` ``(label, text)`` records from a
    ``TextDataGenerator``, lower-cases the text, tokenizes it with NLTK, drops English
    stopwords and punctuation tokens, lemmatizes the remaining tokens and pools their
    word vectors into a single vector per record.

    Parameters
    ----------
    dir : str
        Path of the labelled text file handed to ``TextDataGenerator``.
    dirname : str
        Free-form name of the split (e.g. 'train'); stored in ``self.name`` only.
    batch : int
        Maximum number of records returned per call to ``next()``.
    wv : word-vector model
        Must support ``word in wv``, ``wv[word]`` and ``wv.vector_size``.
        The default is bound to the module-level ``glove_model`` at class-definition
        time, so the cell that loads it has to run before this one.
    ops : str
        Pooling applied over a record's word vectors: 'sum' or 'mean'.
    """

    def __init__(self, dir, dirname, batch = 64, wv = glove_model, ops =  'sum'):
        self.path = dir
        self.name = dirname
        self.text_gen = TextDataGenerator(self.path)
        self.nltk_stopwords = set(stopwords.words('english'))
        self.nltk_punctuation = set(string.punctuation)
        self.batch_size = batch
        # Not used by this class; kept because it is a public attribute.
        self.df = pd.DataFrame(columns=['text', 'label'])
        self.lemmatizer = WordNetLemmatizer()
        self.word_vectors = wv
        self.op = ops
        # Set after a short final batch so the following next() call ends the epoch.
        self._epoch_done = False


    def __iter__(self):
        return self


    def __get_embeddings__(self, words ) :
        """Pool the word vectors of ``words`` into one vector of length ``vector_size``.

        Words missing from the model contribute nothing to the sum, but under 'mean'
        they still count in the denominator (equivalent to adding a zero vector).
        An empty ``words`` list, or one with no known word, yields an all-zero vector
        so that every record has the same shape.

        Raises
        ------
        ValueError
            If ``self.op`` is neither 'sum' nor 'mean'.
        """
        if self.op not in ('sum', 'mean'):
            raise ValueError(f"Unsupported pooling op {self.op!r}; expected 'sum' or 'mean'")

        known_vectors = [self.word_vectors[word] for word in words if word in self.word_vectors]
        if not known_vectors:
            return np.zeros(self.word_vectors.vector_size)

        embedding_vector = np.sum(np.asarray(known_vectors), axis = 0)
        if self.op == 'mean':
            # len(words) > 0 here because at least one word was found in the model.
            embedding_vector = embedding_vector / len(words)

        return embedding_vector

    def __preprocess(self, sentence) :
        """Word-tokenize ``sentence``, drop stopwords and punctuation tokens, then lemmatize."""

        # remove stop words and punctuation
        words = nltk.word_tokenize(sentence)
        filtered_words = [
            word for word in words
            if word not in self.nltk_stopwords and word not in self.nltk_punctuation
        ]

        # apply lemmatization
        return [self.lemmatizer.lemmatize(word) for word in filtered_words]

    def __next__(self):
        """Return ``(embeddings, labels)`` for the next batch of up to ``batch_size`` records.

        ``embeddings`` is a list of 1-D arrays (one per record) and ``labels`` the
        matching list of 0/1 labels. If the underlying file ends part-way through a
        batch, the records read so far are returned as a shorter batch and the
        following call raises ``StopIteration``. The call after that starts again
        from the beginning of the file, because the text generator rewinds itself.

        Raises
        ------
        StopIteration
            When no record is left in the current pass over the file.
        """
        if self._epoch_done:
            self._epoch_done = False
            raise StopIteration

        processed_data = []
        labels = []
        for _ in range(self.batch_size) :
            try:
                # Get a single item from the generator
                label, text = next(self.text_gen)
            except StopIteration:
                # Keep what was read so far instead of discarding the partial batch.
                self._epoch_done = bool(processed_data)
                break

            # Split into sentences, then collect the cleaned tokens of all sentences
            tokens = []
            for sentence in nltk.sent_tokenize(text.lower()):
                tokens.extend(self.__preprocess(sentence))

            # Pool once per record, after all sentences have been collected
            processed_data.append(self.__get_embeddings__(tokens))
            labels.append(label)

        if not processed_data:
            raise StopIteration

        return (processed_data, labels)



**SVM**

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [None]:
# Batch sizes double as sample sizes here: the SVM is fitted on a single batch of
# 5000 training records and scored on a single batch of 1000 test records.
svm_data_train = Dataset(train_dir, 'train', batch = 5000)
svm_data_test = Dataset(test_dir, 'test', batch = 1000)

In [None]:
# Draw one batch from each stream: (X, y) for fitting, (X_t, y_t) for evaluation
X, y = next(svm_data_train)
X_t, y_t = next(svm_data_test)

In [None]:
# Baseline SVM with hand-picked hyper-parameters; fit() returns the estimator itself
clf = SVC(kernel='rbf', C=100, gamma=0.001).fit(X, y)

# Mean accuracy on the test batch
print(clf.score(X_t, y_t))

0.731


In [None]:
# Candidate hyper-parameters for the SVM; n_iter=3 samples only three of these
# combinations, each evaluated with 3-fold cross-validation on the training batch.
params_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf','linear','poly']
}

# random_state pins which combinations are sampled so a re-run reproduces the result
random_search = RandomizedSearchCV(SVC(), params_grid, n_iter=3, cv=3, verbose = 2, random_state=42)
random_search = random_search.fit(X, y)

In [None]:
# Mean cross-validated score of the best sampled parameter combination
random_search.best_score_

In [None]:
# Score the best estimator found by the search on the test batch
best_clf = random_search.best_estimator_
best_clf.score(X_t, y_t)

Use incremental learning to fit a model on the dataset of 3 million+ examples without loading it all into memory.
Each batch consists of 5,000 samples, and the SGD classifier is fitted on 100 batches.

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Fresh streams for the incremental run: 5000-record training batches,
# and a 1000-record batch size for the test stream.
data_stream = Dataset(train_dir, 'train',batch=5000)
test_data = Dataset(test_dir, 'test', batch = 1000)

In [None]:
# First training batch, plus one test batch (X_batch, y_batch) for evaluation
X, y = next(data_stream)
X_batch, y_batch = next(test_data)

In [None]:
# Linear SVM trained by SGD (hinge loss) with L1 regularisation.
model = SGDClassifier(loss='hinge', penalty='l1', alpha=1e-7, random_state=42, max_iter=1000, tol=None)
# The first partial_fit call must declare every class that can ever appear.
# The labels are always 0/1, so state them explicitly instead of relying on the
# first batch happening to contain both.
model.partial_fit(X, y, classes=np.array([0, 1]))

print(f"Base model {model.score(X_batch, y_batch)}")

Base model 0.702


In [None]:
# Accuracy on the fixed test batch after each incremental update
accuracies = []

for idx in range(100) :

    # Pull the next training batch from the stream
    X, y = next(data_stream)

    batch_num = idx

    # Update the model with the new data batch. `classes` was declared on the first
    # partial_fit call; passing np.unique(y) again would raise ValueError whenever a
    # batch happens to contain only one of the two labels.
    model.partial_fit(X, y)

    # Evaluate the updated model on the fixed test batch
    y_pred_after = model.predict(X_batch)
    accuracy_after = accuracy_score(y_batch, y_pred_after)
    accuracies.append(accuracy_after)

    # Report every 10th batch
    if not batch_num % 10 :
        print(f" batch num : {batch_num + 1 } Accuracy : {accuracy_after:.4f}")


 batch num : 1 Accuracy : 0.5780
 batch num : 11 Accuracy : 0.7150
 batch num : 21 Accuracy : 0.7350
 batch num : 31 Accuracy : 0.7180
 batch num : 41 Accuracy : 0.7040
 batch num : 51 Accuracy : 0.6530
 batch num : 61 Accuracy : 0.7050
 batch num : 71 Accuracy : 0.5790
 batch num : 81 Accuracy : 0.7420
 batch num : 91 Accuracy : 0.7240
