## Getting started

#### Standard imports and installations

In [None]:
!pip3 install hub pandas numpy tqdm sklearn

#### To create datasets on Hub or download datasets from it, please create an account at:  
#### https://app.activeloop.ai/

In [None]:
# Log in with the username and password you registered with on Hub.
# Replace the <username> and <password> placeholders before running, and do not
# commit or share this notebook with real credentials filled in.
!hub login -u <username> -p <password>

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import hub
from hub.schema import Text, ClassLabel
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz --quiet
!tar -xf aclImdb_v1.tar.gz

Reading one sample review

In [None]:
filename = "aclImdb/train/pos/0_9.txt"
# The `with` block closes the file on exit, so no explicit close() is needed.
# Decode explicitly as UTF-8 instead of relying on the platform default encoding
# (assumes the corpus files are UTF-8 encoded).
with open(filename, "r", encoding="utf-8") as fin:
    line = fin.readline()  # first line of the sample review file

In [None]:
line

#### Collecting all filenames for processing

In [None]:
# Empty frame that the following cells fill with one row per review.
reviews_df = pd.DataFrame(columns=["Review", "Label"])

# One list of file names per directory visited by os.walk; the first entry
# belongs to the top-level directory itself.
file_names = [
    walked_files for _walked_root, _walked_dirs, walked_files in os.walk("aclImdb/train/pos")
]

#### Appending all positive reviews to the DataFrame

In [None]:
root_dir = "aclImdb/train/pos/"

# Gather every review as a plain record and build the frame once.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call, which made the row-by-row version quadratic.
positive_records = []
for review_file in file_names[0]:
    # `with` closes the handle; explicit UTF-8 avoids platform-default decoding.
    with open(os.path.join(root_dir, review_file), "r", encoding="utf-8") as fin:
        positive_records.append({"Review": fin.readline(), "Label": 1})

positive_df = pd.DataFrame(positive_records, columns=["Review", "Label"])
# Skip the concat while reviews_df is still empty so pandas does not have to
# reconcile dtypes with an empty frame.
if reviews_df.empty:
    reviews_df = positive_df
else:
    reviews_df = pd.concat([reviews_df, positive_df], ignore_index=True)

count = len(positive_records)  # number of positive reviews added by this cell

In [None]:
reviews_df

#### Appending all negative reviews to the DataFrame

In [None]:
# Rebuild file_names for the negative reviews: one list of file names per
# directory visited by os.walk, top-level directory first.
file_names = [
    walked_files for _walked_root, _walked_dirs, walked_files in os.walk("aclImdb/train/neg")
]

In [None]:
root_dir = "aclImdb/train/neg/"

# Gather every review as a plain record and build the frame once.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call, which made the row-by-row version quadratic.
negative_records = []
for review_file in file_names[0]:
    # `with` closes the handle; explicit UTF-8 avoids platform-default decoding.
    with open(os.path.join(root_dir, review_file), "r", encoding="utf-8") as fin:
        negative_records.append({"Review": fin.readline(), "Label": 0})

negative_df = pd.DataFrame(negative_records, columns=["Review", "Label"])
# Skip the concat while reviews_df is still empty so pandas does not have to
# reconcile dtypes with an empty frame.
if reviews_df.empty:
    reviews_df = negative_df
else:
    reviews_df = pd.concat([reviews_df, negative_df], ignore_index=True)

count = len(negative_records)  # number of negative reviews added by this cell

In [None]:
# Character count of the longest review; stays 0 when there are no reviews.
max_length = max((len(review) for review in reviews_df["Review"]), default=0)

### Uploading the DataFrame to Hub

In [None]:
# Run this cell only once. After the dataset has been uploaded you can simply
# fetch it again with hub.Dataset(url).

# Replace url with your username and dataset name. For example, if your username
# is Akash and your dataset is FlipkartReviews, then
# url = "Akash/FlipkartReviews"
# You must be logged in to Hub before uploading (see the login cell near the top).

url = "<your username>/IMDB-MovieReviews"

# Run the following lines only when uploading *this* dataset for the first time;
# comment them out once the dataset exists on Hub.
my_schema = {
    "Review": Text(shape=(None,), max_shape=(max_length,)),
    "Label": ClassLabel(num_classes=2),
}

# Size the dataset from the DataFrame rather than a hard-coded 25000, so the
# dataset length and the number of collected reviews cannot disagree.
ds = hub.Dataset(url, shape=(len(reviews_df),), schema=my_schema)
for i in tqdm(range(len(ds))):
    # Positional access: row i of the frame goes to sample i of the dataset.
    ds["Review", i] = reviews_df["Review"].iloc[i]
    ds["Label", i] = reviews_df["Label"].iloc[i]

In [None]:
# Comment out the following line if you're uploading the dataset for the first time.
# Otherwise it binds `ds` to the dataset stored at `url`.
ds = hub.Dataset(url)

#### Flushing dataset to disk

In [None]:
# If you've gone ahead and uploaded your own dataset into Hub, run this command.
# This command saves all changes to the cloud. You can also view this dataset at
# https://app.activeloop.ai

ds.flush()

## Fetching data from Hub

In [None]:
# Show what kind of object `ds` is and the schema it carries.
print(type(ds))
print(ds.schema)

# Materialise the sample at index 4: its review text first, then its label.
for field in ("Review", "Label"):
    print(ds[field, 4].compute())

## Training a model with our dataset

In [None]:
import re


def preprocessor(text):
    """Clean one raw review for whitespace tokenisation.

    Steps:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P``.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (with the ``-`` nose removed), separated by
         spaces.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string. Emoticons, if any, follow the cleaned words.
    """
    # Raw strings keep the regex escapes (\), \(, \W) out of Python's own
    # string-escape processing, which warns about them in normal literals.
    text = re.sub(r"<[^>]*>", "", text)
    emoticons = re.findall(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    cleaned = re.sub(r"[\W]+", " ", text.lower())
    emoticon_text = " ".join(emoticons).replace("-", "")
    # When the cleaned text ends in a word character, add a separator so the
    # first emoticon is not glued onto the last word (e.g. "film:)").
    if emoticon_text and not cleaned.endswith(" "):
        cleaned += " "
    return cleaned + emoticon_text


preprocessor("This is a :) test :-( !")

In [None]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()


def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens


tokenizer("I find it fun to use Hub")

In [None]:
def tokenizer_stemmer(text):
    """Split ``text`` on whitespace and return each token passed through ``porter.stem``."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems


tokenizer_stemmer("Hub is extremely easy and efficient to use")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Pull every review out of the Hub dataset before vectorising.
review_texts = [sample["Review"].compute() for sample in ds]

# TF-IDF vectoriser wired to this notebook's own cleaning and stemming helpers.
tfidf = TfidfVectorizer(
    preprocessor=preprocessor,
    tokenizer=tokenizer_stemmer,
    lowercase=True,
    strip_accents=None,
    use_idf=True,
    smooth_idf=True,
    norm="l2",
)

X = tfidf.fit_transform(review_texts)  # Our training dataset
y = ds["Label"].compute()  # Training Labels

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

# Shuffle and hold out half of the samples for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, shuffle=True, random_state=1
)

# Logistic regression with 5-fold cross-validation, scored on accuracy.
clf = LogisticRegressionCV(
    cv=5,
    scoring="accuracy",
    max_iter=300,
    random_state=0,
    n_jobs=-1,
    verbose=3,
)
clf.fit(X_train, y_train)

In [None]:
# Report the fitted classifier's score on the held-out test split.
print(f"Accuracy: {clf.score(X_test, y_test)}")