In [1]:
# !pip install nltk

In [2]:
import pandas as pd 
import warnings 
warnings.filterwarnings(action='ignore')

In [3]:
df = pd.read_csv('datasets/IMDB Dataset.csv').sample(1000)
df.head()

Unnamed: 0,review,sentiment
32240,"As other reviewers have noted, this is an unju...",positive
8307,There were a lot of things going against this ...,positive
967,Life Begins - and ends - in a typical 1930's m...,positive
10868,"Love it, love it, love it! This is another abs...",positive
32183,"Well, the Hero and the Terror is slightly belo...",negative


### countVectorizer : bag of words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# create a corpus of sentences
corpus = [
 "hello, how are you?",
 "im getting bored at home. And you? What do you think?",
 "did you know about counts",
 "let's see if this works!",
 "YES!!!!"
]

In [6]:
cv = CountVectorizer()
cv.fit(corpus)

In [7]:
transfomed = cv.transform(corpus)
print(transfomed)

  (0, 2)	1
  (0, 9)	1
  (0, 11)	1
  (0, 22)	1
  (1, 1)	1
  (1, 3)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	1
  (1, 10)	1
  (1, 13)	1
  (1, 17)	1
  (1, 19)	1
  (1, 22)	2
  (2, 0)	1
  (2, 5)	1
  (2, 6)	1
  (2, 14)	1
  (2, 22)	1
  (3, 12)	1
  (3, 15)	1
  (3, 16)	1
  (3, 18)	1
  (3, 20)	1
  (4, 21)	1


In [8]:
print(cv.vocabulary_)

{'hello': 9, 'how': 11, 'are': 2, 'you': 22, 'im': 13, 'getting': 8, 'bored': 4, 'at': 3, 'home': 10, 'and': 1, 'what': 19, 'do': 7, 'think': 17, 'did': 6, 'know': 14, 'about': 0, 'counts': 5, 'let': 15, 'see': 16, 'if': 12, 'this': 18, 'works': 20, 'yes': 21}


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
corpus = [
 "hello, how are you?",
 "im getting bored at home. And you? What do you think?",
 "did you know about counts",
 "let's see if this works!",
 "YES!!!!"
]

In [10]:
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
ctv.fit(corpus)
corpus_transformed = ctv.transform(corpus)
print(ctv.vocabulary_)


{'hello': 14, ',': 2, 'how': 16, 'are': 7, 'you': 27, '?': 4, 'im': 18, 'getting': 13, 'bored': 9, 'at': 8, 'home': 15, '.': 3, 'and': 6, 'what': 24, 'do': 12, 'think': 22, 'did': 11, 'know': 19, 'about': 5, 'counts': 10, 'let': 20, "'s": 1, 'see': 21, 'if': 17, 'this': 23, 'works': 25, '!': 0, 'yes': 26}


# training the sentiment analysis 

In [11]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
df.head()

Unnamed: 0,review,sentiment
32240,"As other reviewers have noted, this is an unju...",positive
8307,There were a lot of things going against this ...,positive
967,Life Begins - and ends - in a typical 1930's m...,positive
10868,"Love it, love it, love it! This is another abs...",positive
32183,"Well, the Hero and the Terror is slightly belo...",negative


In [13]:
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x =='positive' else 0)  

In [14]:
df['kfold'] = -1
df = df.sample(frac=1).reset_index(drop=True)
y = df['sentiment'].values
kf = model_selection.StratifiedKFold(n_splits=5)

for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f

In [15]:
for fold_ in range(5):
    train_df = df[df['kfold'] != fold_].reset_index(drop=True)
    test_df = df[df['kfold'] == fold_].reset_index(drop=True)
    count_vec = CountVectorizer(token_pattern=None, tokenizer=word_tokenize) 
    count_vec.fit(train_df.review)

    x_train = count_vec.transform(train_df.review)
    x_test = count_vec.transform(test_df.review)

    model = linear_model.LogisticRegression()
    model.fit(x_train, train_df['sentiment']) 
    
    preds = model.predict(x_test) 
    acc = metrics.accuracy_score(test_df['sentiment'], preds) 

    print(f" {fold_} : {acc}")

 0 : 0.81
 1 : 0.76
 2 : 0.785
 3 : 0.77
 4 : 0.77


# NAiveBayes 

In [16]:
from sklearn import naive_bayes

In [17]:
for fold_ in range(5):
    train_df = df[df['kfold'] != fold_].reset_index(drop=True)
    test_df = df[df['kfold'] == fold_].reset_index(drop=True)
    
    count_vec = CountVectorizer(token_pattern=None, tokenizer=word_tokenize) 
    count_vec.fit(train_df.review)

    x_train = count_vec.transform(train_df.review)
    x_test = count_vec.transform(test_df.review)

    model = naive_bayes.MultinomialNB()
    model.fit(x_train, train_df['sentiment']) 
    
    preds = model.predict(x_test) 
    acc = metrics.accuracy_score(test_df['sentiment'], preds) 

    print(f" {fold_} : {acc}")

 0 : 0.805
 1 : 0.82
 2 : 0.75
 3 : 0.84
 4 : 0.775


### Tf-IDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
# fit the vectorizer on corpus
tfv.fit(corpus)
corpus_transformed = tfv.transform(corpus)


In [20]:
for fold_ in range(5):
    train_df = df[df['kfold'] != fold_].reset_index(drop=True)
    test_df = df[df['kfold'] == fold_].reset_index(drop=True)
    
    tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
    tfv.fit(train_df.review)

    x_train = count_vec.transform(train_df.review)
    x_test = count_vec.transform(test_df.review)

    model = naive_bayes.MultinomialNB()
    model.fit(x_train, train_df['sentiment']) 
    
    preds = model.predict(x_test) 
    acc = metrics.accuracy_score(test_df['sentiment'], preds) 

    print(f" {fold_} : {acc}")

 0 : 0.81
 1 : 0.815
 2 : 0.75
 3 : 0.83
 4 : 0.775


# N-gram 

In [21]:
from nltk import ngrams 

In [22]:
# let's see 3 grams
N = 3
# input sentence
sentence = "hi, how are you?"
# tokenized sentence
tokenized_sentence = word_tokenize(sentence)
# generate n_grams
n_grams = list(ngrams(tokenized_sentence, N))
print(n_grams)

[('hi', ',', 'how'), (',', 'how', 'are'), ('how', 'are', 'you'), ('are', 'you', '?')]


In [23]:

for fold_ in range(5):
    train_df = df[df['kfold'] != fold_].reset_index(drop=True)
    test_df = df[df['kfold'] == fold_].reset_index(drop=True)
    
    tfv = tfidf_vec = TfidfVectorizer(
                                        tokenizer=word_tokenize,
                                        token_pattern=None,
                                        ngram_range=(1, 3)
                                        )
    tfv.fit(train_df.review)

    x_train = count_vec.transform(train_df.review)
    x_test = count_vec.transform(test_df.review)

    model = naive_bayes.MultinomialNB()
    model.fit(x_train, train_df['sentiment']) 
    
    preds = model.predict(x_test) 
    acc = metrics.accuracy_score(test_df['sentiment'], preds) 

    print(f" {fold_} : {acc}")

 0 : 0.81
 1 : 0.815
 2 : 0.75
 3 : 0.83
 4 : 0.775


# lemmatization and stemming 

In [24]:
# from nltk.stem import WordNetLemmatizer
# from nltk.stem.snowball import SnowballStemmer

In [25]:
# import nltk
# nltk.download('wordnet')

In [26]:
# lemmatizer = WordNetLemmatizer()
# stemmer = SnowballStemmer("english")
# words = ["fishing", "fishes", "fished"]
# for word in words:
#     print(f"word={word}")
#     print(f"stemmed_word={stemmer.stem(word)}")
#     print(f"lemma={lemmatizer.lemmatize(word)}")
#     print("")

# MAtrix decomposition 

In [27]:
corpus = df.review.values
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
tfv.fit(corpus)
corpus_transformed = tfv.transform(corpus)


In [28]:
from sklearn.decomposition import TruncatedSVD

In [29]:
svd = TruncatedSVD(n_components=10)
corpus_svd = svd.fit(corpus_transformed)

In [30]:
sample_index = 0
feature_scores = dict(
 zip(
 tfv.get_feature_names_out(),
 corpus_svd.components_[sample_index]
 )
)

N = 5
print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])

['the', ',', '.', 'a', 'and']


In [31]:
N = 5
for sample_index in range(5):
 feature_scores = dict(
 zip(
 tfv.get_feature_names_out(),
 corpus_svd.components_[sample_index]
 )
 )
 print(
 sorted(
 feature_scores,
 key=feature_scores.get,
 reverse=True
 )[:N]
 )

['the', ',', '.', 'a', 'and']
['>', '<', '/', 'br', '...']
['i', 'movie', '!', 'was', 'it']
['!', ',', '?', ')', '(']
['!', 'the', 'is', 'of', 'and']


In [32]:
import re
import string

def clean_text(s):
    # split by all whitespaces
    s = s.split()

    # join tokens by single space
    # why we do this?
    # this will remove all kinds of weird space
    # "hi. how are you" becomes
    # "hi. how are you"
    s = " ".join([x for x in s if len(x)>3])

    # remove all punctuations using regex and string module
    s = re.sub(f'[{re.escape(string.punctuation)}]', '', s)

    # you can add more cleaning here if you want
    # and then return the cleaned string
    return s


In [33]:
df.loc[:, "review"] = df.review.apply(clean_text)

corpus = df.review.values
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
tfv.fit(corpus)
corpus_transformed = tfv.transform(corpus)

svd = TruncatedSVD(n_components=10)
corpus_svd = svd.fit(corpus_transformed)

In [34]:
N = 5
for sample_index in range(5):
 feature_scores = dict(
 zip(
 tfv.get_feature_names_out(),
 corpus_svd.components_[sample_index]
 )
 )
 print(
 sorted(
 feature_scores,
 key=feature_scores.get,
 reverse=True
 )[:N]
 )

['this', 'that', 'movie', 'br', 'film']
['movie', 'this', 'movies', 'dont', 'think']
['br', 'the', 'movie', 'they', 'show']
['br', 'film', 'this', 'worst', 'acting']
['they', 'show', 'that', 'have', 'dont']


# Word Embeddings 

In [35]:
import numpy as np 

In [36]:
def sentence_to_vec(s, embedding_dict, stop_words=None, tokenizer=None):
    words = str(s).lower()
    words = tokenizer(words)
    words = [w for w in words if w not in stop_words] 
    words = [w for w in words if w.isalpha()] 
    M = [] 
    for w in words:
        res = embedding_dict.get(w, None) 
        if res is not None:
            M.append(res)

    if len(M) == 0:
        return np.zeros(300) 

    M = np.array(M)
    v = M.sum(axis=0)
    return v/np.sqrt((v**2).sum())



In [37]:
import io
from tqdm import tqdm

In [38]:
def load_vectors(fname):
    # taken from: https://fasttext.cc/docs/en/english-vectors.html
    fin = io.open(
                fname,
                'r',
                encoding='utf-8',
                newline='\n',
                errors='ignore'
                )
    n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm(fin):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data

In [39]:
df = pd.read_csv('datasets/IMDB Dataset.csv').sample(1000) 
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0) 
df = df.sample(frac=1).reset_index(drop=True) 
print("loading embeddings")
# embeddings = load_vectors("datasets/wiki-news-300d-1M.vec") 


loading embeddings


999994it [04:32, 3671.68it/s]


In [40]:
print("create sentence vectors") 
vectors = [] 
for review in df['review'].to_list():
    vectors.append(
        sentence_to_vec(
            review, 
            embedding_dict=embeddings, 
            stop_words=[],
            tokenizer=word_tokenize
            )
    )

vectors = np.array(vectors)
y = df['sentiment'].values

create sentence vectors


In [48]:
kf = model_selection.StratifiedKFold(n_splits=5)

for fold_, (t_, v_) in enumerate(kf.split(X=vectors, y=y)):
    xtrain = vectors[t_, :]
    ytrain = y[t_]

    xtest = vectors[v_, :]
    ytest = y[v_]

    model = linear_model.LogisticRegression()
    model.fit(xtrain, ytrain)

    preds = model.predict(xtest) 
    acc = metrics.accuracy_score(ytest, preds) 
    print(f" {fold_} : {acc}")

 0 : 0.735
 1 : 0.75
 2 : 0.645
 3 : 0.735
 4 : 0.74


# Bi-Directional LSTM