In [5]:
!wget -q "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
!tar xzf aclImdb_v1.tar.gz

In [41]:
def load_data(path):
    data, sentiments=[], []
    for folder, sentiment in (('neg',0),('pos',1)):
        folder = os.path.join(path,folder)
        for name in os.listdir(folder):
            with open(os.path.join(folder,name),'r',encoding="utf8") as reader:
                text=reader.read()
            text=tokenize(text)
            text=stop_words_removal(text)
            text=reg_expressions(text)
            data.append(text)
            sentiments.append(sentiment)
    data_np=np.array(data)
    data, sentiments=unison_shuffle_data(data_np,sentiments)
    
    return data, sentiments

In [42]:
Nsamp=1000
maxtokens=50
maxtokenlen=20

In [43]:
import numpy as np
import pandas as pd
import os

def tokenize(row):
    if row in [None,'']:
        tokens=""
    else:
        tokens=str(row).split(" ")[:maxtokens]
    return tokens

In [44]:
import re
def reg_expressions(row):
    tokens=[]
    try:
        for token in row:
            token=token.lower()
            token=re.sub(r'[\W\d]', "", token)
            token=token[:maxtokenlen]
            tokens.append(token)
    except:
        token=""
        tokens.append(token)
    return tokens

In [45]:
import nltk

nltk.download('stopword')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

def stop_words_removal(row):
    token = [token for token in row if token not in stopwords]
    token = filter(None, token)
    return token

[nltk_data] Error loading stopword: Package 'stopword' not found in
[nltk_data]     index


In [46]:
def assemble_bag(data):
    used_tokens=[]
    all_tokens=[]
    
    for item in data:
        for token in item:
            if token in all_tokens:
                if token not in used_tokens:
                    used_tokens.append(token)
            else:
                all_tokens.append(token)
    df=pd.DataFrame(0, index=np.arange(len(data)), columns=used_tokens)
    
    for i, item in enumerate(data):
        for token in item:
            if token in used_tokens:
                df.iloc[i][token] += 1
    return df

In [47]:
def unison_shuffle_data(data,header):
    p=np.random.permutation(len(header))
    data=data[p]
    header=np.asarray(header)[p]
    return data,header

In [48]:
train_path = os.path.join('aclImdb','train')
raw_data, raw_header = load_data(train_path)

  del sys.path[0]


In [49]:
print(raw_data.shape)
print(len(raw_header))

(25000,)
25000


In [50]:
random_indices = np.random.choice(range(len(raw_header)), size=(Nsamp*2,), replace=False)
data_train = raw_data[random_indices]
header = raw_header[random_indices]

In [51]:
unique_elements, counts_elements = np.unique(header, return_counts=True)
print("Sentiments abd their frequencies:")
print(unique_elements)
print(counts_elements)

Sentiments abd their frequencies:
[0 1]
[1022  978]


In [52]:
MixedBagOfReviews=assemble_bag(data_train)
print(MixedBagOfReviews)

      br  series  time  someone  i  one  boring  great  movie  play  ...  \
0      2       2     1        0  0    0       0      0      0     0  ...   
1      0       0     1        2  0    0       0      1      1     0  ...   
2      0       0     0        0  2    2       3      1      1     0  ...   
3      0       0     0        0  0    1       0      0      0     2  ...   
4      0       0     1        0  0    0       0      0      0     0  ...   
...   ..     ...   ...      ... ..  ...     ...    ...    ...   ...  ...   
1995   1       0     0        0  0    0       0      0      1     0  ...   
1996   0       0     0        0  2    0       0      0      0     0  ...   
1997   0       0     0        0  1    0       0      0      0     0  ...   
1998   1       0     0        0  0    0       1      0      1     0  ...   
1999   0       0     0        0  3    0       0      0      0     0  ...   

      genuine  vader  melissa  lighthearted  undertones  atrocity  bridge  \
0         

In [65]:
def convert_data(raw_data,header):
    converted_data, labels = [], []
    for i in range(raw_data.shape[0]):
        # combine list of tokens representing each email into single string
        out = ' '.join(raw_data[i])
        converted_data.append(out)
        labels.append(header[i])
    converted_data = np.array(converted_data, dtype=object)[:, np.newaxis]
    
    return converted_data, np.array(labels)


data = MixedBagOfReviews.values

idx = int(0.7*data.shape[0])

# 70% of data for training
train_x = data[:idx,:]
train_y = header[:idx]
# remaining 30% for testing
test_x = data[idx:,:]
test_y = header[idx:] 

In [66]:
from sklearn.linear_model import LogisticRegression

def fit(train_x, train_y):
    model=LogisticRegression()
    
    try:
        model.fit(train_x,train_y)
    except:
        pass
    return model

In [67]:
model=fit(train_x, train_y)

In [68]:
predicted_labels = model.predict(test_x)

from sklearn.metrics import accuracy_score

acc_score = accuracy_score(test_y, predicted_labels)
print("The Logistic Regression accuracy score is::")
print(acc_score)

The Logistic Regression accuracy score is::
0.69


In [None]:
import time
from sklearn.svm import SVC

clf=SVC(C=1, gamma="auto", kernel='linear', probability=False)
start_time = time.time()
clf.fit(train_x,train_y)
end_time=time.time()
print("Training the SVC Classifier took %3d seconds"testing accuracy score is::")
predicted_labels=clf.predict(test_x)
acc_score=accuracy_score(test_y, predicted_labels)
print("The SVC Classifier testing accuracy score is::")
print(acc_score)