In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from nltk.corpus import stopwords

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [6]:
df = pd.read_csv('./IMDB Dataset.csv')

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [9]:
#introduce stopwords
stopwords = set(stopwords.words('english'))

In [10]:
X = df['review']
y = df['sentiment']

In [11]:
#remove stopwords
X = X.apply(lambda review: [w for w in review.split() if w not in stopwords])

In [12]:
#convert to all lower case
X = X.apply(lambda review: [w.lower() for w in review])

In [13]:
#remove html tags
X = X.replace({'<.*?>': ''}, regex=True)

In [14]:
#remove non-alphabetical characters:
X = X.replace({'[^A-Za-z]': ''}, regex=True)

In [15]:
#change polarity labels into numbers:
y = y.replace('positive', 1)
y = y.replace('negative', 0)

In [16]:
X.head()

0    [one, reviewers, mentioned, watching, 1, oz, e...
1    [a, wonderful, little, production., <br, /><br...
2    [i, thought, wonderful, way, spend, time, hot,...
3    [basically, there's, family, little, boy, (jak...
4    [petter, mattei's, "love, time, money", visual...
Name: review, dtype: object

In [17]:
y.head()

0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64

In [18]:
#train-test split of the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

In [19]:
token = Tokenizer(lower=False)
token.fit_on_texts(X_train)

In [20]:
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

max_length = 300

X_train = sequence.pad_sequences(X_train, maxlen=max_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_length)

In [21]:
total_words = len(token.word_index) + 1

In [22]:
#build a lstm model:
EMBED_DIM = 32
LSTM_OUT = 64

#build and add each layer:
lstm = Sequential()
lstm.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
lstm.add(LSTM(LSTM_OUT))
lstm.add(Dense(1, activation='sigmoid'))
lstm.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])


In [23]:
#lstm.compile(optimizer, loss)

NameError: name 'optimizer' is not defined

In [None]:
lstm.fit(X_train, y_train, batch_size = 128, epochs = 20)

Epoch 1/20
Epoch 2/20

In [None]:
y_pred = lstm.predict_classes(X_test, batch_size=128)

In [None]:
true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true +=1
        
print('LSTM Accuracy: {}'.format(true/len(y_pred)*100))

In [None]:
#Logistic Regression
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
count = CountVectorizer()
bag = count.fit_transform(docs)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer


In [None]:
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

In [None]:
print(tfidf.fit_transform(bag).toarray())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None, tokenizer=tokenizer_porter, use_idf=True, norm='l2', smooth_idf=True)

In [None]:
y = df.sentiment.values
X = tfidf.fit_transform(df.review)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, test_size=0.5)

In [None]:
from sklearn.linear_model import LogisticRegressionCV

In [None]:
clf = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           random_state=0,
                           n_jobs=-1,
                           verbose=3,
                           max_iter=300).fit(X_train, y_train)

In [None]:
clf.score(X_test, y_test)