In [0]:
!pip install PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

fileId = drive.CreateFile({'id': '#'}) 
print(fileId['title'])  # dataset.zip
fileId.GetContentFile('temp.zip')  # Save Drive file as a local file

!unzip temp.zip -d ./

Collecting PyDrive
[?25l  Downloading https://files.pythonhosted.org/packages/52/e0/0e64788e5dd58ce2d6934549676243dc69d982f198524be9b99e9c2a4fd5/PyDrive-1.3.1.tar.gz (987kB)
[K     |████████████████████████████████| 993kB 3.5MB/s 
Building wheels for collected packages: PyDrive
  Building wheel for PyDrive (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/fa/d2/9a/d3b6b506c2da98289e5d417215ce34b696db856643bad779f4
Successfully built PyDrive
Installing collected packages: PyDrive
Successfully installed PyDrive-1.3.1
imdb_master.zip
Archive:  temp.zip
  inflating: ./imdb_master.csv       


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
df = pd.read_csv('imdb_master.csv', encoding = "ISO-8859-1")
del df['Unnamed: 0']
del df['file']
del df['type']

In [0]:
df = df.loc[0:49999]
df = df.sample(frac=1).reset_index(drop=True)

In [0]:
def clean_reviews(text):
    lemmatizer = WordNetLemmatizer()
    my_stopwords = stopwords.words('english') 
    text = text.replace("<br >", "")
    text = text.replace("</br >", "")        
    text = re.sub('[^a-zA-Z]',' ', text)
    text = text.lower() 
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in my_stopwords]
    text = " ".join(text)
    return text  

In [0]:
df["review"]=df.review.apply(lambda x: clean_reviews(x))

In [0]:
df.head()

Unnamed: 0,review,label
0,bleed apt title watch feel life bleed clich ...,neg
1,camp blood absolutely atrocious slasher film ...,neg
2,film great watch friend think wa proof film ...,neg
3,know neighborhood folk write rave review movie...,neg
4,film really cool every thing look like come c...,pos


In [0]:
df["label"] = df["label"].map({'pos': 1, 'neg': 0})
df.head()

Unnamed: 0,review,label
0,bleed apt title watch feel life bleed clich ...,0
1,camp blood absolutely atrocious slasher film ...,0
2,film great watch friend think wa proof film ...,0
3,know neighborhood folk write rave review movie...,0
4,film really cool every thing look like come c...,1


In [0]:
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["label"], test_size=0.2)

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Bidirectional
from keras.layers.embeddings import Embedding
import string

In [0]:
vocabulary_size = 10000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])
data = pad_sequences(sequences, maxlen=150)


In [0]:
model = Sequential()
model.add(Embedding(10000, 100, input_length=150))
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(data, df['label'], validation_split=0.2, epochs=10)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 40000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc8893c2cc0>

In [0]:
model_conv = Sequential()
model_conv.add(Embedding(vocabulary_size, 128, input_length=150))
model_conv.add(Dropout(0.2))
model_conv.add(Conv1D(64, 5, activation='relu'))
model_conv.add(MaxPooling1D(pool_size=4))
model_conv.add(Bidirectional(LSTM(200, dropout=0.2, recurrent_dropout=0.2)))
model_conv.add(Dense(1, activation='sigmoid'))
model_conv.compile(loss='binary_crossentropy', optimizer='adam',    metrics=['accuracy'])

model_conv.fit(data, df['label'], validation_split=0.2, epochs=10)

Train on 40000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc732420be0>