# Introduction to Natural Language Processing: Sentiment Analysis

### Import Libraries

In [1]:
import nltk
import sklearn
import wordcloud

In [2]:
import warnings

# Install a blanket "ignore" filter: no warning of any category is shown
# for the rest of the session.
warnings.simplefilter('ignore')

### Download Data

In [None]:
# NLTK data packages used by this notebook:
#   wordnet   - WordNetLemmatizer and wordnet.synsets
#   punkt     - sentence/word tokenizer models used by word_tokenize
#   punkt_tab - tokenizer tables that recent NLTK releases look up instead
#               of `punkt`; fetching both keeps word_tokenize working
#               regardless of the installed NLTK version
#   stopwords - the English stop-word list
for resource in ('wordnet', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Shell escape: fetch the aclImdb_v1 archive into the current working directory.
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [None]:
# Shell escape: unpack the gzip-compressed archive (-x extract, -z gzip, -f file)
# into the current working directory.
!tar -xzf aclImdb_v1.tar.gz

### Load Data

In [3]:
# Load Training Data

import pathlib
from itertools import islice

MAX_TRAIN_PER_CLASS = 5000  # reviews kept from each sentiment folder


def read_reviews(directory, limit):
    """Return the text of at most `limit` ``*.txt`` files found in `directory`.

    Files are taken in the order `glob` yields them, which depends on the
    filesystem, so the selected subset may differ between machines.

    Args:
        directory: `pathlib.Path` of a folder holding one review per file.
        limit: Maximum number of files to read.

    Returns:
        A list of strings, one per file read (empty if nothing matches).
    """
    # islice stops pulling paths from glob once `limit` files were taken,
    # instead of walking every remaining directory entry. The explicit
    # encoding keeps decoding independent of the platform's locale default
    # (the reviews are assumed to be UTF-8).
    return [
        path.read_text(encoding="utf-8")
        for path in islice(directory.glob('*.txt'), limit)
    ]


train_pos = pathlib.Path.cwd() / "aclImdb" / "train" / "pos"
train_neg = pathlib.Path.cwd() / "aclImdb" / "train" / "neg"

# Label convention: 0 = negative review, 1 = positive review.
negative_texts = read_reviews(train_neg, MAX_TRAIN_PER_CLASS)
negative_labels = [0] * len(negative_texts)

positive_texts = read_reviews(train_pos, MAX_TRAIN_PER_CLASS)
positive_labels = [1] * len(positive_texts)

# Positives first, then negatives; `labels` is aligned index-by-index.
texts = positive_texts + negative_texts
labels = positive_labels + negative_labels

### Tokenization

In [4]:
from nltk.tokenize import word_tokenize

#### Split Words

In [5]:
# Split every review into its word and punctuation tokens.
tokenized_texts = list(map(word_tokenize, texts))

In [6]:
# Peek at the tokens of the first review.
tokenized_texts[0]

['For',
 'a',
 'movie',
 'that',
 'gets',
 'no',
 'respect',
 'there',
 'sure',
 'are',
 'a',
 'lot',
 'of',
 'memorable',
 'quotes',
 'listed',
 'for',
 'this',
 'gem',
 '.',
 'Imagine',
 'a',
 'movie',
 'where',
 'Joe',
 'Piscopo',
 'is',
 'actually',
 'funny',
 '!',
 'Maureen',
 'Stapleton',
 'is',
 'a',
 'scene',
 'stealer',
 '.',
 'The',
 'Moroni',
 'character',
 'is',
 'an',
 'absolute',
 'scream',
 '.',
 'Watch',
 'for',
 'Alan',
 '``',
 'The',
 'Skipper',
 "''",
 'Hale',
 'jr.',
 'as',
 'a',
 'police',
 'Sgt',
 '.']

#### Create Vocabulary

In [7]:
# Collect every distinct token that occurs in any review.
vocab = set()
for review_tokens in tokenized_texts:
    vocab.update(review_tokens)

In [8]:
# Number of distinct raw tokens.
len(vocab)

81565

#### Lowercase and Remove Punctuation

In [9]:
# Keep purely alphanumeric tokens only (drops punctuation) and fold case.
vocab = {token.lower() for token in vocab if token.isalnum()}

In [10]:
# Vocabulary size after lowercasing and dropping non-alphanumeric tokens.
len(vocab)

50455

In [11]:
# Apply the same normalisation to every review: drop tokens that are not
# purely alphanumeric, then lowercase what is left.
tokenized_texts = [
    list(map(str.lower, filter(str.isalnum, tokens)))
    for tokens in tokenized_texts
]

In [12]:
# Number of reviews (the filtering above removes tokens, not reviews).
len(tokenized_texts)

10000

In [13]:
# First review again, now lowercased and without non-alphanumeric tokens.
tokenized_texts[0]

['for',
 'a',
 'movie',
 'that',
 'gets',
 'no',
 'respect',
 'there',
 'sure',
 'are',
 'a',
 'lot',
 'of',
 'memorable',
 'quotes',
 'listed',
 'for',
 'this',
 'gem',
 'imagine',
 'a',
 'movie',
 'where',
 'joe',
 'piscopo',
 'is',
 'actually',
 'funny',
 'maureen',
 'stapleton',
 'is',
 'a',
 'scene',
 'stealer',
 'the',
 'moroni',
 'character',
 'is',
 'an',
 'absolute',
 'scream',
 'watch',
 'for',
 'alan',
 'the',
 'skipper',
 'hale',
 'as',
 'a',
 'police',
 'sgt']

### Stemming

In [14]:
from nltk.stem.snowball import SnowballStemmer

In [15]:
# Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')

In [16]:
# Example: stem a single word.
stemmer.stem('movies')

'movi'

### Lemmatization

In [17]:
from nltk.corpus import wordnet

In [18]:
from nltk.stem import WordNetLemmatizer
# WordNet-based lemmatizer instance reused by the cells below.
wnl = WordNetLemmatizer()

In [19]:
# Example: lemmatize a single word.
wnl.lemmatize('movies')

'movie'

In [None]:
# Keep only tokens WordNet has at least one synset for, then replace each
# with its lemma.
vocab = set(map(wnl.lemmatize, filter(wordnet.synsets, vocab)))

In [None]:
# Vocabulary size after lemmatization.
len(vocab)

In [None]:
# Lemmatize every review, skipping tokens WordNet has no synset for.
lemmatized_texts = []
for tokens in tokenized_texts:
    lemmatized_texts.append(
        [wnl.lemmatize(token) for token in tokens if wordnet.synsets(token)]
    )

In [None]:
# Lemmatized tokens of the first review.
lemmatized_texts[0]

### Stop Word Removal

#### Find most frequent tokens

In [None]:
# Flatten the per-review token lists into one corpus-wide token list.
all_tokens = []
for tokens in lemmatized_texts:
    all_tokens.extend(tokens)

In [None]:
# Count how often each token occurs across all lemmatized reviews.
freq = nltk.FreqDist(all_tokens)

In [None]:
import matplotlib.pyplot as plt

# Wide, short canvas for the frequency plot.
plt.figure(figsize=(20, 5))
# Raw (non-cumulative) counts of the 50 most frequent tokens.
freq.plot(50, cumulative=False)

In [None]:
# Treat the 50 most frequent corpus tokens as corpus-specific stop words.
# most_common(50) asks the distribution for just the top entries rather
# than ranking every token and slicing. Unpacking the (token, count) pairs
# directly also yields an empty tuple for an empty corpus, where indexing
# the result of zip(*pairs) raised IndexError.
stop_words = tuple(token for token, _count in freq.most_common(50))
print(stop_words)

#### Get stopwords from NLTK list

In [None]:
from nltk.corpus import stopwords
# Show NLTK's English stop-word list.
print(stopwords.words('english'))

In [None]:
# Combine the corpus-derived stop words with NLTK's English list.
stop_words = {*stop_words, *stopwords.words('english')}

#### Remove stopwords from vocab

In [None]:
# Drop every stop word from the vocabulary.
vocab = set(vocab).difference(stop_words)

In [None]:
# Vocabulary size after stop-word removal.
len(vocab)

### Count Vectors

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Freeze the vocabulary into a list so each word gets a fixed column index.
# Sorting (instead of list(set), whose order can change between interpreter
# runs because of string-hash randomisation) makes the word -> column
# mapping reproducible.
vocab = sorted(vocab)

In [None]:
def lemma_analyzer(text):
    """Turn one raw review into the normalised lemmas `vocab` is made of.

    Mirrors the preprocessing used to build the vocabulary: NLTK word
    tokenization, dropping tokens that are not purely alphanumeric,
    lowercasing, then WordNet lemmatization. The WordNet-synset filter is
    not repeated here; the fixed vocabulary already discards anything that
    is not one of its entries.

    Args:
        text: Raw review string.

    Returns:
        List of lemma strings, in order of appearance.
    """
    return [
        wnl.lemmatize(token.lower())
        for token in word_tokenize(text)
        if token.isalnum()
    ]


# `vocab` holds lemmas, but CountVectorizer's default analyzer only
# regex-tokenizes and lowercases raw text, so inflected forms ("movies")
# would never match their vocabulary entry ("movie") and would be dropped.
# Supplying the analyzer makes both fit_transform and transform apply the
# same preprocessing the vocabulary was built with.
vectorizer = CountVectorizer(vocabulary=vocab, analyzer=lemma_analyzer)

In [None]:
# Build the document-term count matrix for the training reviews; the
# columns follow the fixed vocabulary handed to the vectorizer.
count_vectors = vectorizer.fit_transform(texts)

In [None]:
# (number of reviews, number of vocabulary columns)
count_vectors.shape

### Train Model

In [None]:
from sklearn.utils import shuffle

# `texts` was assembled as all positives followed by all negatives, so mix
# the rows (features and labels together) with a fixed seed.
X, y = shuffle(
    count_vectors,
    labels,
    random_state=0,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier on the count vectors (fixed seed).
model = LogisticRegression(random_state=0)
model.fit(X, y)

### Load Test Data

In [None]:
from itertools import islice

MAX_TEST_PER_CLASS = 1000  # reviews kept from each sentiment folder


def read_reviews(directory, limit):
    """Return the text of at most `limit` ``*.txt`` files found in `directory`.

    Files are taken in the order `glob` yields them, which depends on the
    filesystem, so the selected subset may differ between machines.

    Args:
        directory: `pathlib.Path` of a folder holding one review per file.
        limit: Maximum number of files to read.

    Returns:
        A list of strings, one per file read (empty if nothing matches).
    """
    # islice stops pulling paths from glob once `limit` files were taken,
    # instead of walking every remaining directory entry. The explicit
    # encoding keeps decoding independent of the platform's locale default
    # (the reviews are assumed to be UTF-8).
    return [
        path.read_text(encoding="utf-8")
        for path in islice(directory.glob('*.txt'), limit)
    ]


test_pos = pathlib.Path.cwd() / "aclImdb" / "test" / "pos"
test_neg = pathlib.Path.cwd() / "aclImdb" / "test" / "neg"

# Label convention: 0 = negative review, 1 = positive review.
test_negative_texts = read_reviews(test_neg, MAX_TEST_PER_CLASS)
test_negative_labels = [0] * len(test_negative_texts)

test_positive_texts = read_reviews(test_pos, MAX_TEST_PER_CLASS)
test_positive_labels = [1] * len(test_positive_texts)

# Positives first, then negatives; `test_labels` is aligned index-by-index.
test_texts = test_positive_texts + test_negative_texts
test_labels = test_positive_labels + test_negative_labels

### Evaluate Accuracy

In [None]:
# Vectorize the test reviews with the same vectorizer (hence the same
# columns) that produced the training matrix.
X_test = vectorizer.transform(test_texts)

In [None]:
# Predicted label (0 = negative, 1 = positive) for every test review.
y_pred = model.predict(X_test)

In [None]:
# Import the metric explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import accuracy_score

# accuracy_score returns the fraction of correct predictions (0..1), so it
# has to be scaled by 100 before being printed with a percent sign.
test_accuracy = accuracy_score(test_labels, y_pred)
print("Test Accuracy: %.2f%%" % (100 * test_accuracy))

### Feature Importance

In [None]:
# First (and, for this two-label problem, only) coefficient row of the
# fitted model: one weight per vocabulary column.
importance = model.coef_[0]

In [None]:
# Map each vocabulary word to its coefficient, keeping only words whose
# weight is positive (i.e. that push the prediction towards label 1).
# Words with non-positive weights are left out because the word cloud
# interprets the values as frequencies, and a negative "frequency" has no
# meaningful font size.
# `vocab` is the list handed to the vectorizer, so position i in `vocab`
# corresponds to column i of the coefficients.
positive_words = {
    word: weight
    for word, weight in zip(vocab, importance)
    if weight > 0
}

In [None]:
# Word cloud in which a word's size reflects its weight in `positive_words`.
cloud = wordcloud.WordCloud()
cloud.generate_from_frequencies(positive_words)

plt.figure(figsize=(16, 12))
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')  # an image needs no axis ticks or frame
plt.show()

In [None]:
# Map each vocabulary word to the magnitude of its coefficient, keeping
# only words whose weight is negative (i.e. that push the prediction
# towards label 0). The sign is flipped so the values are positive sizes
# for the word cloud; words with non-negative weights are left out because
# they would otherwise become negative "frequencies".
# `vocab` is the list handed to the vectorizer, so position i in `vocab`
# corresponds to column i of the coefficients.
negative_words = {
    word: -weight
    for word, weight in zip(vocab, importance)
    if weight < 0
}

In [None]:
# Word cloud in which a word's size reflects its weight in `negative_words`.
cloud = wordcloud.WordCloud()
cloud.generate_from_frequencies(negative_words)

plt.figure(figsize=(16, 12))
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')  # an image needs no axis ticks or frame
plt.show()