# Introduction to Natural Language Processing: Sentiment Analysis

### Import Libraries

In [None]:
import nltk
import sklearn
import wordcloud

In [None]:
import warnings
# Install a blanket 'ignore' filter so that warnings raised by the cells below
# are not shown in the notebook output.
warnings.filterwarnings('ignore')

### Download Data

In [None]:
# Fetch the NLTK resources used by the cells below:
#   wordnet   - lemmatizer data and synset lookup
#   punkt     - tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab - tokenizer models used by word_tokenize on recent NLTK releases,
#               which no longer load the pickled 'punkt' package
#   stopwords - the English stop-word list
# nltk.download reports a failure and returns False for a package the
# installed release does not know, so requesting both punkt variants is safe.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Shell escape: download the aclImdb_v1 sentiment archive into the current working directory.
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [None]:
# Shell escape: unpack the archive; the loading cells below read from the resulting ./aclImdb directory.
!tar -xzf aclImdb_v1.tar.gz

### Load Data

In [None]:
import pathlib

In [None]:
# Load Training Data

def read_reviews(directory, limit):
    """Return the contents of at most ``limit`` ``*.txt`` files in ``directory``.

    Args:
        directory: ``pathlib.Path`` of the folder holding one review per file.
        limit: maximum number of reviews to read.

    Returns:
        A list of review texts, one string per file.

    Files are visited in sorted name order so the same subset is selected on
    every run, reading stops as soon as ``limit`` is reached, and the files
    are decoded as UTF-8 instead of the platform's default encoding.
    """
    reviews = []
    for file_path in sorted(directory.glob('*.txt')):
        if len(reviews) >= limit:
            break
        reviews.append(file_path.read_text(encoding='utf-8'))
    return reviews


train_pos = pathlib.Path.cwd() / "aclImdb" / "train" / "pos"
train_neg = pathlib.Path.cwd() / "aclImdb" / "train" / "neg"

# Label convention: 0 = negative review, 1 = positive review.
negative_texts = read_reviews(train_neg, 5000)
negative_labels = [0] * len(negative_texts)

positive_texts = read_reviews(train_pos, 5000)
positive_labels = [1] * len(positive_texts)

# Positive reviews first, then negative ones; labels follow the same order.
texts = positive_texts + negative_texts
labels = positive_labels + negative_labels

### Tokenization

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Split every review into a list of word tokens.
tokenized_texts = list(map(word_tokenize, texts))

### Stemming

In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')

In [None]:
# Demo: stem a single word to inspect what the stemmer produces.
stemmer.stem('movies')

### Lemmatization

In [None]:
from nltk.corpus import wordnet

In [None]:
from nltk.stem import WordNetLemmatizer
# WordNet-based lemmatizer; used further down to normalise the tokenized reviews.
wnl = WordNetLemmatizer()

In [None]:
# Demo: lemmatize a single word to inspect what the lemmatizer produces.
wnl.lemmatize('movies')

In [None]:
# For each review, keep only the tokens that WordNet has at least one synset
# for, and replace each kept token with its lemma. Despite the variable name,
# the tokens are lemmatized here, not stemmed.
stemmed_texts = [
    [wnl.lemmatize(word) for word in document if wordnet.synsets(word)]
    for document in tokenized_texts
]

### Stop Word Removal

In [None]:
# Flatten the per-review token lists into one corpus-wide list of tokens.
all_tokens = [
    word
    for document in stemmed_texts
    for word in document
]

In [None]:
import matplotlib.pyplot as plt

# Build the frequency distribution in this cell: `freq` is otherwise first
# assigned in a later cell, so running the notebook top to bottom raised a
# NameError on the plot call.
freq = nltk.FreqDist(all_tokens)

# Plot the counts of the 50 most frequent tokens (non-cumulative).
plt.figure(figsize=(20, 5))
freq.plot(50, cumulative=False)

In [None]:
from nltk.corpus import stopwords
# Show NLTK's built-in English stop-word list for reference.
print(stopwords.words('english'))

In [None]:
# Token frequencies over the whole (lemmatized) corpus.
freq = nltk.FreqDist(all_tokens)

# The 50 most frequent corpus tokens are treated as corpus-specific stop words.
stop_words = tuple(token for token, _ in freq.most_common(50))

# Combine them with NLTK's English stop words once, as a set, instead of
# re-reading the stop-word list for every candidate token.
excluded_tokens = set(stop_words) | set(stopwords.words('english'))

# Keep a token only if it is in NEITHER stop list. The previous `or` condition
# removed a token only when it appeared in both lists at the same time.
# Sorting makes the feature order identical from run to run.
vocab = sorted(token for token in set(all_tokens) if token not in excluded_tokens)

### Count Vectors

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Count vectorizer restricted to the fixed vocabulary built above.
# NOTE(review): CountVectorizer lowercases its input by default, so vocabulary
# entries containing uppercase characters presumably never match - confirm.
vectorizer = CountVectorizer(vocabulary=vocab)

In [None]:
# Vectorize the raw training reviews (`texts`, not the lemmatized token lists).
count_vectors = vectorizer.fit_transform(texts)

### Train Model

In [None]:
from sklearn.utils import shuffle
# `texts` was built as all positives followed by all negatives; shuffle the
# feature rows and labels together, with a fixed seed for repeatability.
X, y = shuffle(count_vectors, labels, random_state=0)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the shuffled count vectors.
# NOTE(review): warnings are silenced near the top of this notebook, so a
# solver convergence warning, if any, would not be displayed - confirm the fit
# converges with the default settings.
model = LogisticRegression(random_state=0).fit(X, y)

### Load Test Data

In [None]:
def read_reviews(directory, limit):
    """Return the contents of at most ``limit`` ``*.txt`` files in ``directory``.

    Defined in this cell so the cell can be run on its own.

    Args:
        directory: ``pathlib.Path`` of the folder holding one review per file.
        limit: maximum number of reviews to read.

    Returns:
        A list of review texts, one string per file.

    Files are visited in sorted name order so the same subset is selected on
    every run, reading stops as soon as ``limit`` is reached, and the files
    are decoded as UTF-8 instead of the platform's default encoding.
    """
    reviews = []
    for file_path in sorted(directory.glob('*.txt')):
        if len(reviews) >= limit:
            break
        reviews.append(file_path.read_text(encoding='utf-8'))
    return reviews


test_pos = pathlib.Path.cwd() / "aclImdb" / "test" / "pos"
test_neg = pathlib.Path.cwd() / "aclImdb" / "test" / "neg"

# Label convention: 0 = negative review, 1 = positive review.
test_negative_texts = read_reviews(test_neg, 1000)
test_negative_labels = [0] * len(test_negative_texts)

test_positive_texts = read_reviews(test_pos, 1000)
test_positive_labels = [1] * len(test_positive_texts)

# Positive reviews first, then negative ones; labels follow the same order.
test_texts = test_positive_texts + test_negative_texts
test_labels = test_positive_labels + test_negative_labels

### Evaluate Accuracy

In [None]:
# Vectorize the raw test reviews with the same vectorizer used for training.
X_test = vectorizer.transform(test_texts)

In [None]:
# Predict a label for every test review.
y_pred = model.predict(X_test)

In [None]:
# Import the metric explicitly: `import sklearn` alone does not guarantee that
# the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import accuracy_score

# accuracy_score returns a fraction between 0 and 1; scale it to a percentage
# so the number printed matches the '%' sign (previously e.g. "0.87%").
accuracy = accuracy_score(test_labels, y_pred)
print("Test Accuracy: %.2f%%" % (100 * accuracy))

### Feature Importance

In [None]:
# First row of the fitted model's coefficient matrix; the cells below index it
# by vocabulary position.
# NOTE(review): presumably one weight per vocabulary entry, with positive
# values pushing towards label 1 (positive) - confirm.
importance = model.coef_[0]

In [None]:
# Map every vocabulary word to its model coefficient.
positive_words = {
    term: importance[position]
    for position, term in enumerate(vocab)
}

In [None]:
# Render a word cloud from the word -> weight mapping in `positive_words`.
cloud = wordcloud.WordCloud().generate_from_frequencies(positive_words)
plt.figure(figsize=(16,12))
plt.imshow(cloud, interpolation='bilinear')
# Hide the axes; the image has no meaningful coordinates.
plt.axis('off')
plt.show()

In [None]:
# Map every vocabulary word to its NEGATED model coefficient. The name
# `positive_words` is reused here even though the values now rank words by how
# negative their weight is.
positive_words = {
    term: -importance[position]
    for position, term in enumerate(vocab)
}

In [None]:
# Render a word cloud from the mapping currently held in `positive_words`
# (the negated coefficients when the preceding cell has just been run).
cloud = wordcloud.WordCloud().generate_from_frequencies(positive_words)
plt.figure(figsize=(16,12))
plt.imshow(cloud, interpolation='bilinear')
# Hide the axes; the image has no meaningful coordinates.
plt.axis('off')
plt.show()