In [2]:
import zipfile
import pandas as pd
import os

# File path
zip_file_path = 'word2vec-nlp-tutorial.zip'
extract_folder_path = 'word2vec-nlp-tutorial'

# Extracting the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder_path)

# Listing extracted files
extracted_files = os.listdir(extract_folder_path)
extracted_files

['labeledTrainData.tsv.zip',
 'sampleSubmission.csv',
 'testData.tsv.zip',
 'unlabeledTrainData.tsv.zip']

In [3]:
# It seems like the relevant file is 'labeledTrainData.tsv'
# Let's load this data and take a look at it

data_file_path = os.path.join(extract_folder_path, 'labeledTrainData.tsv.zip')

# Load the data into a DataFrame
movie_reviews = pd.read_csv(data_file_path, delimiter='\t')

# Display the first few rows of the DataFrame
movie_reviews.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


The movie review data has been successfully loaded into a DataFrame. It consists of three columns: 'id', 'sentiment', and 'review'. The 'sentiment' column indicates whether a review is positive (1) or negative (0).

In [4]:
# Counting the number of positive and negative reviews
review_counts = movie_reviews['sentiment'].value_counts()
positive_reviews = review_counts.get(1, 0)
negative_reviews = review_counts.get(0, 0)

positive_reviews, negative_reviews

(12500, 12500)

In [5]:
from textblob import TextBlob

# Function to classify sentiment using TextBlob
def classify_review_sentiment(review):
    analysis = TextBlob(review)
    # Assuming polarity >= 0 is positive, and < 0 is negative
    return 1 if analysis.sentiment.polarity >= 0 else 0

# Applying the function to each review
movie_reviews['predicted_sentiment'] = movie_reviews['review'].apply(classify_review_sentiment)

# Checking the first few rows to see the predicted sentiment
movie_reviews.head()

Unnamed: 0,id,sentiment,review,predicted_sentiment
0,5814_8,1,With all this stuff going down at the moment w...,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,0
3,3630_4,0,It must be assumed that those who praised this...,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,0


In [9]:
from sklearn.metrics import accuracy_score

# Actual sentiments from the dataset
actual_sentiments = movie_reviews['sentiment']

# Predicted sentiments using TextBlob
predicted_sentiments = movie_reviews['predicted_sentiment']

# Calculating the accuracy
accuracy = accuracy_score(actual_sentiments, predicted_sentiments)
print("Accuracy of TextBlob Sentiment Classifier:", accuracy)

Accuracy of TextBlob Sentiment Classifier: 0.68524


The accuracy of the sentiment analysis model using TextBlob is approximately 68.52%. This is significantly better than random guessing, which would have an expected accuracy of about 50% for a balanced dataset like this one (with an equal number of positive and negative reviews).

Next, we can use another prebuilt text sentiment analyzer, such as VADER (Valence Aware Dictionary and sEntiment Reasoner), to compare its performance with TextBlob. Let's proceed with applying VADER to the movie reviews.

In [6]:
pip install vaderSentiment

Note: you may need to restart the kernel to use updated packages.


In [10]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score  # Importing accuracy_score

# Initializing VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to classify sentiment using VADER
def classify_review_sentiment_vader(review):
    vs = analyzer.polarity_scores(review)
    # Assuming compound score >= 0 is positive, and < 0 is negative
    return 1 if vs['compound'] >= 0 else 0

# Applying the function to each review
movie_reviews['predicted_sentiment_vader'] = movie_reviews['review'].apply(classify_review_sentiment_vader)

# Calculating the accuracy of the VADER sentiment analysis
accuracy_vader = accuracy_score(movie_reviews['sentiment'], movie_reviews['predicted_sentiment_vader'])
print("Accuracy of VADER Sentiment Classifier:", accuracy_vader)

Accuracy of VADER Sentiment Classifier: 0.69404


## Part 2: Prepping Text for a Custom Model

In [14]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load the data into a DataFrame (replace the file path with the correct one on your system)
data_file_path = 'labeledTrainData.tsv'
movie_reviews = pd.read_csv(data_file_path, delimiter='\t')

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Preprocessing steps
# 1. Convert text to lowercase
movie_reviews['review_cleaned'] = movie_reviews['review'].str.lower()

# 2. Remove punctuation and special characters
movie_reviews['review_cleaned'] = movie_reviews['review_cleaned'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))

# 3. Remove stop words
stop_words = set(stopwords.words('english'))
movie_reviews['review_cleaned'] = movie_reviews['review_cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# 4. Apply NLTK’s PorterStemmer
stemmer = PorterStemmer()
movie_reviews['review_cleaned'] = movie_reviews['review_cleaned'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# 5. Create a bag-of-words matrix
vectorizer_bow = CountVectorizer()
bow_matrix = vectorizer_bow.fit_transform(movie_reviews['review_cleaned'])

# 6. Create a tf-idf matrix
vectorizer_tfidf = TfidfVectorizer()
tfidf_matrix = vectorizer_tfidf.fit_transform(movie_reviews['review_cleaned'])

# Display the dimensions of the matrices
print('Bag-of-Words Matrix Shape:', bow_matrix.shape)
print('TF-IDF Matrix Shape:', tfidf_matrix.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brand\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brand\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Bag-of-Words Matrix Shape: (25000, 89468)
TF-IDF Matrix Shape: (25000, 89468)
