### Need to perform data analysis on the dataset, check review lengths etc.

In [56]:
import os

def load_movie_reviews(folder):
    reviews = []
    for filename in os.listdir(folder):
        filepath = f'{folder}/{filename}'
        with open(filepath, 'r', encoding='utf-8') as file:
            review = file.read()
            reviews.append(review)
    return reviews

def remove_breaks(reviews):
    return [review.replace('<br /><br />', '') for review in reviews]

positive_reviews = load_movie_reviews('data/train/pos')
negative_reviews = load_movie_reviews('data/train/neg')

positive_reviews = remove_breaks(positive_reviews)
negative_reviews = remove_breaks(negative_reviews)

# Only use the first 3000 reviews from both
positive_reviews = positive_reviews
negative_reviews = negative_reviews

print(len(positive_reviews))
print(len(negative_reviews))

print(positive_reviews[1])

12500
12500
Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they'll be next to end up on the streets.But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it's like to be homeless? That is Goddard Bolt's lesson.Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days without the luxuries; 

## Use spacy to Tokenize the reviews

In [57]:
import spacy
from tqdm.notebook import tqdm

def tokenize_reviews(reviews):
    nlp = spacy.load('en_core_web_lg') # , disable=['parser', 'tagger', 'ner']
    tokenized_reviews = []
    word_embeddings = []
    
    # Use nlp.pipe for efficient batch processing
    for doc in tqdm(nlp.pipe(reviews), total=len(reviews)):
        tokens = [token.text for token in doc if not token.is_stop]
        tokenized_reviews.append(tokens)
        word_embedding = [token.vector for token in doc if not token.is_stop]
        word_embeddings.append(word_embedding)
        
        
    return tokenized_reviews, word_embeddings

positive_tokenized, positive_vectors = tokenize_reviews(positive_reviews)
negative_tokenized, negative_vectors = tokenize_reviews(negative_reviews)

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [58]:
print(positive_tokenized[1])

['Homelessness', '(', 'Houselessness', 'George', 'Carlin', 'stated', ')', 'issue', 'years', 'plan', 'help', 'street', 'considered', 'human', 'going', 'school', ',', 'work', ',', 'vote', 'matter', '.', 'people', 'think', 'homeless', 'lost', 'cause', 'worrying', 'things', 'racism', ',', 'war', 'Iraq', ',', 'pressuring', 'kids', 'succeed', ',', 'technology', ',', 'elections', ',', 'inflation', ',', 'worrying', 'end', 'streets', '.', 'given', 'bet', 'live', 'streets', 'month', 'luxuries', 'home', ',', 'entertainment', 'sets', ',', 'bathroom', ',', 'pictures', 'wall', ',', 'computer', ',', 'treasure', 'like', 'homeless', '?', 'Goddard', 'Bolt', 'lesson', '.', 'Mel', 'Brooks', '(', 'directs', ')', 'stars', 'Bolt', 'plays', 'rich', 'man', 'world', 'deciding', 'bet', 'sissy', 'rival', '(', 'Jeffery', 'Tambor', ')', 'live', 'streets', 'thirty', 'days', 'luxuries', ';', 'Bolt', 'succeeds', ',', 'wants', 'future', 'project', 'making', 'buildings', '.', 'bet', 'Bolt', 'thrown', 'street', 'bracelet

### Find some statistics about the reviews

In [59]:
from collections import Counter
import numpy as np

def get_review_stats(tokenized_reviews):
    review_lengths = [len(review) for review in tokenized_reviews]
    print('Min review length:', min(review_lengths))
    print('Max review length:', max(review_lengths))
    print('Mean review length:', sum(review_lengths) / len(review_lengths))
    print('Median review length:', sorted(review_lengths)[len(review_lengths) // 2])
    print('Standard deviation:', np.std(review_lengths))
    print()

# Find the most common words
def get_most_common_words(tokenized_reviews):
    all_words = [word for review in tokenized_reviews for word in review]
    word_counts = Counter(all_words)
    print('Number of unique words:', len(word_counts))
    print('Most common words:', word_counts.most_common(30))
    print()

get_review_stats(positive_tokenized)
get_review_stats(negative_tokenized)


get_most_common_words(positive_tokenized)
get_most_common_words(negative_tokenized)



Min review length: 7
Max review length: 1556
Mean review length: 141.39816
Median review length: 103
Standard deviation: 111.08520310380857

Min review length: 5
Max review length: 927
Mean review length: 136.0208
Median review length: 102
Standard deviation: 101.02098260935695

Number of unique words: 76007
Most common words: [(',', 143760), ('.', 135608), ('"', 30418), ('-', 25917), ('film', 20571), ('movie', 18714), ('(', 17382), (')', 16970), ('!', 10643), ("'", 8698), ('like', 8585), ('good', 7317), ('story', 6497), ('time', 6322), ('great', 5891), (':', 4420), ('?', 4351), ('people', 4224), ('love', 3961), ('...', 3942), ('way', 3933), ('life', 3932), ('best', 3897), ('films', 3727), ('think', 3582), (';', 3516), ('characters', 3495), ('character', 3476), ('movies', 3470), ('seen', 3369)]

Number of unique words: 75658
Most common words: [('.', 135849), (',', 131620), ('"', 33000), ('-', 27692), ('movie', 24409), ('film', 18925), ('(', 15737), (')', 15638), ('!', 11367), ('like',

### Generate word embeddings

In [60]:
print(len(positive_vectors[0]))
print(len(positive_tokenized[0]))

89
89
