In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  
stop_words = stopwords.words('english')

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import pandas as pd

# Membaca dataset dari imdb.txt
df = pd.read_csv('imdb.txt', delimiter='\t', header=0, quoting=3)

# Memberi nama kolom baru
df.columns = ['Review', 'Rating']

# Mengkategorikan dataset berdasarkan nilai Rating (0 atau 1)
df['Category'] = df['Rating'].apply(lambda x: 'Negative' if x == 0 else 'Positive')

# Menambahkan tabel sesuai kategori
negative_reviews = df[df['Category'] == 'Negative']
positive_reviews = df[df['Category'] == 'Positive']

# Menggabungkan kedua tabel menjadi satu
combined_df = pd.concat([negative_reviews, positive_reviews])

# Menampilkan beberapa baris pertama dari tabel gabungan
print(combined_df.head())


                                              Review  Rating  Category
0  Not sure who was more lost - the flat characte...       0  Negative
1  Attempting artiness with black & white and cle...       0  Negative
2       Very little music or anything to speak of.         0  Negative
4  The rest of the movie lacks art, charm, meanin...       0  Negative
5                                Wasted two hours.         0  Negative


In [4]:
print(df["Review"][0])

Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  


In [5]:
# Import BeautifulSoup into your workspace
from bs4 import BeautifulSoup             

# Initialize the BeautifulSoup object on a single movie review     
example1 = BeautifulSoup(df["Review"][0])  

# Print the raw review and then the output of get_text(), for 
# comparison
print(df["Review"][0])
print(example1.get_text())

Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  
Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  


In [6]:
import re
# Use regular expressions to do a find-and-replace
letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The pattern to replace it with
                      example1.get_text() )  # The text to search
print(letters_only)

Not sure who was more lost   the flat characters or the audience  nearly half of whom walked out   


In [7]:
sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer() 

def clean_text(text):
    
    text = text.lower()
    
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs 
    #text = re.sub(r"http", "",text)
    
    html=re.compile(r'<.*?>') 
    
    text = html.sub(r'',text) #Removing html tags
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations
        
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    
    text = [lemmatizer.lemmatize(word) for word in text]
    
    text = " ".join(text) #removing stopwords
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis
    
    return text

In [8]:
df['Review'] = df['Review'].apply(lambda x: clean_text(x))
 
df.head()

Unnamed: 0,Review,Rating,Category
0,"sure lost flat character audience, nearly half...",0,Negative
1,attempting artiness black white clever camera ...,0,Negative
2,little music anything speak,0,Negative
3,best scene movie gerardo trying find song keep...,1,Positive
4,"rest movie lack art, charm, meaning emptiness,...",0,Negative


In [9]:
sample_corpora = df['Review'].iloc[:2].values
sample_corpora

array(['sure lost flat character audience, nearly half walked',
       'attempting artiness black white clever camera angles, movie disappointed became even ridiculous acting poor plot line almost non existent'],
      dtype=object)

Using Word2Vec

In [10]:
def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [11]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [12]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in df["Review"]:
    sentences += review_to_sentences(review, tokenizer)


Parsing sentences from training set


In [13]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            vector_size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2024-07-17 09:30:52,753 : INFO : collecting all words and their counts
2024-07-17 09:30:52,754 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-07-17 09:30:52,756 : INFO : collected 2793 word types from a corpus of 7447 raw words and 997 sentences
2024-07-17 09:30:52,756 : INFO : Creating a fresh vocabulary
2024-07-17 09:30:52,757 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 retains 12 unique words (0.43% of original 2793, drops 2781)', 'datetime': '2024-07-17T09:30:52.757195', 'gensim': '4.3.0', 'python': '3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26120-SP0', 'event': 'prepare_vocab'}
2024-07-17 09:30:52,758 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 leaves 919 word corpus (12.34% of original 7447, drops 6528)', 'datetime': '2024-07-17T09:30:52.758197', 'gensim': '4.3.0', 'python': '3.11.5 | packaged by Anaconda, Inc. | (main, 

Training model...


In [14]:
from gensim.models import Word2Vec

model_name = "300features_40minwords_10context"
model = Word2Vec.load(model_name)

2024-07-17 09:30:52,813 : INFO : loading Word2Vec object from 300features_40minwords_10context
2024-07-17 09:30:52,821 : INFO : loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2024-07-17 09:30:52,822 : INFO : setting ignored attribute cum_table to None
2024-07-17 09:30:52,822 : INFO : Word2Vec lifecycle event {'fname': '300features_40minwords_10context', 'datetime': '2024-07-17T09:30:52.822644', 'gensim': '4.3.0', 'python': '3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26120-SP0', 'event': 'loaded'}


In [15]:
words = "man woman child kitchen".split()
available_words = [word for word in words if word in model.wv]
missing_words = [word for word in words if word not in model.wv]

print(f"Available words: {available_words}")
print(f"Missing words: {missing_words}")


Available words: []
Missing words: ['man', 'woman', 'child', 'kitchen']


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Your sample corpora
sample_corpora = ["This is the first document.",
                  "This document is the second document.",
                  "And this is the third one.",
                  "Is this the first document?"]

count_vectorizer = CountVectorizer()
wm = count_vectorizer.fit_transform(sample_corpora)

doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(wm)]
feat_names = count_vectorizer.get_feature_names_out()

sample_df = pd.DataFrame(data=wm.toarray(), index=doc_names, columns=feat_names)
print(sample_df)


      and  document  first  is  one  second  the  third  this
Doc0    0         1      1   1    0       0    1      0     1
Doc1    0         2      0   1    0       1    1      0     1
Doc2    1         0      0   1    1       0    1      1     1
Doc3    0         1      1   1    0       0    1      0     1


In [17]:
X_train, X_test , y_train, y_test = train_test_split(df['Review'].values,df['Rating'].values,test_size=0.2,random_state=123,stratify=df['Rating'].values)

With new dataset (25,000 IMDB movie reviews)

In [18]:
# Read data from files 
train = pd.read_csv( "labeledTrainData.tsv", header=0, 
 delimiter="\t", quoting=3 )
test = pd.read_csv( "testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "unlabeledTrainData.tsv", header=0, 
 delimiter="\t", quoting=3 )

# Verify the number of reviews that were read (100,000 in total)
print("Read %d labeled train reviews, %d labeled test reviews, " \
 "and %d unlabeled reviews\n" % (train["review"].size,  
 test["review"].size, unlabeled_train["review"].size ))

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [19]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [20]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [21]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

# print("Parsing sentences from unlabeled set")
# for review in unlabeled_train["review"]:
#     sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set


  review_text = BeautifulSoup(review).get_text()
  review_text = BeautifulSoup(review).get_text()


In [22]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(train["review"]).toarray()
y = train["sentiment"]  # Assuming the sentiment column has the labels

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

### KNN

In [23]:
from sklearn.neighbors import KNeighborsClassifier
import joblib

# Initialize the model
model_knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
model_knn.fit(X_train, y_train)

# Evaluate the model
y_pred_knn = model_knn.predict(X_test)
print(classification_report(y_test, y_pred_knn))

# Save the model
joblib.dump(model_knn, 'model_week5_knn.pkl')

              precision    recall  f1-score   support

           0       0.73      0.74      0.73      2500
           1       0.74      0.72      0.73      2500

    accuracy                           0.73      5000
   macro avg       0.73      0.73      0.73      5000
weighted avg       0.73      0.73      0.73      5000



['model_week5_knn.pkl']

### Naive Bayes

In [24]:
from sklearn.naive_bayes import GaussianNB

# Initialize the model
model_nb = GaussianNB()

# Train the model
model_nb.fit(X_train, y_train)

# Evaluate the model
y_pred_nb = model_nb.predict(X_test)
print(classification_report(y_test, y_pred_nb))

# Save the model
joblib.dump(model_nb, 'model_week5_nb.pkl')

              precision    recall  f1-score   support

           0       0.79      0.82      0.80      2500
           1       0.81      0.78      0.80      2500

    accuracy                           0.80      5000
   macro avg       0.80      0.80      0.80      5000
weighted avg       0.80      0.80      0.80      5000



['model_week5_nb.pkl']

### Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
model_dt = DecisionTreeClassifier()

# Train the model
model_dt.fit(X_train, y_train)

# Evaluate the model
y_pred_dt = model_dt.predict(X_test)
print(classification_report(y_test, y_pred_dt))

# Save the model
joblib.dump(model_dt, 'model_week5_dt.pkl')

              precision    recall  f1-score   support

           0       0.71      0.71      0.71      2500
           1       0.71      0.71      0.71      2500

    accuracy                           0.71      5000
   macro avg       0.71      0.71      0.71      5000
weighted avg       0.71      0.71      0.71      5000



['model_week5_dt.pkl']

### SVM

In [None]:
from sklearn.svm import SVC

# Initialize the model
model_svm = SVC()

# Train the model
model_svm.fit(X_train, y_train)

# Evaluate the model
y_pred_svm = model_svm.predict(X_test)
print(classification_report(y_test, y_pred_svm))

# Save the model
joblib.dump(model_svm, 'model_week5_svm.pkl')

### NN (MLP Classifier)

In [None]:
from sklearn.neural_network import MLPClassifier

# Initialize the model
model_mlp = MLPClassifier(max_iter=300)

# Train the model
model_mlp.fit(X_train, y_train)

# Evaluate the model
y_pred_mlp = model_mlp.predict(X_test)
print(classification_report(y_test, y_pred_mlp))

# Save the model
joblib.dump(model_mlp, 'model_week5_mlp.pkl')