In [None]:
!pip install pyprind

Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3


In [None]:
import os
import pandas as pd
import numpy as np
import pyprind
import pprint
from sklearn.datasets import make_classification
import pickle
import os
import re
import sqlite3
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Mount Google Drive only when it is not there yet. os.listdir() yields bare
# entry names (e.g. 'drive'), never full paths, so the entry name is what has
# to be looked up for this guard to ever be False.
if 'drive' not in os.listdir('/content'):
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder of the extracted review dataset on the mounted Drive.
basepath = (
    '/content/drive/My Drive'
    '/SentimentClassification/aclImdb'
)

In [None]:
# Integer class label for each sentiment sub-directory name.
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)  # Adjust this number if necessary based on the number of reviews
df_list = []  # Accumulates one [review_text, label] row per file

# Process the dataset: walk <basepath>/<split>/<sentiment>/ and read every
# file in sorted order, advancing the progress bar once per file.
for split_name in ('test', 'train'):
    for sentiment_name in ('pos', 'neg'):
        review_dir = os.path.join(basepath, split_name, sentiment_name)
        sentiment_label = labels[sentiment_name]
        for file_name in sorted(os.listdir(review_dir)):
            review_path = os.path.join(review_dir, file_name)
            with open(review_path, 'r', encoding='utf-8') as review_file:
                df_list.append([review_file.read(), sentiment_label])
                pbar.update()

df = pd.DataFrame(df_list, columns=['review', 'sentiment'])

print(df.head())

0% [##                            ] 100% | ETA: 02:05:23

                                              review  sentiment
0  Once again Mr. Costner has dragged out a movie...          0
1  First of all, I would like to say that I am a ...          0
2  I'm a huge fan of both Emily Watson (Breaking ...          0
3  I was pulled into this movie early on, much to...          0
4  This tale of the upper-classes getting their c...          0


In [None]:
# Shuffle the rows with a fixed seed so the order is repeatable, then write
# the shuffled frame to disk without the index column.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [None]:
# Read the reviews back from the CSV written above and preview three rows.
df = pd.read_csv(
    'movie_data.csv',
    encoding='utf-8',
)
df.head(3)

Unnamed: 0,review,sentiment
0,I saw this on sale - NEW - at my local store f...,0
1,"and it's only January, still I'm sure of it!<b...",0
2,This movie is so unreal. French movies like th...,0


In [None]:
# Keep this the last expression of the cell so the displayed (rows, columns)
# is computed from the loaded data rather than hard-coded.
df.shape

(50000, 2)

In [None]:
# Three toy documents used to illustrate the bag-of-words representation.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary and build the term-count matrix in one step.
count = CountVectorizer()
bag = count.fit_transform(docs)

pp = pprint.PrettyPrinter(indent=4)  # indented output for the vocabulary dict

print("Vocabulary:")
pp.pprint(count.vocabulary_)

print("Feature vectors:")
dense_counts = bag.toarray()
print(dense_counts)


Vocabulary:
{   'and': 0,
    'is': 1,
    'one': 2,
    'shining': 3,
    'sun': 4,
    'sweet': 5,
    'the': 6,
    'two': 7,
    'weather': 8}
Feature vectors:
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [None]:
# Turn the raw term counts of the toy documents into L2-normalised tf-idf
# weights and print them with two decimals.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)

np.set_printoptions(precision=2)
term_counts = count.fit_transform(docs)
tfidf_weights = tfidf.fit_transform(term_counts)
print(tfidf_weights.toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [None]:
 # Last 50 characters of the first review, evaluated from the data; this must
 # stay the cell's final expression so no hard-coded string is shown instead.
 df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [None]:
def preprocessor(text):
    """Strip markup and punctuation from a review while keeping its emoticons.

    Args:
        text: Review string to clean.

    Returns:
        The lower-cased text with tag-like ``<...>`` spans removed and every
        run of non-word characters collapsed to one space, followed by a space
        and the emoticons found in the tag-free text (space-separated, with
        any ``-`` nose removed).
    """
    # Drop anything that looks like a tag, e.g. "<br />".
    text = re.sub(r'<[^>]*>', '', text)
    # Collect emoticons before punctuation is stripped:
    # eyes (: ; =), optional nose (-), mouth ( ) ( D P ).
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # Remove all non-word characters and convert text to lowercase, then append
    # emoticons to end. Raw-string patterns keep "\W", "\)" and "\(" as regex
    # escapes instead of invalid Python string escapes.
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')

    return text

# Clean every review, overwriting the 'review' column with the result.
df['review'] = df['review'].apply(preprocessor)

# Spot-check on the last 50 characters of the first (already cleaned) review.
print("Processed last 50 characters of the first review:")
first_review_tail = df.loc[0, 'review'][-50:]
print(preprocessor(first_review_tail))

# Spot-check on a hand-written sample containing a tag and emoticons.
print("Processed example text:")
sample_text = "</a>This :) is :( a test :-)!"
print(preprocessor(sample_text))

Processed last 50 characters of the first review:
ironheart is so bad it ain t even funny it s sad  
Processed example text:
this is a test  :) :( :)


In [None]:
!pip install nltk



In [None]:
# Stemmer instance shared by tokenizer_porter below.
porter = PorterStemmer()

# Fetch the stop-word corpus read by stopwords.words('english').
nltk.download('stopwords')

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with the module-level ``porter``."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

# Stop-words (kept as a list because the grid-search cell passes `stop` to the
# vectorizer); the set copy gives constant-time membership tests below.
stop = stopwords.words('english')
stop_lookup = set(stop)

# Example usage of the tokenizer and stemmer
text = 'runners like running and thus they run'
print("Tokenized:", tokenizer(text))
print("Stemmed:", tokenizer_porter(text))

# Example for removing stop-words
stemmed_tokens = tokenizer_porter(text)
filtered_tokens = [word for word in stemmed_tokens if word not in stop_lookup]
print("Filtered (stop-words removed):", filtered_tokens)

# Applying these preprocessing steps to the DataFrame
# Assuming 'df' is your DataFrame and 'review' is the column containing text data
# NOTE(review): this overwrites df['review'] with stemmed, stop-word-filtered
# text, so the grid search in the next cell compares its tokenizer / stop-word
# options on input that is already processed — confirm this is intended.
df['review'] = df['review'].apply(lambda x: ' '.join(tokenizer_porter(x)))
df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_lookup]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Tokenized: ['runners', 'like', 'running', 'and', 'thus', 'they', 'run']
Stemmed: ['runner', 'like', 'run', 'and', 'thu', 'they', 'run']
Filtered (stop-words removed): ['runner', 'like', 'run', 'thu', 'run']


In [None]:
stop = stopwords.words('english')

# Data division: the first 25,000 rows train the model, the remainder tests it.
# Positional .iloc slices are half-open, so the splits cannot overlap; the
# label-based .loc slices used before include both endpoints, which placed
# row 25000 in both the training and the test set.
n_train = 25000
X_train = df['review'].iloc[:n_train].values
y_train = df['sentiment'].iloc[:n_train].values
X_test = df['review'].iloc[n_train:].values
y_test = df['sentiment'].iloc[n_train:].values

# Fail early with an actionable message: the classifier cannot be fitted on a
# single class, and GridSearchCV would otherwise only report that every fit failed.
train_classes = np.unique(y_train)
if train_classes.size < 2:
    raise ValueError(
        'Training split contains only class(es) %s; check that both positive '
        'and negative reviews were loaded and that the rows were shuffled '
        'before splitting.' % train_classes.tolist()
    )

# Define a TfidfVectorizer with potential parameters for GridSearch
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Parameter grid for GridSearch: the first dict searches with the vectorizer's
# tf-idf settings as constructed above, the second switches idf and
# normalisation off.
param_grid = [
    {'vect__ngram_range': [(1, 1)],
     'vect__stop_words': [stop, None],
     'vect__tokenizer': [tokenizer, tokenizer_porter],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [1.0, 10.0, 100.0]},
    {'vect__ngram_range': [(1, 1)],
     'vect__stop_words': [stop, None],
     'vect__tokenizer': [tokenizer, tokenizer_porter],
     'vect__use_idf': [False],
     'vect__norm': [None],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [1.0, 10.0, 100.0]}
]

# Pipeline setup with TfidfVectorizer and LogisticRegression
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0, solver='liblinear'))
])

# GridSearchCV setup
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5, verbose=2,
                           n_jobs=-1)  # use n_jobs=-1 to use all cores

# Fitting the model
gs_lr_tfidf.fit(X_train, y_train)

# Best parameter set
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

# Cross-validated accuracy of the best candidate, then its accuracy on the
# held-out test rows
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


ValueError: 
All the 240 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1216, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 1181, in _fit_liblinear
    raise ValueError(
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0


In [None]:
# Start again from the reviews stored in the CSV file.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# English stop-words as a set; the tokenizer defined next filters against it.
english_stop_list = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_list)

def tokenizer(text):
    """Clean a review and split it into tokens with stop-words removed.

    Args:
        text: Review string to tokenize.

    Returns:
        List of lower-cased word tokens followed by the emoticons found in the
        text (with any ``-`` nose removed), excluding every token that is in
        the module-level ``stop_words`` set.
    """
    # Drop anything that looks like a tag, e.g. "<br />".
    text = re.sub(r'<[^>]*>', '', text)
    # Match emoticons on the original-case text: the mouth alternatives "D" and
    # "P" are upper-case and can never match once the text is lower-cased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # The explicit ' ' separator keeps the first emoticon from fusing with the
    # last word when the cleaned text ends in a word character.
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop_words]
    return tokenized

# Feature Extraction: bag-of-words counts of the reviews, tokenized with the
# custom tokenizer defined above.
count = CountVectorizer(
    tokenizer=tokenizer,
    stop_words='english',
    max_df=0.1,
    max_features=5000,
)
review_texts = df['review'].values
X = count.fit_transform(review_texts)

# LDA Model: fit ten topics and keep the per-review topic weights.
lda = LatentDirichletAllocation(
    n_components=10,
    learning_method='batch',
    random_state=123,
)
X_topics = lda.fit_transform(X)

# Display the top words for each topic (highest component weight first)
n_top_words = 5
feature_names = count.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx + 1}:")
    top_word_ids = topic.argsort()[::-1][:n_top_words]
    print(" ".join(feature_names[word_id] for word_id in top_word_ids))

# To print out example reviews from one topic category
def print_reviews_for_topic(topic_idx, n_reviews):
    """Print the start of the reviews that weigh most heavily on one topic.

    Reads the module-level ``X_topics`` array and ``df`` frame.

    Args:
        topic_idx: Zero-based column of ``X_topics`` to rank the reviews by.
        n_reviews: Number of top-ranked reviews to print.
    """
    # Row positions ordered from the highest to the lowest weight for the topic
    reviews_idx = np.argsort(X_topics[:, topic_idx])[::-1]
    # Select the text by column name so the output does not depend on 'review'
    # being the first column of df.
    review_texts = df['review']
    for i in reviews_idx[:n_reviews]:
        print(f"\nReview #{i}:")
        print(review_texts.iloc[i][:300])  # print the first 300 characters of the review

# Example: show three reviews for topic #6 (zero-based index 5), which is
# assumed here to be the horror-movie topic.
print_reviews_for_topic(topic_idx=6 - 1, n_reviews=3)



Topic 1:
book read completely train heart
Topic 2:
series space vampire dracula earth
Topic 3:
jack killer book scarlett high
Topic 4:
stupid american main shows children
Topic 5:
comedy jokes series original family
Topic 6:
stupid girl snakes woman girls
Topic 7:
performance action production version audience
Topic 8:
young house girl goes woman
Topic 9:
effects special budget terrible ed
Topic 10:
alien action waste horrible predator

Review #79:
Through its 2-hour running length, Crash charts the emotional anguish of its 10-odd ensemble of characters when faced with the sometimes blatant and sometimes latent forms of racism underlying in American society. That and the emotional anguish of one of its audiences sitting near the front and desp

Review #2600:
Ill-conceived sequel(..the absurd idea of having the killer snowman on the rampage at an island resort where there is no snow or cold weather)brings back the spirit of the psychopath, returning thanks to a scientific experiment pro

In [None]:
# Rank all reviews by their weight on topic index 5, strongest first, and
# print the first 300 characters of the top three.
horror = X_topics[:, 5].argsort()[::-1]
top_horror = horror[:3]
for iter_idx, movie_idx in enumerate(top_horror):
    header = '\nHorror movie #%d:' % (iter_idx + 1)
    print(header)
    snippet = df['review'][movie_idx][:300]
    print(snippet, '...')


Horror movie #1:
Through its 2-hour running length, Crash charts the emotional anguish of its 10-odd ensemble of characters when faced with the sometimes blatant and sometimes latent forms of racism underlying in American society. That and the emotional anguish of one of its audiences sitting near the front and desp ...

Horror movie #2:
Ill-conceived sequel(..the absurd idea of having the killer snowman on the rampage at an island resort where there is no snow or cold weather)brings back the spirit of the psychopath, returning thanks to a scientific experiment providing foreign elements which reintroduce life to his molecules(..it' ...

Horror movie #3:
Ill-conceived sequel(..the absurd idea of having the killer snowman on the rampage at an island resort where there is no snow or cold weather)brings back the spirit of the psychopath, returning thanks to a scientific experiment providing foreign elements which reintroduce life to his molecules(..it' ...
