In [1]:
import os
os.environ['KAGGLE_USERNAME'] =
os.environ['KAGGLE_KEY'] =
!kaggle datasets download -d vishakhdapat/imdb-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/vishakhdapat/imdb-movie-reviews
License(s): MIT


In [2]:
!unzip /content/imdb-movie-reviews.zip

Archive:  /content/imdb-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [3]:
import pandas as pd
df = pd.read_csv('/content/IMDB Dataset.csv')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

##Check null values

In [6]:
df.isnull().sum()
df.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


In [7]:
df = df.fillna('')

In [8]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


##Preprocessing of text

In [9]:
import re
import string
import nltk
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

def preprocess_text(txt):
    # Lowercase
    txt = txt.lower()
    # Remove HTML tags (if any)
    txt = re.sub(r"<.*?>", " ", txt)
    # Remove URLs (if any)
    txt = re.sub(r"https?://\S+|www\.\S+", " ", txt)
    # Remove Punctuation
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers (optional, if you want to keep numbers, you can skip this step)
    txt = re.sub(r'[^A-Za-z\s]', '', txt)
    # Tokenize text
    words = word_tokenize(txt)
    # Remove words less than two letters
    words = [word for word in words if len(word) > 2]
    # Remove stopwords
    words = [word for word in words if word not in STOPWORDS]
    # Lemmatize
    lmt = WordNetLemmatizer()
    words = [lmt.lemmatize(word) for word in words]
    return " ".join(words)


# Sample text (your input text)
text = '''Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning.Education Details'''

# Preprocess the text
cleaned_text = preprocess_text(text)

# Print the cleaned text
print("Cleaned Text:", cleaned_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Cleaned Text: skill programming language python panda numpy scipy scikitlearn matplotlib sql java javascriptjquery machine learning regression svm nave bayes knn random forest decision tree boosting technique cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural net database visualization mysql sqlserver cassandra hbase elasticsearch dj dcjs plotly kibana matplotlib ggplot tableau others regular expression html cs angular logstash kafka python flask git docker computer vision open understanding deep learningeducation detail


In [10]:
df['review'] = df['review'].apply(preprocess_text)

In [11]:
df['review'][0]

'one reviewer mentioned watching episode youll hooked right exactly happened first thing struck brutality unflinching scene violence set right word trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word called nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda city home manyaryans muslim gangsta latino christian italian irish moreso scuffle death stare dodgy dealing shady agreement never far away would say main appeal show due fact go show wouldnt dare forget pretty picture painted mainstream audience forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste got accustomed high level graphic violence violence injustice crooked guard wholl sold nickel inmate wholl kill order get away well mannered middle class inmate turned prison bitch due lack street skill pr

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['sentiment'])
df['sentiment'] = le.transform(df['sentiment'])

In [13]:
df['sentiment'].unique()

array([1, 0])

In [28]:
df['sentiment'][0]

np.int64(1)

## Vectorization

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

tfidf.fit(df['review'])
requiredText  = tfidf.transform(df['review'])

## Model Training

In [15]:
X = requiredText[:5000]
Y = df['sentiment'][:5000]

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, stratify=Y, random_state=1)
# Ensure that X_train and X_test are dense if they are sparse
X_train = X_train.toarray() if hasattr(X_train, 'toarray') else X_train
X_test = X_test.toarray() if hasattr(X_test, 'toarray') else X_test
model = LogisticRegression()
model.fit(X_train, Y_train)

#accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print(f'Accuracy on train data : {training_data_accuracy:.4f}')

#accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print(f'Accuracy on test data : {test_data_accuracy:.4f}')

Accuracy on train data : 0.9602
Accuracy on test data : 0.8660


In [18]:
import pickle
pickle.dump(model,open('model.pkl','wb'))
pickle.dump(tfidf,open('tfidf.pkl','wb'))

In [19]:
import pickle

model = pickle.load(open('model.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

In [35]:
text = "one cerebr insight movi ever seen script languag costum sceneri plot charact etc suprem bore watch intens even listen movi work mani time lost count scarlett age grace acquir wisdom beyond year end movi take tara ireland famili origin see result anoth civil war play time irish english movi depict doubl standard men time man still respect girlfriend whore woman mere seen privat place man societi accus improprieti cours scarlett alway think outsid box break rule necessari creat need chang help peopl surviv scarlett beauti definit bone strength self esteem wisdom grow age rememb line confront much plate tomorrow anoth day way carri determin courag learn experi good movi show daughter teach woman import respect men especi handsom one two side may treat woman differ depend act respect attract woman need learn movi teach movi like therapi better gone wind second part take place ireland anyon irish decent cherish sceneri peopl scarlett charact within"

text = preprocess_text(text)
text  = tfidf.transform([text])
text = text.toarray() if hasattr(text, 'toarray') else text
pred = model.predict(text)

if pred == [1]:
    print("positive comment")
else:
    print("negative comment")

positive comment
