In [1]:
import numpy as np
import pandas as pd  

In [2]:
import string 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


In [4]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daara\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\daara\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
# Load the dataset
df1 = pd.read_csv("IMDB Dataset.csv")

# Select the first 500 rows for simplicity
df = df1.iloc[:500]
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
495,"""American Nightmare"" is officially tied, in my...",negative
496,"First off, I have to say that I loved the book...",negative
497,This movie was extremely boring. I only laughe...,negative
498,I was disgusted by this movie. No it wasn't be...,negative


In [6]:

# Preprocessing function
def preprocess_text(text):
    # Conversion to Lower case
    text = text.lower()

    # Removal of punctuation marks from the text
    text = ''.join([char for char in text if char not in string.punctuation])

    # Removal of numeric numbers from the text
    text = ''.join([char for char in text if not char.isdigit()])

    # Conversion to Word tokenize
    tokens = word_tokenize(text)

    # Removal of stopwords from the text
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Applying stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    final_text = ' '.join(stemmed_tokens)
    
    return final_text

# Apply preprocessing to the 'review' column
df['review'] = df['review'].apply(lambda x: preprocess_text(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x: preprocess_text(x))


In [7]:

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [8]:

# Train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)


In [9]:

# Evaluate the model
y_pred = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8
Classification Report:
              precision    recall  f1-score   support

    negative       0.72      0.91      0.81        46
    positive       0.90      0.70      0.79        54

    accuracy                           0.80       100
   macro avg       0.81      0.81      0.80       100
weighted avg       0.82      0.80      0.80       100



In [10]:

# User Input Handling and Prediction
def predict_sentiment(user_input):
    preprocessed_input = preprocess_text(user_input)
    input_vector = tfidf_vectorizer.transform([preprocessed_input])
    prediction = svm_model.predict(input_vector)
    return prediction[0]


In [15]:

# Ask for user input and predict sentiment
user_text = input("Enter your text: ")
sentiment = predict_sentiment(user_text)
print("Predicted sentiment:", sentiment)


Predicted sentiment: positive
