In [30]:
import nltk
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [25]:
# Import dataset and replace labels with 0 and 1 for classification
df = pd.read_csv('/IMDB Dataset.csv', encoding='Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [26]:
# Define stop_words and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [27]:
# Define strip_html function to handle text and file input
def strip_html(text):
    if isinstance(text, str):
        return BeautifulSoup(text, "html.parser").get_text()
    else:
        # If it's not a string, assume it's a file name and read its contents
        with open(text, 'r', encoding='utf-8') as file:
            return BeautifulSoup(file.read(), "html.parser").get_text()

In [28]:
# Define clean_text function
def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

In [31]:
# Create a new column for processed reviews
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))
# Display all the data after cleaning
pd.set_option('display.max_columns', None)  # Show all columns
# print(df)
df.head(20)

  return BeautifulSoup(text, "html.parser").get_text()


Unnamed: 0,review,sentiment,Processed_Reviews
0,One of the other reviewers has mentioned that ...,1,one reviewer ha mention watch 1 oz episode hoo...
1,A wonderful little production. <br /><br />The...,1,wonderful little production film technique una...
2,I thought this was a wonderful way to spend ti...,1,think wa wonderful way spend time hot summer w...
3,Basically there's a family where a little boy ...,0,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visually stun fi...
5,"Probably my all-time favorite movie, a story o...",1,probably time favorite movie story selflessnes...
6,I sure would like to see a resurrection of a u...,1,sure would like see resurrection date seahunt ...
7,"This show was an amazing, fresh & innovative i...",0,show wa amaze fresh innovative idea 70 first a...
8,Encouraged by the positive comments about this...,0,encourage positive comment film wa look forwar...
9,If you like original gut wrenching laughter yo...,1,like original gut wrench laughter like movie y...


In [32]:
#Defining input and target variable
x = df['Processed_Reviews']
y = df['sentiment']

In [33]:
#Training and splitting
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)


In [34]:
#Vectorization and Bag of words method with default parameters
count_vect = CountVectorizer().fit(df['Processed_Reviews'].values.astype('U'))
bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))

In [35]:
#instantiate the model (using the default parameters)
SVM = SVC()

In [43]:
# fit the model with pre-processed data
SVM.fit(bow_train, y_train)

In [38]:
#perform classification and prediction on samples in tf_test
predicted_SVM = SVM.predict(bow_test)
print(classification_report(y_test, predicted_SVM))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87      5035
           1       0.86      0.89      0.87      4965

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [39]:
#Creating a Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('SVM', SVC())
])

In [40]:
#Defining hyperparameters
parameters = {
    'vect__max_df':[0.1,0.2,0.3,0.4,0.5,0.6,0.7],
    'vect__ngram_range':  [(1,1), (1,2), (1,3)],
    'SVM__kernel': ['poly', 'rbf', 'sigmoid'],
    'SVM__C': [50, 10, 1.0, 0.1, 0.01]}

In [41]:
# define grid search
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(pipeline, param_grid=parameters, refit = True, verbose = 3, cv=5)
grid_result = grid_search.fit(df.loc[:100, 'Processed_Reviews'].values.astype('U'), df.loc[:100, 'sentiment'].values.astype('U'))

Fitting 5 folds for each of 315 candidates, totalling 1575 fits
[CV 1/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.571 total time=   0.0s
[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.550 total time=   0.0s
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.550 total time=   0.0s
[CV 4/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.600 total time=   0.0s
[CV 5/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.600 total time=   0.0s
[CV 1/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 2);, score=0.571 total time=   0.1s
[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 2);, score=0.550 total time=   0.1s
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 2);, score=0.550 total time=

In [42]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


Best: 0.803333 using {'SVM__C': 50, 'SVM__kernel': 'sigmoid', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 2)}
0.574286 (0.022406) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 1)}
0.574286 (0.022406) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 2)}
0.574286 (0.022406) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 3)}
0.574286 (0.022406) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.2, 'vect__ngram_range': (1, 1)}
0.574286 (0.022406) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.2, 'vect__ngram_range': (1, 2)}
0.574286 (0.022406) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.2, 'vect__ngram_range': (1, 3)}
0.574286 (0.022406) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.3, 'vect__ngram_range': (1, 1)}
0.574286 (0.022406) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.3, 'vect