In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=23419f7e69e4e7799843d286ddf242a8aca5f8a29c9bfab7884d7e363560f6ce
  Stored in directory: /root/.cache/pip/wheels/9f/34/a4/159aa12d0a510d5ff7c8f0220abbea42e5d81ecf588c4fd884
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


# Data Extraction

In [21]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("IMDb Reviews").getOrCreate()

# Load the IMDb movie reviews dataset into a PySpark DataFrame
imdb_df = spark.read \
              .option("header", True) \
              .option("inferSchema", True) \
              .option("quote", "\"") \
              .option("escape", "\"") \
              .csv("/content/drive/MyDrive/IMDB Dataset.csv")
imdb_df.show()

+--------------------+---------+
|              review|sentiment|
+--------------------+---------+
|One of the other ...| positive|
|A wonderful littl...| positive|
|I thought this wa...| positive|
|Basically there's...| negative|
|Petter Mattei's "...| positive|
|Probably my all-t...| positive|
|I sure would like...| positive|
|This show was an ...| negative|
|Encouraged by the...| negative|
|If you like origi...| positive|
|Phil the Alien is...| negative|
|I saw this movie ...| negative|
|So im not a big f...| negative|
|The cast played S...| negative|
|This a fantastic ...| positive|
|Kind of drawn in ...| negative|
|Some films just s...| positive|
|This movie made i...| negative|
|I remember this f...| positive|
|An awful film! It...| negative|
+--------------------+---------+
only showing top 20 rows



## Description of classes in the sentiment column

In [22]:
imdb_df.select('sentiment').distinct().rdd.map(lambda r: r[0]).collect()

['positive', 'negative']

## Number of reviews in each class

In [23]:
imdb_df.groupBy('sentiment').count().orderBy('count').show()

+---------+-----+
|sentiment|count|
+---------+-----+
| positive|25000|
| negative|25000|
+---------+-----+



# Data Preprocessing

## Data Cleaning

In [24]:
from pyspark.sql.functions import lower, regexp_replace
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import CountVectorizer

# Remove HTML tags, punctuation, and special characters from the text
imdb_df = imdb_df.withColumn("preprocessed_review", regexp_replace(imdb_df["review"], '<.*?>', ''))
imdb_df = imdb_df.withColumn("preprocessed_review", regexp_replace(imdb_df["preprocessed_review"], '[^\w\s]+', ''))
imdb_df = imdb_df.withColumn("preprocessed_review", regexp_replace(imdb_df["preprocessed_review"], '[^A-Za-z0-9 ]+', ''))

In [25]:
imdb_df.show()

+--------------------+---------+--------------------+
|              review|sentiment| preprocessed_review|
+--------------------+---------+--------------------+
|One of the other ...| positive|One of the other ...|
|A wonderful littl...| positive|A wonderful littl...|
|I thought this wa...| positive|I thought this wa...|
|Basically there's...| negative|Basically theres ...|
|Petter Mattei's "...| positive|Petter Matteis Lo...|
|Probably my all-t...| positive|Probably my allti...|
|I sure would like...| positive|I sure would like...|
|This show was an ...| negative|This show was an ...|
|Encouraged by the...| negative|Encouraged by the...|
|If you like origi...| positive|If you like origi...|
|Phil the Alien is...| negative|Phil the Alien is...|
|I saw this movie ...| negative|I saw this movie ...|
|So im not a big f...| negative|So im not a big f...|
|The cast played S...| negative|The cast played S...|
|This a fantastic ...| positive|This a fantastic ...|
|Kind of drawn in ...| negat

## Data Transformation

### Conversion of text to lowercase

In [26]:
# Convert all text to lowercase
imdb_df = imdb_df.withColumn("preprocessed_review", lower(imdb_df["preprocessed_review"]))

### Toeknization of text into words

In [27]:
# Tokenize the text into words
tokenizer = RegexTokenizer(inputCol="preprocessed_review", outputCol="words", pattern="\\W")
words_df = tokenizer.transform(imdb_df)

### Removal of stop words from the text

In [28]:
# Remove stop words from the text
stop_words = StopWordsRemover(inputCol="words", outputCol="text_no_stop_words").transform(words_df)

### Conversion of words into numeric vectors

In [29]:
# Convert the words into a numeric vector
vectorizer = CountVectorizer(inputCol="words", outputCol="features")
features_df = vectorizer.fit(words_df).transform(words_df)


In [30]:
features_df.show()

+--------------------+---------+--------------------+--------------------+--------------------+
|              review|sentiment| preprocessed_review|               words|            features|
+--------------------+---------+--------------------+--------------------+--------------------+
|One of the other ...| positive|one of the other ...|[one, of, the, ot...|(221897,[0,1,2,3,...|
|A wonderful littl...| positive|a wonderful littl...|[a, wonderful, li...|(221897,[0,1,2,3,...|
|I thought this wa...| positive|i thought this wa...|[i, thought, this...|(221897,[0,1,2,3,...|
|Basically there's...| negative|basically theres ...|[basically, there...|(221897,[0,1,2,3,...|
|Petter Mattei's "...| positive|petter matteis lo...|[petter, matteis,...|(221897,[0,1,2,3,...|
|Probably my all-t...| positive|probably my allti...|[probably, my, al...|(221897,[0,1,2,3,...|
|I sure would like...| positive|i sure would like...|[i, sure, would, ...|(221897,[0,1,2,3,...|
|This show was an ...| negative|this sho

# Data Splitting into training and test set

In [32]:
# Split the dataset into training and test sets
train_df, test_df = features_df.randomSplit([0.8, 0.2], seed=42)

## Training Dataset

In [34]:
train_df.show()

+--------------------+---------+--------------------+--------------------+--------------------+
|              review|sentiment| preprocessed_review|               words|            features|
+--------------------+---------+--------------------+--------------------+--------------------+
|\b\b\b\bA Turkish...| positive|a turkish bath se...|[a, turkish, bath...|(221897,[0,1,2,3,...|
|!!!! MILD SPOILER...| negative| mild spoilers th...|[mild, spoilers, ...|(221897,[0,1,2,3,...|
|!!!! POSSIBLE MIL...| negative| possible mild sp...|[possible, mild, ...|(221897,[0,1,2,3,...|
|" Now in India's ...| positive| now in indias su...|[now, in, indias,...|(221897,[0,1,2,3,...|
|" Så som i himmel...| positive| s som i himmelen...|[s, som, i, himme...|(221897,[0,1,2,3,...|
|"... the beat is ...| positive| the beat is too ...|[the, beat, is, t...|(221897,[0,1,2,3,...|
|"200l: A Space Od...| positive|200l a space odys...|[200l, a, space, ...|(221897,[0,1,2,3,...|
|"8 SIMPLE RULES.....| positive|8 simple

## Testing Dataset

In [35]:
test_df.show()

+--------------------+---------+--------------------+--------------------+--------------------+
|              review|sentiment| preprocessed_review|               words|            features|
+--------------------+---------+--------------------+--------------------+--------------------+
|!!!! MILD SPOILER...| negative| mild spoilers wi...|[mild, spoilers, ...|(221897,[0,1,2,3,...|
|" While sporadica...| negative| while sporadical...|[while, sporadica...|(221897,[0,1,2,3,...|
|"2001: A Space Od...| positive|2001 a space odys...|[2001, a, space, ...|(221897,[0,1,2,3,...|
|"A Gentleman's Ga...| negative|a gentlemans game...|[a, gentlemans, g...|(221897,[0,1,2,3,...|
|"A Thief in the N...| positive|a thief in the ni...|[a, thief, in, th...|(221897,[0,1,2,3,...|
|"A research scien...| negative|a research scient...|[a, research, sci...|(221897,[0,1,2,3,...|
|"A young woman un...| negative|a young woman unw...|[a, young, woman,...|(221897,[0,1,2,3,...|
|"All men are guil...| positive|all men 