In [None]:
!pip install pyspark

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import string
from pyspark.sql.functions import desc


Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=e70f4315ebdcad4c89c88d78cb281a6dffdac89353cc08d15b1d4854b271d297
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:

# Initialize Spark
# The same configuration object is handed to both the context and the session.
conf = SparkConf()
conf.setAppName("BasicWordCount")

# Low-level entry point used for the RDD operations.
sc = SparkContext.getOrCreate(conf=conf)

# High-level entry point used for the DataFrame operations.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)


In [None]:

# Define stopwords
# Lower-case words that the word-count filter drops before counting.
STOPWORDS = {
    "the", "a", "and", "of", "in", "to", "is",
    "it", "for", "with", "on", "at", "by", "an",
}

In [None]:
# Function to clean text
# Deletion table for str.translate, built once instead of on every call
# (clean_text runs once per input line).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text: str) -> str:
    """Lower-case ``text`` and delete every character in ``string.punctuation``.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with ASCII punctuation removed. Whitespace is
        left as-is, and characters outside ``string.punctuation`` (for
        example typographic quotes) are kept.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

In [None]:

# Function to process the file and count words
def process_file(file_path, top_n=25, stop_session=False):
    """Count word frequencies in a text file and print the most frequent words.

    Every line is normalised with ``clean_text`` and split on whitespace;
    words found in ``STOPWORDS`` are dropped and the remaining occurrences
    are summed per word.

    Args:
        file_path: Path passed to ``sc.textFile``.
        top_n: Number of rows to display. Defaults to 25.
        stop_session: If True, call ``spark.stop()`` after the table has been
            shown. Defaults to False so the shared session stays usable for
            further calls and for re-running the cell.

    Returns:
        None. The result table is printed by ``DataFrame.show``.
    """
    # Read the text file into an RDD
    lines = sc.textFile(file_path)

    # Process the text. str.split() with no argument never yields empty
    # strings, so no separate empty-word check is needed.
    word_counts = (lines.flatMap(lambda line: clean_text(line).split())  # Split lines into words
                   .filter(lambda word: word not in STOPWORDS)  # Remove stopwords
                   .map(lambda word: (word, 1))  # Map words to (word, 1)
                   .reduceByKey(lambda a, b: a + b))  # Reduce by key to count occurrences

    # Convert RDD to DataFrame. An explicit schema avoids schema inference,
    # which fails on an empty RDD (empty file, or only stopwords).
    words_df = spark.createDataFrame(word_counts, schema="word string, count bigint")

    # Single sort on the DataFrame; sorting the RDD beforehand was redundant
    # because this sort decides the final order anyway.
    words_df = words_df.sort(desc("count"))

    # Show top results
    words_df.show(top_n, truncate=False)

    # Stopping is opt-in: stopping unconditionally made every later call fail.
    if stop_session:
        spark.stop()

if __name__ == "__main__":
    # Input text to analyse.
    file_path = "/content/shakespeare.txt"
    process_file(file_path=file_path)

+-----+-----+
|word |count|
+-----+-----+
|my   |393  |
|i    |344  |
|that |322  |
|thy  |287  |
|thou |235  |
|not  |167  |
|me   |164  |
|but  |163  |
|love |162  |
|thee |162  |
|so   |145  |
|be   |142  |
|as   |121  |
|all  |117  |
|you  |112  |
|which|108  |
|his  |107  |
|when |106  |
|this |105  |
|your |100  |
|doth |88   |
|do   |84   |
|from |82   |
|no   |79   |
|or   |79   |
+-----+-----+
only showing top 25 rows



In [None]:
"""# Extended Word Count"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import desc
import string

# Initialize SparkSession
spark = (
    SparkSession.builder
    .appName("ExtendedWordCount")
    .getOrCreate()
)

# The RDD API is reached through the context that backs the session.
sc = spark.sparkContext

# Define stopwords
# Lower-case words that the word-count filter drops before counting.
STOPWORDS = {
    "the", "a", "and", "of", "in", "to", "is",
    "it", "for", "with", "on", "at", "by", "an",
}

# Function to clean text
# Deletion table for str.translate, built once instead of on every call
# (clean_text runs once per input line).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text: str) -> str:
    """Lower-case ``text`` and delete every character in ``string.punctuation``.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with ASCII punctuation removed; whitespace and
        all other characters are kept.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

# Function to process multiple files and count words
def process_files(file_paths, top_n=25):
    """Count word frequencies across several text files and print the top words.

    All files are read into one RDD, so the counts are totals over the whole
    set of files. Lines are normalised with ``clean_text``, split on
    whitespace, and words found in ``STOPWORDS`` are dropped.

    Args:
        file_paths: A list of paths, or a single path string. The paths are
            comma-joined into one argument for ``sc.textFile``.
        top_n: Number of rows to display. Defaults to 25.

    Returns:
        None. The result table is printed by ``DataFrame.show``.
    """
    # A bare string would otherwise be comma-joined character by character.
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    # Create an RDD for all files
    lines = sc.textFile(",".join(file_paths))

    # Clean and process the text. str.split() with no argument never yields
    # empty strings, so no separate empty-word check is needed.
    word_counts = (lines.flatMap(lambda line: clean_text(line).split())  # Split lines into words
                   .filter(lambda word: word not in STOPWORDS)  # Remove stopwords
                   .map(lambda word: (word, 1))  # Map words to (word, 1)
                   .reduceByKey(lambda a, b: a + b))  # Reduce by key to count occurrences

    # Convert RDD to DataFrame. An explicit schema avoids schema inference,
    # which fails on an empty RDD (empty files, or only stopwords).
    words_df = spark.createDataFrame(word_counts, schema="word string, count bigint")

    # Single sort on the DataFrame; sorting the RDD beforehand was redundant
    # because this sort decides the final order anyway.
    words_df = words_df.sort(desc("count"))

    # Show top results
    words_df.show(top_n, truncate=False)

if __name__ == "__main__":
    # List of file paths whose words are counted together.
    file_paths = [
        "/content/text1.txt",
        "/content/text2.txt",
    ]
    process_files(file_paths=file_paths)



+------------+-----+
|word        |count|
+------------+-----+
|intelligence|3    |
|machines    |3    |
|artificial  |2    |
|these       |2    |
|machine     |2    |
|learning    |2    |
|models      |2    |
|human       |1    |
|are         |1    |
|programmed  |1    |
|think       |1    |
|like        |1    |
|ai          |1    |
|science     |1    |
|aiming      |1    |
|smart       |1    |
|become      |1    |
|essential   |1    |
|technology  |1    |
|involves    |1    |
|use         |1    |
|algorithms  |1    |
|statistical |1    |
|allow       |1    |
|computers   |1    |
+------------+-----+
only showing top 25 rows



In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import string
from pyspark.sql.functions import desc

# Initialize Spark
conf = SparkConf().setAppName("ExtendedWordCount")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Define the context explicitly so this cell does not depend on an `sc`
# left over from an earlier cell (it would be undefined in a fresh run).
sc = spark.sparkContext

# Define stopwords
# Lower-case words that the word-count filter drops before counting.
STOPWORDS = {
    "the", "a", "and", "of", "in", "to", "is",
    "it", "for", "with", "on", "at", "by", "an",
}

# Function to clean text
# Deletion table for str.translate, built once instead of on every call
# (clean_text runs once per input line).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text: str) -> str:
    """Lower-case ``text`` and delete every character in ``string.punctuation``.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with ASCII punctuation removed; whitespace and
        all other characters are kept.
    """
    lowered = text.lower()  # Convert to lowercase
    return lowered.translate(_PUNCTUATION_TABLE)  # Remove punctuation

# Function to process multiple files and count words
def process_files(file_paths, top_n=25):
    """Count word frequencies across several text files and print the top words.

    All files are read into one RDD, so the counts are totals over the whole
    set of files. Lines are normalised with ``clean_text``, split on
    whitespace, and words found in ``STOPWORDS`` are dropped.

    Args:
        file_paths: A list of paths, or a single path string. The paths are
            comma-joined into one argument for ``textFile``.
        top_n: Number of rows to display. Defaults to 25.

    Returns:
        None. The result table is printed by ``DataFrame.show``.
    """
    # A bare string would otherwise be comma-joined character by character.
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    # Create an RDD for all files. The context is taken from the session
    # because this cell defines `spark` but no `sc` of its own.
    lines = spark.sparkContext.textFile(",".join(file_paths))

    # Process the text. str.split() with no argument never yields empty
    # strings, so no separate empty-word check is needed.
    word_counts = (lines.flatMap(lambda line: clean_text(line).split())  # Split lines into words
                   .filter(lambda word: word not in STOPWORDS)  # Remove stopwords
                   .map(lambda word: (word, 1))  # Map words to (word, 1)
                   .reduceByKey(lambda a, b: a + b))  # Reduce by key to count occurrences

    # Convert RDD to DataFrame. An explicit schema avoids schema inference,
    # which fails on an empty RDD (empty files, or only stopwords).
    words_df = spark.createDataFrame(word_counts, schema="word string, count bigint")

    # Sort DataFrame by count in descending order. Sorting the RDD beforehand
    # was redundant because this sort decides the final order anyway.
    words_df = words_df.sort(desc("count"))

    # Show top results
    words_df.show(top_n, truncate=False)

# Run the processing on your documents
file_paths = [
    "/content/doc1.txt",
    "/content/doc2.txt",
]
process_files(file_paths=file_paths)

import subprocess
import sys

# Command to run spark-submit
command = ["spark-submit", "extended_word_count.py"]

# Execute the command, capturing its output so it can be shown in the notebook.
result = subprocess.run(command, capture_output=True, text=True)

if result.stdout:
    print(result.stdout)

# Report a non-zero exit status instead of silently discarding it; the
# notebook keeps running so the remaining cells are unaffected.
if result.returncode != 0:
    print(f"spark-submit exited with status {result.returncode}", file=sys.stderr)
    if result.stderr:
        print(result.stderr, file=sys.stderr)






+-----------+-----+
|word       |count|
+-----------+-----+
|word       |15   |
|or         |7    |
|words      |7    |
|text       |6    |
|count      |5    |
|may        |5    |
|be         |4    |
|when       |3    |
|used       |3    |
|counts     |3    |
|details    |3    |
|variations |3    |
|as         |3    |
|are        |3    |
|space      |3    |
|sources    |3    |
|definitions|3    |
|processing |3    |
|this       |3    |
|definition |3    |
|how        |3    |
|consensus  |3    |
|from       |2    |
|counting   |2    |
|also       |2    |
+-----------+-----+
only showing top 25 rows



CompletedProcess(args=['spark-submit', 'extended_word_count.py'], returncode=2)