In [7]:
!pip install pyspark



In [10]:
from pyspark import SparkContext, SparkConf

# Configure and obtain (or reuse) a SparkContext running against the "local" master.
conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("Surname Length")
)
sc = SparkContext.getOrCreate(conf=conf)

# Each line of surnames.txt becomes one RDD element; keep the lines that are
# at most 7 characters long.
surnames_rdd = sc.textFile("surnames.txt")
short_surnames_rdd = surnames_rdd.filter(lambda surname: len(surname) <= 7)

# collect() brings the filtered elements back to the driver as a Python list.
short_surnames = short_surnames_rdd.collect()
print(short_surnames)

['Əliyev', 'Həsənov', 'Quliyev', 'Əhmədov', 'Abbasov', 'Cəfərov', 'Əliyev', 'Həsənov', 'Quliyev', 'Əhmədov']


In [14]:
# Take the first character of every short surname.
# The upstream length filter (len <= 7) also lets empty lines through, and
# indexing "" with [0] raises IndexError inside the Spark task, so empty
# entries are dropped before the first character is extracted.
first_letter_rdd = short_surnames_rdd.filter(bool).map(lambda surname: surname[0])
first_letters = first_letter_rdd.collect()

print(first_letters)

['Ə', 'H', 'Q', 'Ə', 'A', 'C', 'Ə', 'H', 'Q', 'Ə']


In [17]:
# Distribute the integers 1..10 and derive a second RDD holding their squares.
numbers_rdd = sc.parallelize(range(1, 11))
squares_rdd = numbers_rdd.map(lambda n: n * n)

# Materialise both RDDs on the driver and print them on separate lines.
numbers, squares = numbers_rdd.collect(), squares_rdd.collect()
print(numbers, squares, sep="\n")

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [18]:
# union() concatenates the two RDDs; values present in both appear twice.
union_rdd = numbers_rdd.union(squares_rdd)
union_values = union_rdd.collect()
print(union_values)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [19]:
# Values that occur in both the numbers and the squares RDD.
intersection_rdd = numbers_rdd.intersection(squares_rdd)
common_values = intersection_rdd.collect()
print(common_values)

[4, 1, 9]


In [20]:
# Numbers that are NOT present in the squares RDD.
subtraction_rdd = numbers_rdd.subtract(squares_rdd)
remaining_values = subtraction_rdd.collect()
print(remaining_values)

[2, 6, 8, 10, 3, 5, 7]


In [22]:
# Every (number, square) pairing of the two RDDs, collected as a list of tuples.
cartesian_rdd = numbers_rdd.cartesian(squares_rdd)
all_pairs = cartesian_rdd.collect()
print(all_pairs)

[(1, 1), (1, 4), (1, 9), (1, 16), (1, 25), (1, 36), (1, 49), (1, 64), (1, 81), (1, 100), (2, 1), (3, 1), (2, 4), (2, 9), (3, 4), (3, 9), (2, 16), (2, 25), (2, 36), (2, 49), (3, 16), (3, 25), (3, 36), (3, 49), (2, 64), (2, 81), (2, 100), (3, 64), (3, 81), (3, 100), (4, 1), (5, 1), (6, 1), (7, 1), (4, 4), (4, 9), (5, 4), (5, 9), (6, 4), (6, 9), (7, 4), (7, 9), (4, 16), (4, 25), (4, 36), (4, 49), (5, 16), (5, 25), (5, 36), (5, 49), (6, 16), (6, 25), (6, 36), (6, 49), (7, 16), (7, 25), (7, 36), (7, 49), (4, 64), (4, 81), (4, 100), (5, 64), (5, 81), (5, 100), (6, 64), (6, 81), (6, 100), (7, 64), (7, 81), (7, 100), (8, 1), (9, 1), (10, 1), (8, 4), (8, 9), (9, 4), (9, 9), (10, 4), (10, 9), (8, 16), (8, 25), (8, 36), (8, 49), (9, 16), (9, 25), (9, 36), (9, 49), (10, 16), (10, 25), (10, 36), (10, 49), (8, 64), (8, 81), (8, 100), (9, 64), (9, 81), (9, 100), (10, 64), (10, 81), (10, 100)]


In [27]:
# Read both candidate documents fully into memory as text.
# NOTE(review): no encoding is passed, so the platform default is used — confirm
# that is the intended encoding for these files.
with open('doc1.TXT') as file:
    doc1 = file.read()
with open('doc2.txt') as file:
    doc2 = file.read()

# The longer text (by character count) is treated as the original;
# when both have the same length, doc2 is chosen.
original_doc = doc1 if len(doc1) > len(doc2) else doc2

print(f"The original document is:\n\n{original_doc}")

The original document is:

Good
Morning
Good
Evening
Good
Day
Happy
Birthday
Happy
New Year
Good
Morning
Good
Evening
Good
Day
Happy
Birthday
Happy
New Year


In [30]:
# Largest element length found in surnames_rdd.
# NOTE(review): despite its name, `longest_surname` holds an int (the length),
# not the surname string. To obtain the surname itself, reduce instead:
#   surnames_rdd.reduce(lambda x, y: x if len(x) > len(y) else y)
longest_surname = surnames_rdd.map(len).max()

longest_surname

10

In [31]:
from pyspark.sql import SparkSession

In [33]:
# Start (or reuse) the Spark SQL session used for the hepatitis analysis.
spark = (
    SparkSession.builder
    .appName("Hepatitis Analysis")
    .getOrCreate()
)

# Load the CSV, treating the first row as the header and letting Spark infer
# each column's type.
hepatitis_df = spark.read.csv("hepatitis.csv", inferSchema=True, header=True)

# Average of the Age column over the rows whose Sex equals "male".
male_rows = hepatitis_df.filter(hepatitis_df["Sex"] == "male")
avg_age_row = male_rows.agg({'Age': 'avg'}).collect()[0]
avg_age_males = avg_age_row[0]

avg_age_males

41.5

In [40]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
hepatitis_df.createOrReplaceTempView("hepatitis")

# Fetch every distinct Age value as a list of Row objects.
distinct_age_query = "select distinct Age from hepatitis"
distinct_ages = spark.sql(distinct_age_query).collect()

# Unwrap each Row into its bare Age value, in the order Spark returned them.
[age_row['Age'] for age_row in distinct_ages]

[31,
 65,
 53,
 78,
 34,
 28,
 27,
 26,
 44,
 22,
 47,
 52,
 40,
 20,
 57,
 54,
 48,
 64,
 41,
 43,
 37,
 61,
 72,
 35,
 59,
 23,
 39,
 49,
 7,
 51,
 69,
 50,
 45,
 38,
 25,
 24,
 70,
 62,
 32,
 60,
 56,
 58,
 33,
 42,
 30,
 66,
 67,
 46,
 36]

# New section