In [12]:
!pip install pyspark



# Create CSV

In [13]:
import os

filename = "call_records.csv"

csv_data = """call_id,caller,receiver,city,call_type,duration_seconds,cost
C001,Amit,Rahul,Hyderabad,Local,180,2.5
C002,Neha,Arjun,Bangalore,STD,320,6.0
C003,Rahul,Pooja,Delhi,Local,60,1.0
C004,Pooja,Neha,Mumbai,ISD,900,25.0
C005,Arjun,Amit,Chennai,STD,400,7.5
C006,Sneha,Karan,Hyderabad,Local,240,3.0
C007,Karan,Sneha,Delhi,Local,120,2.0
C008,Riya,Vikas,Bangalore,STD,360,6.5
C009,Vikas,Riya,Mumbai,ISD,1100,30.0
C010,Anjali,Sanjay,Chennai,Local,90,1.5
C011,Farhan,Ayesha,Delhi,STD,420,7.0
C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0
C013,Suresh,Divya,Bangalore,Local,150,2.0
C014,Divya,Suresh,Mumbai,STD,380,6.8
C015,Nikhil,Priya,Delhi,Local,200,2.8
C016,Priya,Nikhil,Chennai,STD,410,7.2
C017,Rohit,Kavya,Hyderabad,Local,170,2.3
C018,Kavya,Rohit,Bangalore,Local,140,2.1
C019,Manish,Tina,Mumbai,ISD,1000,27.0
C020,Tina,Manish,Delhi,STD,350,6.2
"""

with open(filename, "w") as f:
    f.write(csv_data)

# Load and set RDD

In [15]:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

lines = sc.textFile("call_records.csv")

header = lines.first()
data_rdd = lines.filter(lambda row: row != header)

split_rdd = data_rdd.map(lambda row: row.split(","))

typed_rdd = split_rdd.map(lambda x: (
    x[0],
    x[1],
    x[2],
    x[3],
    x[4],
    int(x[5]),
    float(x[6])
))

print(typed_rdd.take(5))

[('C001', 'Amit', 'Rahul', 'Hyderabad', 'Local', 180, 2.5), ('C002', 'Neha', 'Arjun', 'Bangalore', 'STD', 320, 6.0), ('C003', 'Rahul', 'Pooja', 'Delhi', 'Local', 60, 1.0), ('C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', 900, 25.0), ('C005', 'Arjun', 'Amit', 'Chennai', 'STD', 400, 7.5)]


# Read the CSV file using sparkContext.textFile and display the first 5 records.

In [16]:
lines = sc.textFile("call_records.csv")

first_five = lines.take(5)
for record in first_five:
    print(record)


call_id,caller,receiver,city,call_type,duration_seconds,cost
C001,Amit,Rahul,Hyderabad,Local,180,2.5
C002,Neha,Arjun,Bangalore,STD,320,6.0
C003,Rahul,Pooja,Delhi,Local,60,1.0
C004,Pooja,Neha,Mumbai,ISD,900,25.0


# Remove the header row and create a clean RDD containing only data rows.

In [17]:
header = lines.first()
data_rdd = lines.filter(lambda row: row != header)
print(data_rdd.take(5))

['C001,Amit,Rahul,Hyderabad,Local,180,2.5', 'C002,Neha,Arjun,Bangalore,STD,320,6.0', 'C003,Rahul,Pooja,Delhi,Local,60,1.0', 'C004,Pooja,Neha,Mumbai,ISD,900,25.0', 'C005,Arjun,Amit,Chennai,STD,400,7.5']


# Split each row into individual fields using a delimiter.

In [18]:
split_rdd = data_rdd.map(lambda row: row.split(","))

print(split_rdd.take(5))

[['C001', 'Amit', 'Rahul', 'Hyderabad', 'Local', '180', '2.5'], ['C002', 'Neha', 'Arjun', 'Bangalore', 'STD', '320', '6.0'], ['C003', 'Rahul', 'Pooja', 'Delhi', 'Local', '60', '1.0'], ['C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', '900', '25.0'], ['C005', 'Arjun', 'Amit', 'Chennai', 'STD', '400', '7.5']]


# Calculate the total call cost per city.

In [19]:
city_cost_rdd = split_rdd.map(lambda x: (x[3], float(x[6])))

In [20]:
city_cost_rdd

PythonRDD[32] at RDD at PythonRDD.scala:56

# Identify the city with the highest total call cost.

In [22]:
total_cost_per_city = city_cost_rdd.reduceByKey(lambda a, b: a + b)

In [24]:
highest_city = total_cost_per_city.max(key=lambda x: x[1])
print( highest_city)

('Mumbai', 88.8)


# Calculate the total call duration per call type (Local, STD, ISD).

In [25]:
calltype_duration_rdd = split_rdd.map(lambda x: (x[4], int(x[5])))

In [26]:
total_duration_per_calltype = calltype_duration_rdd.reduceByKey(lambda a, b: a + b)

In [27]:
print(total_duration_per_calltype.collect())

[('Local', 1350), ('STD', 2640), ('ISD', 3950)]


# Count the number of calls per city.

In [28]:
city_calls_rdd = split_rdd.map(lambda x: (x[3], 1))
calls_per_city = city_calls_rdd.reduceByKey(lambda a, b: a + b)

In [29]:
print(calls_per_city.collect())

[('Hyderabad', 4), ('Delhi', 5), ('Mumbai', 4), ('Bangalore', 4), ('Chennai', 3)]


# Calculate the average call cost per city using RDD transformations.

In [30]:
city_cost_count_rdd = split_rdd.map(lambda x: (x[3], (float(x[6]), 1)))

In [31]:
city_cost_sum_count = city_cost_count_rdd.reduceByKey(
    lambda a, b: (a[0] + b[0], a[1] + b[1])
)

In [32]:
avg_cost_per_city = city_cost_sum_count.mapValues(lambda x: x[0] / x[1])

In [33]:
print(avg_cost_per_city.collect())

[('Hyderabad', 8.95), ('Delhi', 3.8), ('Mumbai', 22.2), ('Bangalore', 4.15), ('Chennai', 5.3999999999999995)]


# Filter and list all high-value calls where call cost is greater than 20.

In [34]:
high_value_calls_rdd = split_rdd.filter(lambda x: float(x[6]) > 20)
high_value_calls_rdd.collect()

[['C004', 'Pooja', 'Neha', 'Mumbai', 'ISD', '900', '25.0'],
 ['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]

 # Count the number of ISD calls per city.

In [35]:
isd_calls_rdd = split_rdd.filter(lambda x: x[4] == "ISD")
isd_city_rdd = isd_calls_rdd.map(lambda x: (x[3], 1))
isd_calls_per_city = isd_city_rdd.reduceByKey(lambda a, b: a + b)
print(isd_calls_per_city.collect())

[('Mumbai', 3), ('Hyderabad', 1)]


# Identify the longest call based on call duration.

In [37]:
call_duration_rdd = split_rdd.map(lambda x: (x[0], int(x[5])))
longest_call = call_duration_rdd.max(key=lambda x: x[1])
print(longest_call)

('C009', 1100)


# Calculate the total revenue generated by each caller.

In [38]:
caller_cost_rdd = split_rdd.map(lambda x: (x[1], float(x[6])))
total_revenue_per_caller = caller_cost_rdd.reduceByKey(lambda a, b: a + b)
print(total_revenue_per_caller.collect())

[('Amit', 2.5), ('Pooja', 25.0), ('Karan', 2.0), ('Riya', 6.5), ('Vikas', 30.0), ('Suresh', 2.0), ('Divya', 6.8), ('Nikhil', 2.8), ('Rohit', 2.3), ('Manish', 27.0), ('Tina', 6.2), ('Neha', 6.0), ('Rahul', 1.0), ('Arjun', 7.5), ('Sneha', 3.0), ('Anjali', 1.5), ('Farhan', 7.0), ('Ayesha', 28.0), ('Priya', 7.2), ('Kavya', 2.1)]


#Detect suspicious calls based on the following rule:
*   duration greater than 900 seconds
*   cost greater than 25




In [39]:
suspicious_calls_rdd = split_rdd.filter(lambda x: int(x[5]) > 900 and float(x[6]) > 25)
suspicious_calls_rdd.collect()

[['C009', 'Vikas', 'Riya', 'Mumbai', 'ISD', '1100', '30.0'],
 ['C012', 'Ayesha', 'Farhan', 'Hyderabad', 'ISD', '950', '28.0'],
 ['C019', 'Manish', 'Tina', 'Mumbai', 'ISD', '1000', '27.0']]