# Task 1 : 
### Installing Java, Spark3+ , and a compatible pyspark Python lib and making it work.


In [3]:
# Testing pyspark installation

import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()

'C:\\spark-3.2.0-bin-hadoop3.2'

In [7]:
# Initiate spark context
from pyspark import SparkContext, SparkConf 
from pyspark.sql import SparkSession
conf = pyspark.SparkConf().setAppName('SparkApp').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark =SparkSession(sc)


In [4]:
#Example Test code
numeric_val = sc.parallelize([1,2,3,4])
numeric_val.map(lambda x: x*x*x).collect()


[1, 8, 27, 64]

In [6]:
# Stop the spark session
sc.stop()

# Task 2 
### Read "Car details v3.csv" data with spark

In [9]:
df = spark.read.option("header", "true").csv ('Car details v3.csv')

In [10]:
df.limit(10).toPandas()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,81.86 bhp,113.75nm@ 4000rpm,5
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3 km/kg,1061 CC,57.5 bhp,"7.8@ 4,500(kgm@ rpm)",5
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1 kmpl,796 CC,37 bhp,59Nm@ 2500rpm,4
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59 kmpl,1364 CC,67.1 bhp,170Nm@ 1800-2400rpm,5
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,68.1 bhp,160Nm@ 2000rpm,5


# Task 3
### Creating a model to predict the selling price from the other variables using Sparks'mlib 

In [14]:
spark = SparkSession.builder.getOrCreate()

In [15]:
spark

In [16]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- year: string (nullable = true)
 |-- selling_price: string (nullable = true)
 |-- km_driven: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- mileage: string (nullable = true)
 |-- engine: string (nullable = true)
 |-- max_power: string (nullable = true)
 |-- torque: string (nullable = true)
 |-- seats: string (nullable = true)



The selling_price is our target and the remainder are our features we want to predict the target with.

In [17]:
df.show(2)

+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-------------------+-----+
|                name|year|selling_price|km_driven|  fuel|seller_type|transmission|       owner|   mileage| engine| max_power|             torque|seats|
+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-------------------+-----+
|Maruti Swift Dzir...|2014|       450000|   145500|Diesel| Individual|      Manual| First Owner| 23.4 kmpl|1248 CC|    74 bhp|     190Nm@ 2000rpm|    5|
|Skoda Rapid 1.5 T...|2014|       370000|   120000|Diesel| Individual|      Manual|Second Owner|21.14 kmpl|1498 CC|103.52 bhp|250Nm@ 1500-2500rpm|    5|
+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-------------------+-----+
only showing top 2 rows



Let's select some columns to see how the dataframe look like.

In [18]:
df_show = df.select("year","selling_price","km_driven","fuel")

In [19]:
df_show.show(2)

+----+-------------+---------+------+
|year|selling_price|km_driven|  fuel|
+----+-------------+---------+------+
|2014|       450000|   145500|Diesel|
|2014|       370000|   120000|Diesel|
+----+-------------+---------+------+
only showing top 2 rows



## Pre-processing the data

In [21]:


#droping the name column
df=df.drop("name")
df.show(5)

+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+--------------------+-----+
|year|selling_price|km_driven|  fuel|seller_type|transmission|       owner|   mileage| engine| max_power|              torque|seats|
+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+--------------------+-----+
|2014|       450000|   145500|Diesel| Individual|      Manual| First Owner| 23.4 kmpl|1248 CC|    74 bhp|      190Nm@ 2000rpm|    5|
|2014|       370000|   120000|Diesel| Individual|      Manual|Second Owner|21.14 kmpl|1498 CC|103.52 bhp| 250Nm@ 1500-2500rpm|    5|
|2006|       158000|   140000|Petrol| Individual|      Manual| Third Owner| 17.7 kmpl|1497 CC|    78 bhp|12.7@ 2,700(kgm@ ...|    5|
|2010|       225000|   127000|Diesel| Individual|      Manual| First Owner| 23.0 kmpl|1396 CC|    90 bhp|22.4 kgm at 1750-...|    5|
|2007|       130000|   120000|Petrol| Individual|      Manual| First 

In [22]:
df.describe()

DataFrame[summary: string, year: string, selling_price: string, km_driven: string, fuel: string, seller_type: string, transmission: string, owner: string, mileage: string, engine: string, max_power: string, torque: string, seats: string]

In [27]:
# df["year"].unique()
df.select('seller_type').distinct().collect()

[Row(seller_type='Individual'),
 Row(seller_type='Dealer'),
 Row(seller_type='Trustmark Dealer')]

In [28]:
df.select('fuel').distinct().collect()

[Row(fuel='Diesel'), Row(fuel='CNG'), Row(fuel='LPG'), Row(fuel='Petrol')]

In [29]:
df.select('transmission').distinct().collect() 

[Row(transmission='Automatic'), Row(transmission='Manual')]

In [30]:
df.select('owner').distinct().collect() 

[Row(owner='Third Owner'),
 Row(owner='Fourth & Above Owner'),
 Row(owner='Second Owner'),
 Row(owner='First Owner'),
 Row(owner='Test Drive Car')]

In [34]:
df.select('seats').distinct().collect() 

[Row(seats='7'),
 Row(seats='8'),
 Row(seats=None),
 Row(seats='5'),
 Row(seats='6'),
 Row(seats='9'),
 Row(seats='10'),
 Row(seats='4'),
 Row(seats='14'),
 Row(seats='2')]

#### counting null value in the dataframe

In [33]:
from pyspark.sql.functions import col,sum
df.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in df.columns)).show()

+----+-------------+---------+----+-----------+------------+-----+-------+------+---------+------+-----+
|year|selling_price|km_driven|fuel|seller_type|transmission|owner|mileage|engine|max_power|torque|seats|
+----+-------------+---------+----+-----------+------------+-----+-------+------+---------+------+-----+
|   0|            0|        0|   0|          0|           0|    0|    221|   221|      215|   222|  221|
+----+-------------+---------+----+-----------+------------+-----+-------+------+---------+------+-----+



we have at least 200 null values in the columns: mileage, engine , max_power, torque  and seats

In [45]:
!pip install org.apache.spark.ml.feature
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.IntegerType

df.withColumn("year",col("year").cast("int"))

ERROR: Could not find a version that satisfies the requirement org.apache.spark.ml.feature (from versions: none)
ERROR: No matching distribution found for org.apache.spark.ml.feature


ModuleNotFoundError: No module named 'org'

In [41]:
# creating a age column
## Casting to 
df['age']=df[2021-df['year'].astype(int)]

TypeError: unexpected type: <class 'type'>

In [18]:
help(spark.read..option("header", "true").csv())

TypeError: csv() missing 1 required positional argument: 'path'