In [2]:
from pyspark.sql import SparkSession


In [3]:
# Start a Spark session named 'Basics', or reuse one that is already active.
spark = SparkSession.builder.appName('Basics').getOrCreate()

In [4]:
# Load test1.csv: header=True takes column names from the first row and
# inferSchema=True lets Spark infer column types from the data.
df_pyspark = spark.read.csv('test1.csv', inferSchema=True, header=True)

In [5]:
# Print each column's name, inferred type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
# Project only the 'age' column and print it.
df_pyspark.select('age').show()

+---+
|age|
+---+
| 31|
| 30|
| 29|
| 24|
| 21|
| 23|
+---+



In [7]:
# Build a DataFrame with an extra column equal to Experience + 10. The
# lowercase 'experience' resolves to the 'Experience' column.
# NOTE(review): withColumn returns a new DataFrame and the result is not
# assigned here, so df_pyspark itself does not gain the column. Assign the
# result back to a name if the column is meant to persist.
df_pyspark.withColumn("Experience after 10 years", df_pyspark['experience']+10)

DataFrame[Name: string, age: int, Experience: int, Salary: int, Experience after 10 years: int]

In [8]:
# Print df_pyspark; it still has only the four original columns because the
# withColumn result above was not assigned.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [9]:
# Drop the 'Experience after 10 years' column and rebind df_pyspark.
# NOTE(review): df_pyspark does not contain that column at this point (see the
# table printed above), and DataFrame.drop ignores names that are not present,
# so this is effectively a no-op.
df_pyspark = df_pyspark.drop('Experience after 10 years')

In [10]:
# Print df_pyspark after the drop.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [11]:
# Rename the Experience column. Only the label changes; the values remain the
# original experience figures (nothing is added to them).
df_pyspark =df_pyspark.withColumnRenamed('experience', 'experience after 10 years')

In [12]:
# Print df_pyspark to show the renamed column.
df_pyspark.show()

+---------+---+-------------------------+------+
|     Name|age|experience after 10 years|Salary|
+---------+---+-------------------------+------+
|    Krish| 31|                       10| 30000|
|Sudhanshu| 30|                        8| 25000|
|    Sunny| 29|                        4| 20000|
|     Paul| 24|                        3| 20000|
|   Harsha| 21|                        1| 15000|
|  Shubham| 23|                        2| 18000|
+---------+---+-------------------------+------+



In [13]:
# Rebind df_pyspark to the contents of test2.csv, discarding the earlier frame.
df_pyspark = spark.read.csv('test2.csv', inferSchema=True, header=True)

In [14]:
# Print the loaded rows; this file includes rows with null values.
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [15]:
# Keep only rows that have at least 2 non-null values. `thresh` takes
# precedence over `how` in DataFrameNaFunctions.drop, so `how` is left out
# rather than suggesting an any-null rule that is not actually applied.
df_pyspark.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
+---------+----+----------+------+



In [16]:
# Drop rows whose Salary is null; nulls in the other columns are tolerated
# because only the columns listed in `subset` are checked.
df_pyspark.na.drop(how='any', subset=['Salary']).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
+---------+----+----------+------+



In [17]:
from pyspark.ml.feature import Imputer

# Mean-impute the three numeric columns. Each input column is written to its
# own descriptively named "<label>_imputed" output column, so the input
# columns themselves are left as they were.
imputer = Imputer(
    inputCols=['age', 'Experience', 'Salary'],
    outputCols=[
        f"{label}_imputed"
        for label in ('Age of Employee', 'Experience (in years)', 'Salary (per month - $)')
    ],
    strategy="mean",
)

In [18]:
# Fit the imputer on df_pyspark, apply the fitted model to the same data, and
# print the result (original columns plus the *_imputed columns).
imputer.fit(df_pyspark).transform(df_pyspark).show()

+---------+----+----------+------+-----------------------+-----------------------------+------------------------------+
|     Name| age|Experience|Salary|Age of Employee_imputed|Experience (in years)_imputed|Salary (per month - $)_imputed|
+---------+----+----------+------+-----------------------+-----------------------------+------------------------------+
|    Krish|  31|        10| 30000|                     31|                           10|                         30000|
|Sudhanshu|  30|         8| 25000|                     30|                            8|                         25000|
|    Sunny|  29|         4| 20000|                     29|                            4|                         20000|
|     Paul|  24|         3| 20000|                     24|                            3|                         20000|
|   Harsha|  21|         1| 15000|                     21|                            1|                         15000|
|  Shubham|  23|         2| 18000|      

In [19]:

# Rebind df_pyspark to test1.csv again for the filtering examples.
df_pyspark = spark.read.csv('test1.csv', inferSchema=True, header=True)

In [20]:
# Print the reloaded rows.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [21]:
# Rows with a salary of at most 20000, with the condition written as a SQL
# expression string; only the name and salary columns are displayed.
df_pyspark.filter("Salary<=20000").select(['name', 'salary']).show()

+-------+------+
|   name|salary|
+-------+------+
|  Sunny| 20000|
|   Paul| 20000|
| Harsha| 15000|
|Shubham| 18000|
+-------+------+



In [25]:
# Same threshold expressed as a Column condition and negated with ~, so this
# keeps the rows whose salary is above 20000.
df_pyspark.filter(~(df_pyspark['Salary']<=20000)).select(['name', 'salary']).show()

+---------+------+
|     name|salary|
+---------+------+
|    Krish| 30000|
|Sudhanshu| 25000|
+---------+------+



In [26]:
# Rebind df_pyspark to test3.csv for the group-by examples.
df_pyspark = spark.read.csv('test3.csv', inferSchema=True, header=True)

In [27]:
# Print the loaded rows (Name, Departments, salary).
df_pyspark.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [30]:
# Average of the numeric columns for each distinct salary value.
# NOTE(review): salary is both the grouping key and the only numeric column, so
# every average simply repeats its own key (see the output). Grouping by 'Name'
# or 'Departments' was presumably intended — confirm.
df_pyspark.groupBy('salary').mean().show()

+------+-----------+
|salary|avg(salary)|
+------+-----------+
|  3000|     3000.0|
|  4000|     4000.0|
|  5000|     5000.0|
| 10000|    10000.0|
| 20000|    20000.0|
|  2000|     2000.0|
+------+-----------+



In [31]:
# Number of rows for each distinct salary value.
df_pyspark.groupBy('salary').count().show()

+------+-----+
|salary|count|
+------+-----+
|  3000|    1|
|  4000|    2|
|  5000|    2|
| 10000|    3|
| 20000|    1|
|  2000|    1|
+------+-----+



In [34]:
# Load test1.csv as the data set for the regression example.
training = spark.read.csv('test1.csv', inferSchema=True, header=True)

In [39]:
# Print the training rows.
training.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [41]:
# List the column names of the training frame.
training.columns

['Name', 'age', 'Experience', 'Salary']

In [42]:
from pyspark.ml.feature import VectorAssembler

In [46]:
# Combine the age and Experience columns into a single vector column named
# 'Independent Features', which is later passed to the regressor as featuresCol.
featureassembler = VectorAssembler(inputCols=['age', 'Experience'], outputCol='Independent Features')

In [47]:
# Apply the assembler: returns `training` with the vector column appended.
output = featureassembler.transform(training)

In [48]:
# Print the assembled frame, including the new 'Independent Features' column.
output.show()

+---------+---+----------+------+--------------------+
|     Name|age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|    Krish| 31|        10| 30000|         [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|          [30.0,8.0]|
|    Sunny| 29|         4| 20000|          [29.0,4.0]|
|     Paul| 24|         3| 20000|          [24.0,3.0]|
|   Harsha| 21|         1| 15000|          [21.0,1.0]|
|  Shubham| 23|         2| 18000|          [23.0,2.0]|
+---------+---+----------+------+--------------------+



In [49]:
# Keep only what the model needs: the feature vector and the Salary label.
finalized_data = output.select('Independent Features', 'Salary')

In [50]:
# Print the modelling frame (features vector and label).
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



In [54]:
from pyspark.ml.regression import LinearRegression

# Split roughly 75% / 25% into train and test rows. The seed makes the split,
# and therefore the fitted model and its predictions, reproducible across
# runs. Outputs recorded from an earlier unseeded run will not match exactly.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# `regressor` is bound directly to the fitted LinearRegressionModel (never the
# unfitted estimator); later cells read its coefficients and intercept and
# call evaluate() on it.
regressor = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Salary',
).fit(train_data)

In [55]:
# Fitted weights, one per entry of the feature vector (age, Experience).
regressor.coefficients

DenseVector([-90.5483, 1608.7819])

In [56]:
# Fitted intercept term of the linear model.
regressor.intercept

16079.136690647425

In [57]:
# Evaluate the fitted model on the held-out test rows; the returned summary
# exposes the scored rows via .predictions.
pred_results = regressor.evaluate(test_data)

In [58]:
# Print the test rows with the model's prediction next to the actual Salary.
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|          [23.0,2.0]| 18000|17214.09079632846|
+--------------------+------+-----------------+

