### Analyse search terms on the e-commerce web server


##### In this assignment you will download the search term data set for the e-commerce web server and run analytic queries on it.


In [1]:
# Install spark

In [1]:
!pip install pyspark
!pip install findspark
!pip install pandas



In [10]:
# Import libraries

In [2]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd

In [None]:
# Start session

In [3]:
# Creating a spark context class
sc = SparkContext()

# Creating a spark session
spark = SparkSession \
    .builder \
    .appName("SparkML Ops").getOrCreate()

In [None]:
# Import SparkML libraries

In [4]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [1]:
# Download The search term dataset from the below url
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

In [15]:
!wget -N "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv"

--2022-03-07 13:17:39--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘searchterms.csv’ not modified on server. Omitting download.



In [None]:
# Load the csv into a spark dataframe

In [4]:
searchterms = pd.read_csv('data/searchterms.csv')
sdf = spark.createDataFrame(searchterms)

In [None]:
# Print the number of rows and columns
# Take a screenshot of the code and name it as shape.jpg)

In [5]:
row_count = sdf.count()
col_count = len(sdf.columns)
print(f'Total number of rows: {row_count}')
print(f'Total number of columns: {col_count}')

Total number of rows: 10000
Total number of columns: 4


In [None]:
# Print the top 5 rows
# Take a screenshot of the code and name it as top5rows.jpg)

In [6]:
print(sdf.show(5))

+---+-----+----+--------------+
|day|month|year|    searchterm|
+---+-----+----+--------------+
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021| mobile latest|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|laptop 14 inch|
| 12|   11|2021|     mobile 5g|
+---+-----+----+--------------+
only showing top 5 rows

None


In [None]:
# Find out the datatype of the column searchterm?
# Take a screenshot of the code and name it as datatype.jpg)

In [7]:
sdf.printSchema()

root
 |-- day: long (nullable = true)
 |-- month: long (nullable = true)
 |-- year: long (nullable = true)
 |-- searchterm: string (nullable = true)



In [43]:
# How many times was the term `gaming laptop` searched?
# Take a screenshot of the code and name it as gaminglaptop.jpg)

In [8]:
sdf.createOrReplaceTempView("sdf")
spark.sql("""select count(*) as gaming_laptop from sdf where searchterm='gaming laptop'""").show()

+-------------+
|gaming_laptop|
+-------------+
|          499|
+-------------+



In [None]:
# Print the top 5 most frequently used search terms?
# Take a screenshot of the code and name it as top5terms.jpg)

In [9]:
spark.sql("""select count(*), searchterm as gaming_laptop from sdf group by searchterm order by count(*) desc""").show(5)

+--------+-------------+
|count(1)|gaming_laptop|
+--------+-------------+
|    2312|mobile 6 inch|
|    2301|    mobile 5g|
|    1327|mobile latest|
|     935|       laptop|
|     896|  tablet wifi|
+--------+-------------+
only showing top 5 rows



In [None]:
# The pretrained sales forecasting model is available at  the below url
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.gzip

In [3]:
# Load the sales forecast model.
# Take a screenshot of the code and name it as loadmodel.jpg)

In [3]:
!wget -N "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.gzip"
!tar -xvzf model.gzip

'wget' is not recognized as an internal or external command,
operable program or batch file.
tar: Error opening archive: Failed to open 'model.gzip'


In [None]:
# You need LinearRegressionModel to load the model
from pyspark.ml.regression import LinearRegressionModel
model = LinearRegressionModel.load('data/sales_prediction.model')

In [66]:
# Using the sales forecast model, predict the sales for the year of 2023.
# Take a screenshot of the code and name it as forecast.jpg

In [None]:
def predict(year):
    assembler = VectorAssembler(inputCols=["year"],outputCol="features")
    data = [[year,0]]
    columns = ["year", "sales"]
    _ = spark.createDataFrame(data, columns)
    __ = assembler.transform(_).select('features', 'year')
    predictions = model.transform(__)
    predictions.select('prediction').show()

predict(2023)