# Analyze search terms on the e-commerce web server


##### Download the search term data set for the e-commerce web server and run analytic queries on it.

# Install Apache Spark

In [1]:
!pip install pyspark
!pip install findspark

Collecting pyspark
  Downloading pyspark-3.4.4.tar.gz (311.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.4/311.4 MB[0m [31m998.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.4.4-py2.py3-none-any.whl size=311905466 sha256=ffa37d2812c0e6c3502ea56ecd6c55e01a65b58c36e23db42e9a6e20de637a72
  Stored in directory: /home/jupyterlab/.cache/pip/wheels/4e/66/db/939eb1c49afb8a7fd2c4e393ad34e12b77db67bb4cc974c00e
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.4.4
C

## Import libraries

In [2]:
import findspark
# Run findspark's setup before the pyspark imports in the following cell,
# so that the pyspark package can be imported from this kernel.
findspark.init()

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Start session

In [4]:
# Reuse the running SparkContext when this cell is executed again: calling
# SparkContext() a second time in the same kernel raises an error because
# only one context may be active at a time.
sc = SparkContext.getOrCreate()

# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Analyze search terms data from the e-commerce web server")
    .getOrCreate()
)

25/06/18 17:42:37 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Download the [search term dataset](https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv)

In [5]:
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

--2025-06-18 15:41:00--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104, 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 233457 (228K) [text/csv]
Saving to: ‘searchterms.csv’


2025-06-18 15:41:00 (53.8 MB/s) - ‘searchterms.csv’ saved [233457/233457]



## Load the csv into a spark dataframe

In [5]:
# Read the downloaded CSV: the first line is the header and column types are inferred.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("searchterms.csv")
)

## Print the number of rows and columns

In [6]:
# Dataset dimensions: columns come from the column list, rows from a Spark count.
columncount = len(df.columns)
rowcount = df.count()

print(f"Number of rows    = {rowcount}")
print(f"Number of columns = {columncount}")

Number of rows    = 10000
Number of columns = 4


## Print the top 5 rows

In [7]:
# Preview the first five rows of the search term data.
df.show(n=5)

+---+-----+----+--------------+
|day|month|year|    searchterm|
+---+-----+----+--------------+
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021| mobile latest|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|laptop 14 inch|
| 12|   11|2021|     mobile 5g|
+---+-----+----+--------------+
only showing top 5 rows



## Find the data type of the `searchterm` column

In [8]:
# Look the field up in the schema by name, then read its Spark data type.
searchterm_field = df.schema["searchterm"]
searchterm_type = searchterm_field.dataType
print(f"Data type of 'searchterm': {searchterm_type}")

Data type of 'searchterm': StringType


## Find how many times the term `gaming laptop` was searched

In [9]:
# Count the rows whose search term is exactly "gaming laptop".
df.where(df.searchterm == "gaming laptop").count()

499

## Print the top 5 most frequently used search terms

In [10]:
from pyspark.sql.functions import col, desc

# Tally each distinct search term, then list the five with the highest counts.
term_counts = df.groupBy("searchterm").count()
term_counts.orderBy(col("count").desc()).show(5)



+-------------+-----+
|   searchterm|count|
+-------------+-----+
|mobile 6 inch| 2312|
|    mobile 5g| 2301|
|mobile latest| 1327|
|       laptop|  935|
|  tablet wifi|  896|
+-------------+-----+
only showing top 5 rows



                                                                                

## Pretrained sales forecasting model

### Download the [pretrained sales forecasting model](https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz) 

In [11]:
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz
!tar -xvzf model.tar.gz 

--2025-06-18 17:47:59--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104, 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1490 (1.5K) [application/x-tar]
Saving to: ‘model.tar.gz.1’


2025-06-18 17:47:59 (11.5 MB/s) - ‘model.tar.gz.1’ saved [1490/1490]

sales_prediction.model/
sales_prediction.model/metadata/
sales_prediction.model/metadata/part-00000
sales_prediction.model/metadata/.part-00000.crc
sales_prediction.model/metadata/_SUCCESS
sales_prediction.model/metadata/._SUCCESS.crc
sales_prediction.model/data/
sales_prediction.model/data/part-00000-1db9fe2f-4d93-4b1f-966b-

## Load the sales forecast model

In [16]:
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.feature import VectorAssembler

# Directory unpacked from model.tar.gz that holds the saved regression model.
model_path = "sales_prediction.model"
model = LinearRegressionModel.load(model_path)

## Use sales forecast model
- Use the sales forecast model to predict the sales for the year 2023.

In [15]:
import json

# The saved model's metadata file is parsed as one JSON document and pretty-printed.
with open("./sales_prediction.model/metadata/part-00000", "r") as metadata_file:
    metadata = json.loads(metadata_file.read())

print(json.dumps(metadata, indent=2))

{
  "class": "org.apache.spark.ml.regression.LinearRegressionModel",
  "timestamp": 1647406488992,
  "sparkVersion": "2.4.3",
  "uid": "LinearRegression_6d5736f3dbe7",
  "paramMap": {
    "featuresCol": "features",
    "labelCol": "sales",
    "maxIter": 100,
    "regParam": 0.1
  },
  "defaultParamMap": {
    "featuresCol": "features",
    "elasticNetParam": 0.0,
    "predictionCol": "prediction",
    "loss": "squaredError",
    "labelCol": "label",
    "aggregationDepth": 2,
    "epsilon": 1.35,
    "maxIter": 100,
    "regParam": 0.0,
    "fitIntercept": true,
    "standardization": true,
    "tol": 1e-06,
    "solver": "auto"
  }
}


In [17]:
def predict(year):
    """Show the sales forecast of the loaded model for a single year.

    Builds a one-row DataFrame for ``year``, assembles the ``year`` column
    into a ``features`` vector, runs the notebook-level ``model`` on it and
    prints the resulting ``prediction`` column with ``show()``.

    Args:
        year: The year to forecast; it becomes the only input feature.

    Returns:
        None. The prediction is displayed, not returned.
    """
    # "sales" is filled with a 0 placeholder; only "year" feeds the feature vector.
    input_df = spark.createDataFrame([[year, 0]], ["year", "sales"])
    feature_builder = VectorAssembler(inputCols=["year"], outputCol="features")
    model_input = feature_builder.transform(input_df).select("features", "sales")
    model.transform(model_input).select("prediction").show()

In [18]:
predict(2023)

                                                                                

+------------------+
|        prediction|
+------------------+
|175.16564294006457|
+------------------+



25/06/18 18:05:46 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
25/06/18 18:05:46 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
