### Examples of PySpark ML

In [1]:
from pyspark.sql import SparkSession
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [2]:
# Load the FX-rate sample CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
training = spark.read.csv(
    'fx_rates_sample.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Print a preview of the loaded rows.
training.show()



+-------------------+----+---+--------+----------+--------+
|          timestamp|base|USD|     SAR|       YER|     EUR|
+-------------------+----+---+--------+----------+--------+
|2025-09-01 09:00:00| USD|1.0|3.750305|249.969495|0.849856|
|2025-09-01 09:01:00| USD|1.0|3.749265|250.046174|0.851548|
|2025-09-01 09:02:00| USD|1.0|3.750015|250.246139|0.848127|
|2025-09-01 09:03:00| USD|1.0|3.750956|250.034432|0.845882|
|2025-09-01 09:04:00| USD|1.0|3.749005| 250.00943|0.844497|
|2025-09-01 09:05:00| USD|1.0|3.747703|250.305721|0.846689|
|2025-09-01 09:06:00| USD|1.0| 3.74783|250.157004|0.847113|
|2025-09-01 09:07:00| USD|1.0|3.747514|249.992554|0.848264|
|2025-09-01 09:08:00| USD|1.0|3.747497|250.033015|0.846554|
|2025-09-01 09:09:00| USD|1.0|3.746644|250.201892|0.844874|
|2025-09-01 09:10:00| USD|1.0|3.747524|250.204177|0.845546|
|2025-09-01 09:11:00| USD|1.0|3.748301|250.469969|0.845634|
|2025-09-01 09:12:00| USD|1.0|3.748368|250.641328|0.846457|
|2025-09-01 09:13:00| USD|1.0|3.749495|2

In [4]:
# Print each column's name, type and nullability as read from the CSV.
training.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- base: string (nullable = true)
 |-- USD: double (nullable = true)
 |-- SAR: double (nullable = true)
 |-- YER: double (nullable = true)
 |-- EUR: double (nullable = true)



In [5]:
# List the column names of the loaded DataFrame.
training.columns

['timestamp', 'base', 'USD', 'SAR', 'YER', 'EUR']

In [6]:
# [USD, SAR, EUR] ----> assembled into one 'features' vector ----> independent features

In [7]:
from pyspark.ml.feature import VectorAssembler
# Pack the rate columns into the single vector column that the regression
# reads as its input. All three columns exist in the data and are numeric.
# NOTE(review): USD is 1.0 in every row displayed above (base is USD), so it
# adds no information to the feature vector.
featureassembler = VectorAssembler(inputCols=['USD', 'SAR', 'EUR'], outputCol='features')

In [8]:
# Run the assembler over the loaded data; the result carries the original
# columns plus the new 'features' vector column.
output = featureassembler.transform(training)

In [9]:
# Preview the rows with the assembled 'features' column appended.
output.show()

+-------------------+----+---+--------+----------+--------+--------------------+
|          timestamp|base|USD|     SAR|       YER|     EUR|            features|
+-------------------+----+---+--------+----------+--------+--------------------+
|2025-09-01 09:00:00| USD|1.0|3.750305|249.969495|0.849856|[1.0,3.750305,0.8...|
|2025-09-01 09:01:00| USD|1.0|3.749265|250.046174|0.851548|[1.0,3.749265,0.8...|
|2025-09-01 09:02:00| USD|1.0|3.750015|250.246139|0.848127|[1.0,3.750015,0.8...|
|2025-09-01 09:03:00| USD|1.0|3.750956|250.034432|0.845882|[1.0,3.750956,0.8...|
|2025-09-01 09:04:00| USD|1.0|3.749005| 250.00943|0.844497|[1.0,3.749005,0.8...|
|2025-09-01 09:05:00| USD|1.0|3.747703|250.305721|0.846689|[1.0,3.747703,0.8...|
|2025-09-01 09:06:00| USD|1.0| 3.74783|250.157004|0.847113|[1.0,3.74783,0.84...|
|2025-09-01 09:07:00| USD|1.0|3.747514|249.992554|0.848264|[1.0,3.747514,0.8...|
|2025-09-01 09:08:00| USD|1.0|3.747497|250.033015|0.846554|[1.0,3.747497,0.8...|
|2025-09-01 09:09:00| USD|1.

In [10]:
# Confirm that 'features' was added to the original columns.
output.columns

['timestamp', 'base', 'USD', 'SAR', 'YER', 'EUR', 'features']

In [11]:
# Keep only what the regression needs: the feature vector and the YER label.
finalized_data = output.select('features', 'YER')


In [12]:
# Preview the (features, YER) pairs that will be used for modelling.
finalized_data.show()

+--------------------+----------+
|            features|       YER|
+--------------------+----------+
|[1.0,3.750305,0.8...|249.969495|
|[1.0,3.749265,0.8...|250.046174|
|[1.0,3.750015,0.8...|250.246139|
|[1.0,3.750956,0.8...|250.034432|
|[1.0,3.749005,0.8...| 250.00943|
|[1.0,3.747703,0.8...|250.305721|
|[1.0,3.74783,0.84...|250.157004|
|[1.0,3.747514,0.8...|249.992554|
|[1.0,3.747497,0.8...|250.033015|
|[1.0,3.746644,0.8...|250.201892|
|[1.0,3.747524,0.8...|250.204177|
|[1.0,3.748301,0.8...|250.469969|
|[1.0,3.748368,0.8...|250.641328|
|[1.0,3.749495,0.8...|250.809692|
|[1.0,3.749962,0.8...|250.920515|
|[1.0,3.749103,0.8...|251.386046|
|[1.0,3.749472,0.8...|251.345014|
|[1.0,3.748513,0.8...|250.944309|
|[1.0,3.749391,0.8...| 251.26516|
|[1.0,3.749341,0.8...| 251.17362|
+--------------------+----------+
only showing top 20 rows


In [13]:
from pyspark.ml.regression import LinearRegression

# Split the rows roughly 75% / 25% into training and test sets. The seed is
# fixed so the split, and therefore the fitted model, is the same on every run.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Fit a linear regression that predicts YER from the assembled feature vector.
regressor = LinearRegression(featuresCol='features', labelCol='YER')
model = regressor.fit(train_data)


In [14]:
from pyspark.ml.regression import LinearRegression

# NOTE(review): this cell repeats the split and fit from the previous cell and
# then prints the fitted parameters.
# Split the rows roughly 75% / 25% into training and test sets. The seed is
# fixed so the split, and therefore the printed parameters, do not change
# between runs.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# fit() returns the trained LinearRegressionModel.
regressor = LinearRegression(featuresCol='features', labelCol='YER')
model = regressor.fit(train_data)

# Coefficients (one per entry of the feature vector) and the intercept.
print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)


Coefficients: [0.0,128.3369297533059,-13.608200350925234]
Intercept: -220.493039101368


In [15]:
### Prediction

# Score the held-out test rows with the fitted model; the result adds a
# 'prediction' column next to the YER label. Only the first 5 rows are printed.
predictions = model.transform(test_data)
predictions.show(5)


+--------------------+----------+------------------+
|            features|       YER|        prediction|
+--------------------+----------+------------------+
|[1.0,3.735364,0.8...|247.622996|247.37792408053454|
|[1.0,3.737291,0.8...|246.402458|247.60194571336868|
|[1.0,3.737635,0.8...|247.221087|247.70836474200965|
|[1.0,3.737676,0.8...|247.350764|247.63522971390788|
|[1.0,3.737811,0.8...|247.475882|247.70070101226614|
+--------------------+----------+------------------+
only showing top 5 rows


In [16]:
# NOTE(review): this repeats the transform/show of the previous cell on the
# same model and test set, so it prints the same rows.
predictions = model.transform(test_data)
predictions.show(5)


+--------------------+----------+------------------+
|            features|       YER|        prediction|
+--------------------+----------+------------------+
|[1.0,3.735364,0.8...|247.622996|247.37792408053454|
|[1.0,3.737291,0.8...|246.402458|247.60194571336868|
|[1.0,3.737635,0.8...|247.221087|247.70836474200965|
|[1.0,3.737676,0.8...|247.350764|247.63522971390788|
|[1.0,3.737811,0.8...|247.475882|247.70070101226614|
+--------------------+----------+------------------+
only showing top 5 rows


In [17]:
from pyspark.ml.regression import LinearRegression

# Split the data roughly 75% / 25% into training and test sets. The seed is
# fixed so the split, and therefore the evaluation below, is reproducible.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Create the model and train it.
regressor = LinearRegression(featuresCol='features', labelCol='YER')
model = regressor.fit(train_data)

# Evaluate the model on the held-out test set.
pred_results = model.evaluate(test_data)
