In [1]:
import findspark
findspark.init()


In [2]:
def extract_year(input_string):
    if len(input_string) < 4:
        return None
    return input_string[:4]


In [3]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
import time
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import NGram
from pyspark.ml.feature import RegexTokenizer
from pyspark.conf import SparkConf
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, size,split
from pyspark.sql.functions import abs as spark_abs, mean
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import mean, abs as spark_abs, col

In [4]:
start_time = time.time()
    
spark_conf = SparkConf()
spark_conf.set("spark.executor.instances","2")  
spark_conf.set("spark.executor.cores", "2")      

# Create SparkSession with the configured parameters
spark = SparkSession.builder \
    .appName("Spark in Jupyter") \
    .config(conf=spark_conf) \
    .getOrCreate()


# Load dataset
pandas_df = pd.read_csv('books_task.csv')
pandas_df.drop('Unnamed: 0',inplace=True,axis=1)

df = spark.createDataFrame(pandas_df)
df = df.na.drop()

#Title

df = df.withColumn("Title_Word_Count", 
                   udf(lambda x: len(x.split()), IntegerType())("Title"))

df = df.withColumn("Title_Character_Count", 
                   udf(lambda x: len(x), IntegerType())("Title"))

df = df.withColumn("Title_Avg_Word_Length", 
                   udf(lambda x: sum(len(word) for word in x.split()) / len(x.split()), DoubleType())("Title"))
#Description

df = df.withColumn("Description_Word_Count", 
                   udf(lambda x: len(x.split()), IntegerType())("description"))

df = df.withColumn("Description_Character_Count", 
                   udf(lambda x: len(x), IntegerType())("description"))

df = df.withColumn("Description_Avg_Word_Length", 
                   udf(lambda x: sum(len(word) for word in x.split()) / len(x.split()), DoubleType())("description"))
#publishedDate

extract_year_udf = udf(extract_year, StringType())
df = df.withColumn("Year", extract_year_udf("publishedDate"))
df = df.withColumn("Year", col("Year").cast("integer"))

#authors

df = df.withColumn("authors", split(col("authors"), ","))
df = df.withColumn("NumAuthors", size(col("authors")))


Processed_Featured=['Title_Word_Count','Title_Character_Count','Title_Avg_Word_Length','Description_Word_Count','Description_Character_Count','Description_Avg_Word_Length','Year','NumAuthors','Impact']

processed_df = df.select(Processed_Featured)





# Model
assembler = VectorAssembler(inputCols=['Title_Word_Count', 'Title_Character_Count', 'Title_Avg_Word_Length',
                                       'Description_Word_Count', 'Description_Character_Count',
                                       'Description_Avg_Word_Length', 'Year', 'NumAuthors'],
                            outputCol='features', handleInvalid="skip")
rf = RandomForestRegressor(featuresCol="features", labelCol="Impact")

pipeline = Pipeline(stages=[assembler, rf])

paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20]) \
    .addGrid(rf.maxDepth, [5, 10,]) \
    .build()

# Create a custom evaluator to calculate MAPE manually
def calculate_mape(predictions):
    return predictions.withColumn("abs_diff", spark_abs(col("Impact") - col("prediction"))) \
        .withColumn("mape", spark_abs(col("Impact") - col("prediction")) / col("Impact")) \
        .agg(mean("mape")) \
        .collect()[0][0]

# Create CrossValidator
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(labelCol="Impact"),
                          numFolds=3,
                          collectSubModels=True)

# Run cross-validation and choose the best set of parameters
cvModel = crossval.fit(processed_df)

training_time = time.time() - start_time
# Get best model from CrossValidator
best_model = cvModel.bestModel

# Make predictions
predictions = best_model.transform(processed_df)

# Calculate MAPE manually
mape = calculate_mape(predictions)

# Print MAPE
print("Cross-validated MAPE:", mape)


24/02/08 15:31:44 WARN Utils: Your hostname, Adityas-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 192.168.3.26 instead (on interface en0)
24/02/08 15:31:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/08 15:31:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/08 15:31:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/02/08 15:32:02 WARN TaskSetManager: Stage 0 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:32:11 WARN TaskSetManager: Stage 1 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:32:11 WARN TaskSetManager: Stage 2 contains a task of very large size (12602 KiB).

24/02/08 15:32:49 WARN TaskSetManager: Stage 91 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:32:50 WARN TaskSetManager: Stage 93 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:32:50 WARN TaskSetManager: Stage 95 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:32:50 WARN TaskSetManager: Stage 97 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:32:51 WARN TaskSetManager: Stage 99 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:32:52 WARN TaskSetManager: Stage 101 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:32:55 WARN TaskSetManager: Stage 102 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 Ki

24/02/08 15:33:29 WARN TaskSetManager: Stage 191 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:33:29 WARN TaskSetManager: Stage 192 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:33:30 WARN TaskSetManager: Stage 194 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:33:31 WARN TaskSetManager: Stage 196 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:33:31 WARN TaskSetManager: Stage 198 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:33:31 WARN TaskSetManager: Stage 200 contains a task of very large size (12602 KiB). The maximum recommended task size is 1000 KiB.
24/02/08 15:33:32 WARN TaskSetManager: Stage 202 contains a task of very large size (12602 KiB). The maximum recommended task size is 10

Cross-validated MAPE: 0.060670433766320656




In [5]:
mape

0.060670433766320656

In [6]:
training_time

149.67606711387634