<a href="https://colab.research.google.com/github/amien1410/colab-notebooks/blob/main/Colab_Pyspark_OneVsRest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install Kaggle modules and download the dataset

from google.colab import drive
drive.mount('/content/drive')

!pip install kaggle
import os
os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/MyDrive/kaggle'
!kaggle competitions download -c facebook-recruiting-iii-keyword-extraction
!unzip -q "/content/facebook-recruiting-iii-keyword-extraction.zip"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Downloading facebook-recruiting-iii-keyword-extraction.zip to /content
100% 2.90G/2.90G [00:40<00:00, 59.0MB/s]
100% 2.90G/2.90G [00:40<00:00, 76.2MB/s]


In [None]:
!unzip -q "/content/Test.zip"

In [None]:
!unzip -q "/content/Train.zip"

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, col, split
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, OneVsRest, OneVsRestModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# 1. Spark session (getOrCreate reuses a session if one is already active)
spark = (
    SparkSession.builder
    .appName("KeywordExtraction")
    .getOrCreate()
)

# 2-4. Read the training CSV, discard rows missing Title/Body/Tags, then derive
#      `text` (Title and Body joined by a space) and
#      `tag`  (first space-separated token of Tags -> single-label target)
train_df = (
    spark.read
    .csv("Train.csv", header=True, inferSchema=True, multiLine=True, escape='"')
    .na.drop(subset=["Title", "Body", "Tags"])
    .withColumn("text", concat_ws(" ", "Title", "Body"))
    .withColumn("tag", split(col("Tags"), " ").getItem(0))
)

# 5. Label indexer, fitted on the full training frame (i.e. before the
#    train/test split below), mapping `tag` strings to numeric `label`
label_indexer = StringIndexer(inputCol="tag", outputCol="label").fit(train_df)

# 6. Text featurisation stages: tokens -> stop-word filtering -> hashed TF -> IDF
tokenizer = Tokenizer(inputCol="text", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=10000)
idf = IDF(inputCol="raw_features", outputCol="features")

# 7. Logistic regression wrapped in a one-vs-rest multiclass reduction
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.1)
ovr = OneVsRest(classifier=lr, labelCol="label", featuresCol="features")

# 8. Assemble every stage, in execution order, into a single pipeline
pipeline_stages = [tokenizer, stopwords_remover, hashing_tf, idf, label_indexer, ovr]
pipeline = Pipeline(stages=pipeline_stages)

# 9. Train-test split (seeded so the split is reproducible)
train_data, test_data = train_df.randomSplit([0.8, 0.2], seed=42)

# 10. Fit model
model = pipeline.fit(train_data)

# 11. Predict and evaluate on test
predictions = model.transform(test_data)

# Each evaluate() call below is a separate Spark action. Without caching, all
# four would re-run the whole featurisation + prediction lineage. Only the two
# columns the evaluator reads are cached, to keep the footprint small.
eval_df = predictions.select("label", "prediction").cache()

# Initialize evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Compute metrics
accuracy = evaluator.setMetricName("accuracy").evaluate(eval_df)
f1 = evaluator.setMetricName("f1").evaluate(eval_df)
precision = evaluator.setMetricName("weightedPrecision").evaluate(eval_df)
recall = evaluator.setMetricName("weightedRecall").evaluate(eval_df)

# Release the cached evaluation frame now that all metrics are computed
eval_df.unpersist()

# Print all metrics
print("✅ Evaluation Metrics (OneVsRest Classifier):")
print(f"• Accuracy:  {accuracy:.4f}")
print(f"• F1-score:  {f1:.4f}")
print(f"• Precision: {precision:.4f}")
print(f"• Recall:    {recall:.4f}")

# 12. Save the model (a fitted PipelineModel; overwrites any previous save)
model.write().overwrite().save("keyword_ovr_model")
print("💾 Model saved to 'keyword_ovr_model'")

# Names this stage needs that the notebook's import cell does not provide:
#   PipelineModel - class used to read back a *fitted* pipeline
#   udf           - used below to map label indices back to tag strings
from pyspark.ml import PipelineModel
from pyspark.sql.functions import monotonically_increasing_id, udf

# 13. Load test.csv (without Tags)
# NOTE(review): rows with a null Title or Body are dropped here, so those Ids
# will be absent from the submission file - confirm that this is intended.
test_df = spark.read.csv("Test.csv", header=True, inferSchema=True, multiLine=True, escape='"')
test_df = test_df.na.drop(subset=["Title", "Body"])
test_df = test_df.withColumn("text", concat_ws(" ", "Title", "Body"))

# 14. Load trained model and predict
# The artifact saved above is a fitted PipelineModel. `Pipeline` is the
# unfitted estimator and has no transform(), so it must be read back with
# PipelineModel.load.
loaded_model = PipelineModel.load("keyword_ovr_model")
test_predictions = loaded_model.transform(test_df)

# 15. Output predictions with IDs
predicted_labels = test_predictions.select("Id", "prediction")

# Optional: Map predictions back to original tags (inverse of StringIndexer)
# `prediction` is a numeric index into the indexer's label list.
labels = label_indexer.labels
mapping_expr = udf(lambda idx: labels[int(idx)], "string")
final_predictions = predicted_labels.withColumn("PredictedTag", mapping_expr("prediction"))

# 16. Save submission file
# coalesce(1) writes a single part file inside the output directory.
final_predictions.select("Id", "PredictedTag").coalesce(1).write.csv("submission_keywords", header=True, mode="overwrite")
print("📤 Submission written to 'submission_keywords/'")
