<a href="https://colab.research.google.com/github/blacdevl/python-notebooks-intern/blob/main/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import Word2Vec

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=2df4eb9af9d42b929d23babc8e71f0ac33d0cd54315974b892e37e863a78e2c3
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
!pip install kaggle



In [3]:
!chmod 600 kaggle.json

chmod: cannot access 'kaggle.json': No such file or directory


In [4]:
!kaggle datasets download -d quora/question-pairs-dataset

import zipfile
zip_ref = zipfile.ZipFile('question-pairs-dataset.zip', 'r')
zip_ref.extractall()

Dataset URL: https://www.kaggle.com/datasets/quora/question-pairs-dataset
License(s): other
Downloading question-pairs-dataset.zip to /content
 72% 15.0M/20.8M [00:00<00:00, 153MB/s]
100% 20.8M/20.8M [00:00<00:00, 175MB/s]


In [5]:
!pyspark --version
spark = SparkSession.builder.appName('embeddings').getOrCreate()

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.5.1
      /_/
                        
Using Scala version 2.12.18, OpenJDK 64-Bit Server VM, 11.0.22
Branch HEAD
Compiled by user heartsavior on 2024-02-15T11:24:58Z
Revision fd86f85e181fc2dc0f50a096855acf83a6cc5d9c
Url https://github.com/apache/spark
Type --help for more information.


In [6]:
file_path = 'questions.csv'

df = spark.read.csv(file_path, header=True, inferSchema=True)
df.show(4)

+---+----+----+--------------------+--------------------+------------+
| id|qid1|qid2|           question1|           question2|is_duplicate|
+---+----+----+--------------------+--------------------+------------+
|  0|   1|   2|What is the step ...|What is the step ...|           0|
|  1|   3|   4|What is the story...|What would happen...|           0|
|  2|   5|   6|How can I increas...|How can Internet ...|           0|
|  3|   7|   8|Why am I mentally...|Find the remainde...|           0|
+---+----+----+--------------------+--------------------+------------+
only showing top 4 rows



In [12]:
df_subset1 = df.select('question1').alias('text').na.drop()
df_subset2 = df.select('question2').alias('text').na.drop()

df_concat = df_subset1.union(df_subset2)

df_concat = df_concat.withColumn("text", F.split("question1", "\\s+")).select("text")

# df_concat = df_concat.withColumn("text",F.lower("question1"))

df_concat.show(4)
df_concat.printSchema()



df_concat = df_concat.limit(100000)

print("size:",df_concat.count())


+--------------------+
|                text|
+--------------------+
|[What, is, the, s...|
|[What, is, the, s...|
|[How, can, I, inc...|
|[Why, am, I, ment...|
+--------------------+
only showing top 4 rows

root
 |-- text: array (nullable = true)
 |    |-- element: string (containsNull = false)

size: 100000


In [13]:
word2vec = Word2Vec(inputCol="text", minCount=1, outputCol="result",vectorSize=100)

model = word2vec.fit(df_concat)
result = model.transform(df_concat)
result.show()

+--------------------+--------------------+
|                text|              result|
+--------------------+--------------------+
|[What, is, the, s...|[-0.1542745852576...|
|[What, is, the, s...|[0.00859365562064...|
|[How, can, I, inc...|[-0.0053611522806...|
|[Why, am, I, ment...|[0.04183747636323...|
|[Which, one, diss...|[-0.0675903459509...|
|[Astrology:, I, a...|[0.01175305555443...|
|[Should, I, buy, ...|[-0.0678857872262...|
|[How, can, I, be,...|[0.03511635842733...|
|[When, do, you, u...|[0.01389584728167...|
|[Motorola, (compa...|[0.01870118050525...|
|[Method, to, find...|[0.03135813966380...|
|[How, do, I, read...|[0.02046062300602...|
|[What, can, make,...|[0.02567567410213...|
|[What, was, your,...|[-0.0890916447554...|
|[What, are, the, ...|[-0.1035135900340...|
|[What, would, a, ...|[0.01424646216134...|
|[What, does, mani...|[0.10364335286431...|
|[Why, do, girls, ...|[0.00143174501135...|
|[Why, are, so, ma...|[0.07948578169037...|
|[Which, is, the, ...|[-0.233749

In [30]:
sentences = ["Sun is bright", "Sun is hot", "How to make friends", "What should I do"]

try_df = spark.createDataFrame([(sentence,) for sentence in sentences],["text"])

try_df = try_df.withColumn("text", F.split("text", "\\s+")).select("text")

try_result = model.transform(try_df)

try_result.show()




+--------------------+--------------------+
|                text|              result|
+--------------------+--------------------+
|   [Sun, is, bright]|[-0.1394819604853...|
|      [Sun, is, hot]|[-0.1099189507464...|
|[How, to, make, f...|[-0.0120532852597...|
|[What, should, I,...|[0.02040313184261...|
+--------------------+--------------------+



In [31]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import DoubleType

def cosine_similarity(v1, v2):
    dot_product = float(v1.dot(v2))
    norm_v1 = float(v1.norm(2))
    norm_v2 = float(v2.norm(2))
    return dot_product / (norm_v1 * norm_v2)

cosine_similarity_udf = F.udf(cosine_similarity, DoubleType())

test = try_result.toDF("text2", "result2")

test.show()
ex_cross = try_result.crossJoin(test)

ex_cross.show()
print(ex_cross.count())
# ex_cross = ex_cross.filter(F.col("text") != F.col("text2"))

ex_cross = ex_cross.withColumn("cos_sim", cosine_similarity_udf(F.col("result"), F.col("result2")))

ex_cross.show()

+--------------------+--------------------+
|               text2|             result2|
+--------------------+--------------------+
|   [Sun, is, bright]|[-0.1394819604853...|
|      [Sun, is, hot]|[-0.1099189507464...|
|[How, to, make, f...|[-0.0120532852597...|
|[What, should, I,...|[0.02040313184261...|
+--------------------+--------------------+

+--------------------+--------------------+--------------------+--------------------+
|                text|              result|               text2|             result2|
+--------------------+--------------------+--------------------+--------------------+
|   [Sun, is, bright]|[-0.1394819604853...|   [Sun, is, bright]|[-0.1394819604853...|
|   [Sun, is, bright]|[-0.1394819604853...|      [Sun, is, hot]|[-0.1099189507464...|
|      [Sun, is, hot]|[-0.1099189507464...|   [Sun, is, bright]|[-0.1394819604853...|
|      [Sun, is, hot]|[-0.1099189507464...|      [Sun, is, hot]|[-0.1099189507464...|
|   [Sun, is, bright]|[-0.1394819604853...|[H