In [57]:
%%init_spark
# --- Where and how Spark runs -------------------------------------------
# Local mode with 5 worker threads; one extra jar is put on the classpath.
launcher.master = "local[5]"
launcher.jars = ["/app/setup/commons.jar"]

# --- Application identity / console feedback ----------------------------
launcher.conf.spark.app.name = "tej_scratch_3"
launcher.conf.spark.ui.showConsoleProgress = "true"

# --- Memory and scratch space -------------------------------------------
launcher.conf.spark.driver.memory = "8g"
# NOTE(review): with a local[...] master the executors presumably share the
# driver JVM, so this value may have no effect -- confirm.
launcher.conf.spark.executor.memory = "3g"
launcher.conf.spark.local.dir = "/data/tmp/spark"

# --- Shuffle parallelism ------------------------------------------------
launcher.conf.spark.sql.shuffle.partitions = 200
# NOTE(review): `spark.sql.shuffle.minPartitions` does not look like a
# documented Spark SQL property; verify it is honoured by this Spark build.
launcher.conf.spark.sql.shuffle.minPartitions = 20

In [58]:
// Spell-check test results, read from CSV with a header row.
// The first header cell is empty, so Spark names that column `_c0`
// (this is what triggers the CSVHeaderChecker warning when the frame is shown).
// Declared as `var` because later cells reassign `df_tq`.
var df_tq = {
  val testResultsCsv = "file:////app/notebooks/avinash/SpellCheck-test-data/lumos-0.8.7_spell_check_test_data.csv"
  spark.read.option("header", true).csv(testResultsCsv)
}

df_tq: org.apache.spark.sql.DataFrame = [_c0: string, query: string ... 1 more field]


In [59]:
// Source spell-check test data (columns: `query`, `mistake_queries`), read from CSV with a header row.
// NOTE(review): unlike the other load cell, this path has no `file:` scheme, so it is
// resolved against whatever default filesystem the session uses -- confirm that is intended.
var original_df = {
  val sourceDataCsv = "/data1/archive/avinash/SearchTests/TestData/SpellCheck/spell_check_test_data.csv"
  spark.read.option("header", true).csv(sourceDataCsv)
}

original_df: org.apache.spark.sql.DataFrame = [query: string, mistake_queries: string]


In [60]:
// Print the column tree of the freshly loaded test-result frame.
println(df_tq.schema.treeString)

root
 |-- _c0: string (nullable = true)
 |-- query: string (nullable = true)
 |-- corrected_query_word_seg: string (nullable = true)



In [61]:
// Preview the first 20 rows (long cell values are truncated).
df_tq.show(20)

25/02/11 11:07:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , query, corrected_query_word_seg
 Schema: _c0, query, corrected_query_word_seg
Expected: _c0 but found: 
CSV file: file:///app/notebooks/avinash/SpellCheck-test-data/lumos-0.8.7_spell_check_test_data.csv
+---+--------------------+------------------------+
|_c0|               query|corrected_query_word_seg|
+---+--------------------+------------------------+
|  0|   satchels jandbags|       satchels handbags|
|  1|flooral printed saee|     floral print sarees|
|  2|021-black shiort ...|      32b lace short top|
|  3|61l 2eague casual...|    61l league casual...|
|  4|        9 impreseaon|            9 impression|
|  5|a-lii dresses in ...|    ali dresses in ki...|
|  6|a-line dresses in...|    a line dress in v...|
|  7|abstroct shrts in...|    abstract shorts i...|
|  8|abstract ties in ...|    abstract ties in ...|
|  9|aai occaseeon fla...|    acai occasion fla...|
| 10|adamo timwaar & e...|

In [62]:
// Pair every test-result row with the source row whose `mistake_queries` equals the
// test row's `query`, then discard the test frame's own `query` and the `_c0` column.
// What remains: corrected_query_word_seg, original_df's `query`, and `mistake_queries`.
// NOTE(review): this overwrites `df_tq` with a frame derived from itself, so the cell
// is not safe to re-run without first re-running the load cell.
df_tq = {
  val sameMistake = df_tq("query") === original_df("mistake_queries")
  val matched = df_tq.join(original_df, sameMistake, "inner")
  matched.drop(df_tq("query")).drop("_c0")
}

df_tq: org.apache.spark.sql.DataFrame = [corrected_query_word_seg: string, query: string ... 1 more field]


In [63]:
// Preview the first 20 joined rows (long cell values are truncated).
df_tq.show(20)

+------------------------+--------------------+--------------------+
|corrected_query_word_seg|               query|     mistake_queries|
+------------------------+--------------------+--------------------+
|       satchels handbags|   satchels handbags|   satchels jandbags|
|    ali dresses in ki...|a-line dresses in...|a-lii dresses in ...|
|    abstract ties in ...|abstract ties in ...|abstract ties in ...|
|    airforce blue cot...|airforce blue cot...|airforce blae cot...|
|    aj dezines kurta set|aj dezines kurta ...|aj dezines kurta sts|
|           aldo footwear|       aldo footwear|        uldo footwer|
|      anouk western wear|  amous western wear|   amous wstern woar|
|        anise star girls|    anise-star girls|   anise-qstar girls|
|    aqua green tops f...|aqua green tops f...|aqua graentops fo...|
|          assembly women|      assembly women|       essembly woen|
|               flax vest|       athflex vests|        thflax vests|
|     tec series or women|aztec sa

In [64]:
// Add `mistake_distance`: Levenshtein distance between `query` and `mistake_queries`.
df_tq = {
  val editsToMistake = levenshtein(col("query"), col("mistake_queries"))
  df_tq.withColumn("mistake_distance", editsToMistake)
}

df_tq: org.apache.spark.sql.DataFrame = [corrected_query_word_seg: string, query: string ... 2 more fields]


In [65]:
// Add `correction_distance`: Levenshtein distance between `query` and `corrected_query_word_seg`.
df_tq = {
  val editsToCorrection = levenshtein(col("query"), col("corrected_query_word_seg"))
  df_tq.withColumn("correction_distance", editsToCorrection)
}

df_tq: org.apache.spark.sql.DataFrame = [corrected_query_word_seg: string, query: string ... 3 more fields]


In [66]:
// Keep only the rows whose `correction_distance` is strictly greater than the threshold.
// NOTE(review): this overwrites `df_tq`, so any statistics computed afterwards
// describe the filtered subset only.
df_tq = {
  val distanceThreshold = 4
  df_tq.where(col("correction_distance").gt(distanceThreshold))
}

df_tq: org.apache.spark.sql.DataFrame = [corrected_query_word_seg: string, query: string ... 3 more fields]


In [68]:
// Show the first 20 remaining rows with full, untruncated cell values.
df_tq.show(20, truncate = false)

+---------------------------+-------------------------+--------------------------+----------------+-------------------+
|corrected_query_word_seg   |query                    |mistake_queries           |mistake_distance|correction_distance|
+---------------------------+-------------------------+--------------------------+----------------+-------------------+
|ali dresses in kimayra     |a-line dresses in komarri|a-lii dresses in komarri  |2               |6                  |
|flax vest                  |athflex vests            |thflax vests              |2               |5                  |
|tec series or women        |aztec sarees for women   |ztec sarees or women      |2               |5                  |
|beige not shapers          |beige net shapewear      |beige net shapewir        |2               |5                  |
|blue summer in monrow      |blue sneakers in monrow  |bluesneaers in monrow     |2               |5                  |
|removal co ord sets        |cream novel

In [52]:
// Print the column tree after the two distance columns have been added.
println(df_tq.schema.treeString)

root
 |-- corrected_query_word_seg: string (nullable = true)
 |-- query: string (nullable = true)
 |-- mistake_queries: string (nullable = true)
 |-- mistake_distance: integer (nullable = true)
 |-- correction_distance: integer (nullable = true)



In [53]:
// Mean of `correction_distance` over every row currently in `df_tq`.
// NOTE(review): this cell's execution count is lower than the filter cell's, so the
// recorded output may predate the `correction_distance > 4` filter -- re-run to confirm.
df_tq.agg(avg("correction_distance")).show(20)

+------------------------+
|avg(correction_distance)|
+------------------------+
|      1.8940520446096654|
+------------------------+



In [54]:
// Mean of `mistake_distance` over every row currently in `df_tq`.
// NOTE(review): this cell's execution count is lower than the filter cell's, so the
// recorded output may predate the `correction_distance > 4` filter -- re-run to confirm.
df_tq.agg(avg("mistake_distance")).show(20)

+---------------------+
|avg(mistake_distance)|
+---------------------+
|   1.8197026022304832|
+---------------------+



In [56]:
// Summary statistics (count / mean / stddev / min / max) for `correction_distance` only.
// NOTE(review): the recorded output shows min = 0, which cannot hold after the
// `correction_distance > 4` filter; the output appears to come from an earlier run.
df_tq.describe("correction_distance").show(20)

+-------+-------------------+
|summary|correction_distance|
+-------+-------------------+
|  count|                538|
|   mean| 1.8940520446096654|
| stddev| 1.9552540908204048|
|    min|                  0|
|    max|                  9|
+-------+-------------------+

