In [6]:
%%init_spark
# Spark launcher settings for this notebook session (the cell body is Python-style
# assignments on the `launcher` object; presumably the spylon-kernel magic — confirm).
# Extra jars handed to the launcher.
launcher.jars = ["/app/setup/commons-1.0.1.jar","/app/setup/config-1.4.3.jar"]
# Application name.
launcher.conf.spark.app.name = "tej_scratch_3"
# Local scratch directory used by Spark.
launcher.conf.spark.local.dir = "/data/tmp/spark" 
# Number of partitions used for SQL shuffles.
launcher.conf.spark.sql.shuffle.partitions = 200  
# NOTE(review): `spark.sql.shuffle.minPartitions` is not a key I can match to the
# documented Spark SQL configuration — confirm it has any effect.
launcher.conf.spark.sql.shuffle.minPartitions = 20 
# JVM memory settings. NOTE(review): with a local[...] master the executor setting
# presumably has no effect because work runs inside the driver JVM — verify.
launcher.conf.spark.driver.memory = "8g" 
launcher.conf.spark.executor.memory = "3g" 
# Show the stage progress bar in the console.
launcher.conf.spark.ui.showConsoleProgress = "true"
# Run Spark locally with 5 worker threads.
launcher.master = "local[5]"

In [7]:
import scala.collection.mutable.{ArrayBuffer, HashMap}
import scala.util.Random
// Lookup from a lowercase letter to the letters that may be typed in its place.
// The values appear to be the physically neighbouring keys on a QWERTY layout
// (assumption from the data itself — confirm before extending).
private val approxKey: HashMap[Char, String] = HashMap(
  // top letter row
  ('q', "was"),
  ('w', "qesad"),
  ('e', "wrdsf"),
  ('r', "etfdg"),
  ('t', "rygfh"),
  ('y', "tuhgj"),
  ('u', "yijhk"),
  ('i', "uojkl"),
  ('o', "ipkl"),
  ('p', "ol"),
  // home row
  ('a', "sqzwx"),
  ('s', "adwxqez"),
  ('d', "sfecrxw"),
  ('f', "gdrvct"),
  ('g', "fhtbvyr"),
  ('h', "jgynbu"),
  ('j', "hkunmi"),
  ('k', "jlimo"),
  ('l', "kop"),
  // bottom row
  ('z', "xas"),
  ('x', "zcsd"),
  ('c', "xvdf"),
  ('v', "bcfg"),
  ('b', "vngh"),
  ('n', "mbjh"),
  ('m', "njk")
)

  // Typo-category labels listed from the closest-to-the-right-word category to the
  // farthest (semantic closeness ranks above spelling closeness). A label's rank is
  // its 1-based position in this list: the smaller the number, the higher the priority.
  private val variantPriority: HashMap[String, Int] = {
    val labelsByRank = List(
      "rightword",
      "word_gap_dilemma",
      "plural_extend",
      "vowel_extend",
      "apostropheS",
      "ies_ending",
      "vowel_dilemma",
      "butter_fingers_compulsory",
      "butter_fingers",
      "slangwise_wrong_usage",
      "adjacent_swap",
      "neighbours_swap",
      "single_missing_letter",
      "two_missing_letters",
      "remove_consecutive_letters",
      "double_press_same_key",
      "wrong_letter_doubled",
      "head_letters",
      "extra_letters",
      "vowel_dilemma_p_extend1",
      "vowel_dilemma_p_extend2",
      "vow_dil_single_missing")
    val ranks = HashMap[String, Int]()
    for ((label, position) <- labelsByRank.zipWithIndex)
      ranks += (label -> (position + 1))
    ranks
  }

  /** Ordering predicate for `sortWith` over (variant, label) pairs.
    *
    * Returns true when `x`'s label has a strictly larger priority number than
    * `y`'s, i.e. it sorts the numerically largest (least preferred) labels first.
    * Throws if either label is missing from `variantPriority`.
    */
  private def sortByPriority(x: Array[String], y: Array[String]): Boolean = {
    val rankOfX = variantPriority(x(1))
    val rankOfY = variantPriority(y(1))
    rankOfY < rankOfX
  }

  /** Prefix variants of `word`, each tagged "head_letters".
    *
    * Produces the prefixes of length 3 up to (but excluding) `word.length - 2`,
    * shortest first; words of five characters or fewer yield nothing.
    */
  private def headLetters(word: String): ArrayBuffer[Array[String]] = {
    val prefixes = (3 until (word.length - 2)).map(end => word.substring(0, end))
    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= prefixes.map(prefix => Array(prefix, "head_letters"))
    tagged
  }

  /** Generates misspelling candidates for `word`, one label per distinct variant.
    *
    * Runs the individual typo generators in a fixed order, removes candidates that
    * equal `word`, adds `word` itself labelled "rightword", drops candidates that
    * start or end with a hyphen, and finally de-duplicates by variant text.
    *
    * De-duplication relies on ordering: candidates are sorted so the numerically
    * largest priority comes first and then inserted into a map, so for a variant
    * produced by several generators the label with the smallest priority number
    * (the highest priority) is the one that survives.
    *
    * Changes from the previous version:
    *  - the collapsed-letters variant is accepted based on the length of the
    *    variant text; the old check measured the two-element (variant, label)
    *    pair and therefore never rejected anything;
    *  - an empty `word` no longer reaches `removeConsecutiveLetters`, which
    *    indexes the first character.
    *
    * @param word the correctly spelled input
    * @return pairs `Array(variant, label)` in unspecified (hash map) order; the
    *         "butter_fingers" entries are random, so results differ between calls
    */
  def generateAllTypoVariants(word: String): Array[Array[String]] = {
    val collected = ArrayBuffer[Array[String]]()

    if (word.length > 2)
      collected ++= singleMissingLetter(word)
    if (word.length > 4)
      collected ++= twoMissingLetters(word)
    // wordGapDilemma(word) is deliberately not applied (it was disabled in the original).
    if (word.nonEmpty)
      Option(removeConsecutiveLetters(word))
        .filter(collapsed => collapsed(0).length > 1)
        .foreach(collapsed => collected += collapsed)
    collected ++= vowelDilemma(word)
    collected += vowelExtend(word)
    collected ++= pluralExtend(word)
    collected ++= vowelDilemmaAndPluralExtend1(word)
    collected ++= vowelDilemmaAndPluralExtend2(word)
    collected ++= vowelDilemmaAndSingleMissingLetter(word)
    collected += apostropheS(word)
    // headLetters(word) is deliberately not applied (it was disabled in the original).
    collected ++= adjacentSwap(word)
    if (word.length >= 5)
      collected ++= neighboursSwap(word)
    collected ++= butterFingers(word)
    collected ++= butterFingersCompulsory(word)
    collected ++= doublePressSameKey(word)
    collected ++= slangWiseWrongUsage(word)
    if (word.endsWith("is"))
      collected += slangWiseWrongUsageTest(word)
    collected ++= wrongLetterDoubled(word)
    collected ++= extraLetters(word)

    // A "variant" identical to the input is not a typo; the input is re-added once
    // under the "rightword" label instead.
    val withRightWord = collected.filterNot(candidate => candidate(0).equals(word))
    withRightWord += Array(word, "rightword")

    val ranked = withRightWord
      .filter(candidate => !(candidate(0).startsWith("-") || candidate(0).endsWith("-"))) // no terminal hyphens
      .sortWith(sortByPriority)

    // Later insertions overwrite earlier ones, so the best-ranked label wins.
    val labelByVariant = HashMap[String, String]()
    ranked.foreach(candidate => labelByVariant += (candidate(0) -> candidate(1)))
    labelByVariant.toArray.map { case (variant, label) => Array(variant, label) }
  }

  /** Random "fat finger" variants: letters are replaced by a random entry from
    * their `approxKey` string.
    *
    * For each of `totalVariants` attempts the word is scanned left to right; every
    * letter that has an `approxKey` entry is replaced with probability `prob`, up
    * to `maxChanges` replacements per attempt. Attempts that end up identical to
    * `word` are discarded, so fewer than `totalVariants` results (possibly with
    * duplicates) may be returned.
    *
    * Fixes over the previous version: the replacement counter is reset for every
    * attempt (it used to be shared, so the budget was exhausted within the first
    * few attempts and all later ones reproduced `word`), and the budget is now
    * `maxChanges` rather than `maxChanges + 1`.
    *
    * @param word          input word
    * @param prob          per-letter replacement probability
    * @param totalVariants number of random attempts
    * @param maxChanges    maximum replacements within one attempt
    * @return pairs `Array(variant, "butter_fingers")`
    */
  private def butterFingers(word: String, prob: Double = 0.2, totalVariants: Int = 100,
                            maxChanges: Int = 4): ArrayBuffer[Array[String]] = {
    val rand = Random
    val tagged = ArrayBuffer[Array[String]]()

    for (_ <- 0 until totalVariants) {
      var changes = 0 // per-attempt budget
      val variant = word.map { letter =>
        approxKey.get(letter) match {
          case Some(neighbours) if changes < maxChanges && rand.nextFloat() <= prob =>
            changes += 1
            neighbours(rand.nextInt(neighbours.length))
          case _ =>
            letter
        }
      }
      if (!variant.equals(word))
        tagged += Array(variant, "butter_fingers")
    }
    tagged
  }

  /** Deterministic single-letter substitutions tagged "butter_fingers_compulsory".
    *
    * For every position whose letter has an `approxKey` entry, emits one variant
    * for each of the first three characters of that entry (fewer if the entry is
    * shorter), replacing only that position. Variants equal to `word` are dropped.
    */
  private def butterFingersCompulsory(word: String): ArrayBuffer[Array[String]] = {
    val substituted = for {
      pos <- 0 until word.length
      neighbours <- approxKey.get(word(pos)).toList
      pick <- 0 until Math.min(3, neighbours.length)
    } yield word.substring(0, pos) + neighbours(pick) + word.substring(pos + 1)

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= substituted
      .filterNot(_.equals(word))
      .map(candidate => Array(candidate, "butter_fingers_compulsory"))
    tagged
  }

  /** Variants with exactly one character removed, tagged "single_missing_letter".
    *
    * Characters in the ASCII range '0'..'9' are never removed. Results follow the
    * position of the removed character, left to right.
    */
  private def singleMissingLetter(word: String): ArrayBuffer[Array[String]] = {
    def isAsciiDigit(c: Char): Boolean = c >= '0' && c <= '9'

    val shortened = word.indices
      .filterNot(pos => isAsciiDigit(word(pos)))
      .map(pos => word.substring(0, pos) + word.substring(pos + 1))

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= shortened
      .filterNot(_.equals(word))
      .map(candidate => Array(candidate, "single_missing_letter"))
    tagged
  }

  /** Collapses every run of identical adjacent characters to a single character.
    *
    * Fix: the previous version evaluated the result inside an `if` without an
    * `else` and then fell through to `null`, so it returned `null` for every
    * input. It also threw on the empty string by indexing `word(0)`.
    *
    * @param word input word
    * @return `Array(collapsed, "remove_consecutive_letters")` when at least one
    *         repeated character was removed; `null` when `word` is empty or has
    *         no adjacent repeats (callers test for `null`)
    */
  private def removeConsecutiveLetters(word: String): Array[String] = {
    if (word.isEmpty) null
    else {
      val collapsed = new StringBuilder().append(word(0))
      for (i <- 1 until word.length if word(i) != word(i - 1))
        collapsed.append(word(i))
      // A shorter result means at least one repeated character was skipped.
      if (collapsed.length < word.length)
        Array(collapsed.toString, "remove_consecutive_letters")
      else
        null
    }
  }

  /** Variants with two characters removed, tagged "two_missing_letters".
    *
    * Every pair of positions (first < second) is considered in lexicographic
    * order; a pair is used only when neither character lies in '0'..'9'.
    */
  private def twoMissingLetters(word: String): ArrayBuffer[Array[String]] = {
    def droppable(c: Char): Boolean = c < '0' || c > '9'

    val shortened = for {
      first <- 0 until word.length if droppable(word(first))
      second <- first + 1 until word.length if droppable(word(second))
    } yield word.substring(0, first) + word.substring(first + 1, second) + word.substring(second + 1)

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= shortened
      .filterNot(_.equals(word))
      .map(candidate => Array(candidate, "two_missing_letters"))
    tagged
  }

  /** The word with a trailing "e" appended, tagged "vowel_extend". */
  private def vowelExtend(word: String): Array[String] = {
    val extended = word.concat("e")
    Array(extended, "vowel_extend")
  }

  /** Naive plural forms: the word followed by "s" and by "es", both tagged "plural_extend". */
  private def pluralExtend(word: String): ArrayBuffer[Array[String]] = {
    val plurals = ArrayBuffer.empty[Array[String]]
    plurals += Array(word.concat("s"), "plural_extend")
    plurals += Array(word.concat("es"), "plural_extend")
    plurals
  }

  /** The word followed by "'s", tagged "apostropheS". */
  private def apostropheS(word: String): Array[String] = {
    val possessive = word.concat("'s")
    Array(possessive, "apostropheS")
  }

  /** Variants with a separator inserted between two characters, tagged "word_gap_dilemma".
    *
    * For every interior split point, left to right, emits the hyphenated form
    * followed by the underscored form.
    */
  private def wordGapDilemma(word: String): ArrayBuffer[Array[String]] = {
    val separated = (1 until word.length).flatMap { split =>
      val left = word.substring(0, split)
      val right = word.substring(split)
      Seq(left + "-" + right, left + "_" + right)
    }

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= separated
      .filterNot(_.equals(word))
      .map(candidate => Array(candidate, "word_gap_dilemma"))
    tagged
  }

  /** Variants in which one character is typed twice, tagged "double_press_same_key".
    *
    * One variant per position, in position order.
    */
  private def doublePressSameKey(word: String): ArrayBuffer[Array[String]] = {
    val doubled = word.indices.map { pos =>
      word.substring(0, pos + 1) + word(pos) + word.substring(pos + 1)
    }
    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= doubled.map(candidate => Array(candidate, "double_press_same_key"))
    tagged
  }

  /** Variants where a genuine double letter is un-doubled and a different
    * position is doubled instead, tagged "wrong_letter_doubled".
    *
    * For each index `dup` with `word(dup) == word(dup + 1)`, one of the pair is
    * removed and every other position is doubled in turn: positions before `dup`
    * first, then positions after it. Results equal to `word` are dropped.
    */
  private def wrongLetterDoubled(word: String): ArrayBuffer[Array[String]] = {
    val moved = for {
      dup <- 0 until word.length - 1 if word(dup) == word(dup + 1)
      target <- (0 until dup) ++ (dup + 1 until word.length)
    } yield {
      val twice = word(target).toString * 2
      if (target < dup)
        word.substring(0, target) + twice + word.substring(target + 1, dup) + word.substring(dup + 1)
      else
        word.substring(0, dup) + word.substring(dup + 1, target) + twice + word.substring(target + 1)
    }

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= moved
      .filterNot(_.equals(word))
      .map(candidate => Array(candidate, "wrong_letter_doubled"))
    tagged
  }

  /** Variants in which a single vowel is swapped for another vowel (or vowel
    * pair), tagged "vowel_dilemma".
    *
    * Positions are visited left to right; for each vowel the replacements are
    * emitted in the order listed in the table below. Only lowercase a/e/i/o/u
    * are handled.
    */
  private def vowelDilemma(word: String): ArrayBuffer[Array[String]] = {
    val swaps: Map[Char, Seq[String]] = Map(
      'a' -> Seq("e", "o", "u"),
      'e' -> Seq("a", "i", "o"),
      'i' -> Seq("a", "e", "ee", "ea"),
      'o' -> Seq("a", "e", "u", "oo"),
      'u' -> Seq("a", "oo", "o"))

    val swapped = for {
      pos <- 0 until word.length
      replacement <- swaps.getOrElse(word(pos), Seq.empty[String])
    } yield word.substring(0, pos) + replacement + word.substring(pos + 1)

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= swapped
      .filterNot(_.equals(word))
      .map(candidate => Array(candidate, "vowel_dilemma"))
    tagged
  }

  /** Vowel-swap variants of `word1 + "s"`, tagged "vowel_dilemma_p_extend1".
    *
    * The plural form is built first; then each lowercase vowel in it is replaced,
    * one position at a time (left to right), by the alternatives in the table
    * below, in table order. Results equal to the plural form are dropped.
    */
  private def vowelDilemmaAndPluralExtend1(word1: String): ArrayBuffer[Array[String]] = {
    val plural = word1.concat("s")
    val alternatives: Map[Char, Seq[String]] = Map(
      'a' -> Seq("e", "o", "u"),
      'e' -> Seq("a", "i", "o"),
      'i' -> Seq("a", "e", "ee", "ea"),
      'o' -> Seq("a", "e", "u", "oo"),
      'u' -> Seq("a", "oo", "o"))

    val swapped = plural.indices.flatMap { pos =>
      val before = plural.substring(0, pos)
      val after = plural.substring(pos + 1)
      alternatives.getOrElse(plural(pos), Seq.empty[String]).map(alt => before + alt + after)
    }

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= swapped
      .filterNot(_.equals(plural))
      .map(candidate => Array(candidate, "vowel_dilemma_p_extend1"))
    tagged
  }

  /** Vowel-swap variants of `word1 + "es"`, tagged "vowel_dilemma_p_extend2".
    *
    * The "es" plural is built first; then each lowercase vowel in it (including
    * the appended "e") is replaced, one position at a time (left to right), by
    * the alternatives in the table below, in table order. Results equal to the
    * plural form are dropped.
    */
  private def vowelDilemmaAndPluralExtend2(word1: String): ArrayBuffer[Array[String]] = {
    val plural = word1.concat("es")
    val alternatives: Map[Char, Seq[String]] = Map(
      'a' -> Seq("e", "o", "u"),
      'e' -> Seq("a", "i", "o"),
      'i' -> Seq("a", "e", "ee", "ea"),
      'o' -> Seq("a", "e", "u", "oo"),
      'u' -> Seq("a", "oo", "o"))

    val swapped = plural.indices.flatMap { pos =>
      val before = plural.substring(0, pos)
      val after = plural.substring(pos + 1)
      alternatives.getOrElse(plural(pos), Seq.empty[String]).map(alt => before + alt + after)
    }

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= swapped
      .filterNot(_.equals(plural))
      .map(candidate => Array(candidate, "vowel_dilemma_p_extend2"))
    tagged
  }

  /** Combination of a dropped character and a vowel swap, tagged "vow_dil_single_missing".
    *
    * For each position whose character is outside '0'..'9', removes that
    * character and feeds the shortened word to `vowelDilemma`, keeping only the
    * variant text. Results equal to `word` are dropped.
    */
  private def vowelDilemmaAndSingleMissingLetter(word: String): ArrayBuffer[Array[String]] = {
    val combined = word.indices
      .filter(pos => word(pos) < '0' || word(pos) > '9')
      .flatMap { pos =>
        val shortened = word.substring(0, pos) + word.substring(pos + 1)
        vowelDilemma(shortened).map(pair => pair(0))
      }

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= combined
      .filterNot(_.equals(word))
      .map(candidate => Array(candidate, "vow_dil_single_missing"))
    tagged
  }

  /** Replaces the last two characters of `word` with "ies", tagged "ies_ending".
    *
    * Throws for words shorter than two characters (the substring bound goes negative).
    */
  private def slangWiseWrongUsageTest(word: String): Array[String] = {
    val stem = word.substring(0, word.length - 2)
    Array(s"${stem}ies", "ies_ending")
  }

  /** Phonetic / slang respellings, tagged "slangwise_wrong_usage".
    *
    * A single table maps a 1-, 2- or 3-letter pattern to its replacement
    * spellings. At every position (left to right) the 1-letter pattern is tried
    * first, then the 2-letter pattern, then the 3-letter pattern; each match
    * emits one variant per replacement, in table order, with the matched letters
    * swapped out. Variants are not compared against `word`.
    */
  private def slangWiseWrongUsage(word: String): ArrayBuffer[Array[String]] = {
    val respellings: Map[String, Seq[String]] = Map(
      // single letters
      "c" -> Seq("k", "ck", "s"),
      "d" -> Seq("dh", "th"),
      "f" -> Seq("ph", "p"),
      "g" -> Seq("j"),
      "i" -> Seq("y"),
      "j" -> Seq("z", "g"),
      "k" -> Seq("c", "ck", "kh"),
      "p" -> Seq("f", "ph"),
      "s" -> Seq("c", "z", "sh"),
      "t" -> Seq("th", "ti", "te", "tee", "tea"),
      "v" -> Seq("w"),
      "w" -> Seq("v"),
      "y" -> Seq("i"),
      "z" -> Seq("j", "s"),
      // letter pairs
      "ck" -> Seq("c", "k", "ch"),
      "sh" -> Seq("ch", "s"),
      "ch" -> Seq("c", "sh", "tch"),
      "th" -> Seq("t", "d", "dh"),
      "ph" -> Seq("f"),
      "oo" -> Seq("u"),
      "ee" -> Seq("i"),
      "al" -> Seq("le"),
      "le" -> Seq("al", "el", "il"),
      // letter triples
      "ear" -> Seq("are", "ere"),
      "are" -> Seq("ear", "ere"),
      "ere" -> Seq("ear", "are"))

    val total = word.length
    val respelt = for {
      pos <- 0 until total
      width <- 1 to 3 if pos + width <= total
      replacement <- respellings.getOrElse(word.substring(pos, pos + width), Seq.empty[String])
    } yield word.substring(0, pos) + replacement + word.substring(pos + width)

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= respelt.map(candidate => Array(candidate, "slangwise_wrong_usage"))
    tagged
  }

  /** Variants with two adjacent characters transposed, tagged "adjacent_swap".
    *
    * One variant per adjacent pair, left to right. A pair of identical
    * characters yields the word itself; no filtering happens here.
    */
  private def adjacentSwap(word: String): ArrayBuffer[Array[String]] = {
    val transposed = (1 until word.length).map { pos =>
      s"${word.substring(0, pos - 1)}${word(pos)}${word(pos - 1)}${word.substring(pos + 1)}"
    }
    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= transposed.map(candidate => Array(candidate, "adjacent_swap"))
    tagged
  }

  /** Variants where the two characters around a middle character trade places,
    * tagged "neighbours_swap".
    *
    * For every interior position `mid`, the three-character window
    * (mid - 1, mid, mid + 1) is reversed. No filtering happens here.
    */
  private def neighboursSwap(word: String): ArrayBuffer[Array[String]] = {
    val reversedWindows = (1 until word.length - 1).map { mid =>
      s"${word.substring(0, mid - 1)}${word(mid + 1)}${word(mid)}${word(mid - 1)}${word.substring(mid + 2)}"
    }
    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= reversedWindows.map(candidate => Array(candidate, "neighbours_swap"))
    tagged
  }

  /** Variants with one stray character inserted, tagged "extra_letters".
    *
    * For every position whose character has an `approxKey` entry, and for every
    * character of that entry, emits the word with that character inserted just
    * before the position and then just after it. Positions are visited left to
    * right. No filtering or de-duplication happens here.
    */
  private def extraLetters(word: String): ArrayBuffer[Array[String]] = {
    val padded = for {
      pos <- 0 until word.length
      neighbours <- approxKey.get(word.charAt(pos)).toList
      stray <- neighbours
      candidate <- Seq(
        word.substring(0, pos) + stray + word.substring(pos),        // inserted before
        word.substring(0, pos + 1) + stray + word.substring(pos + 1)) // inserted after
    } yield candidate

    val tagged = ArrayBuffer.empty[Array[String]]
    tagged ++= padded.map(candidate => Array(candidate, "extra_letters"))
    tagged
  }

import scala.collection.mutable.{ArrayBuffer, HashMap}
import scala.util.Random
generateAllTypoVariants: (word: String)Array[Array[String]]


In [54]:
// Load the templated-queries parquet dataset; declared as `var` because later cells reassign it.
var df_tq = spark.read.parquet("/data1/searchengine/EmbeddingTrainingDataCuration/ajio/09122024/V2/TemplatedQueries")

df_tq: org.apache.spark.sql.DataFrame = [query: array<string>, query_template: array<string> ... 6 more fields]


In [55]:
// Keep only the `query` column (an array of strings at this point).
df_tq = df_tq.select("query")

df_tq: org.apache.spark.sql.DataFrame = [query: array<string>]


In [56]:
import scala.util.Random

// Random source used to pick one variant per query.
var rand = new Random()

/** Returns one randomly chosen entry from `generateAllTypoVariants(query_string)`.
  *
  * The whole query string is passed to the generator as a single "word". The
  * pick may be the unchanged query, because the generator's result also
  * contains the input itself.
  *
  * A null input is passed through as null, and the input is returned unchanged
  * if the generator yields nothing (previously `nextInt(0)` threw in that case).
  */
def add_mistake(query_string:String)={
    if (query_string == null) query_string
    else {
        val mistakes = generateAllTypoVariants(query_string)
        if (mistakes.isEmpty) query_string
        else mistakes(rand.nextInt(mistakes.length))(0)
    }
}

// The function is random, so it is declared non-deterministic: otherwise Spark
// treats the UDF as deterministic and may duplicate or re-order its evaluation
// during optimisation.
var add_mistake_udf = udf(add_mistake _).asNondeterministic()

import scala.util.Random
rand: scala.util.Random = scala.util.Random@65dc501c
add_mistake: (query_string: String)String
add_mistake_udf: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$6035/1139164203@331a63c0,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)


In [57]:
// Join the elements of the `query` array into one space-separated string.
df_tq = df_tq.withColumn("query",concat_ws(" ",col("query")))

df_tq: org.apache.spark.sql.DataFrame = [query: string]


In [58]:
// Add `mistake_queries`: the result of add_mistake_udf applied to each query string.
df_tq = df_tq.withColumn("mistake_queries",add_mistake_udf(col("query")))

df_tq: org.apache.spark.sql.DataFrame = [query: string, mistake_queries: string]


In [59]:
// Keep only queries of at most 25 characters.
df_tq = df_tq.filter(length(col("query")) <=25 )

df_tq: org.apache.spark.sql.DataFrame = [query: string, mistake_queries: string]


In [60]:
// Take an (unseeded) 50% random sample capped at 2000 rows. The result is cached
// because both the sample and the typo UDF are random: without caching, every
// later action (show, each CSV write) would recompute the plan and could see
// different rows and different typos. Caching is best effort — evicted blocks
// are recomputed.
df_tq = df_tq.sample(fraction=0.5).limit(2000).cache()

df_tq: org.apache.spark.sql.DataFrame = [query: string, mistake_queries: string]


In [61]:
// Print the first rows without truncating long cell values.
df_tq.show(false)

+-------------------------+--------------------------+
|query                    |mistake_queries           |
+-------------------------+--------------------------+
| satchels handbags       | satchels handugs         |
| floral printed saree    | loral printed sarie      |
|021-black short tops     |021-black shiort tops     |
|612 league casual tops   |612 league caoal tops     |
|9 impression             |9 imprassion              |
|a-line dresses in komarri|a-ine dresses in omarri   |
|a-line dresses in rirasa |a-leene dressesin rirasa  |
|abstract shirts in kotty |abstroct shrts in kotty   |
|abstract ties in cazzano |absftract ties in cazzano |
|acai occasion flat shoes |acai eccasin flat shoes   |
|adamo timewear & eyewear |adam teamewear & eyewear  |
|adidas footwear for women|adidas footwear far womens|
|adobe tshirts for men    |udobe tshirts for mn      |
|afast lighting           |afast lighbting           |
|air garb tshirts for men |air garb tshirts far menes|
|airforce 

In [62]:
// Write the DataFrame as CSV with a header row to a `file:` URI (local filesystem).
// NOTE(review): Spark's CSV writer presumably creates a directory of part files at this
// path rather than a single .csv file, and fails if the path already exists — confirm.
df_tq.write.option("header",true).csv("file:////app/notebooks/avinash/SpellCheck-test-data/spell_check_test_data.csv")

In [63]:
// Saving to HDFS: same CSV-with-header write, but the path carries no scheme, so it
// resolves against the session's default filesystem (HDFS per the note above — verify).
df_tq.write.option("header",true).csv("/data1/archive/avinash/SearchTests/TestData/SpellCheck/spell_check_test_data.csv")