In [None]:
import os
from pyspark import SparkConf
from pyspark.sql import *
from pyspark import SparkContext
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType, StructField
import numpy as np
import shutil
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

import time
import pickle
import pymssql
import pandas as pd
import dask.dataframe as dd
from functools import reduce
import warnings
warnings.filterwarnings('ignore')

import datetime

from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, BatchNormalization, Input, concatenate, add, multiply, Reshape, Dropout, Activation, LSTM,RNN 
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import model_from_json


In [None]:
import gensim

import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize

In [None]:
%run utils.ipynb

### Word Classification

### 1-) Identifies the Words

##### Character-level embedding of words

In [None]:
# Load the English word list, expose its word column as "Vocab",
# and tag every row with its language label.
english_words = (
    pd.read_csv("Armut_ML_Case_-_Eng.csv")
    .assign(Label="English")
    .rename(columns={"Words": "Vocab"})
)

In [None]:
english_words.head()

In [None]:
# Load the Turkish word list, expose its word column ("Kelimeler") as "Vocab",
# and tag every row with its language label.
turkish_words = (
    pd.read_csv("Armut_ML_Case_-_Turkish.csv")
    .assign(Label="Turkish")
    .rename(columns={"Kelimeler": "Vocab"})
)

In [None]:
turkish_words.head()

In [None]:
# Stack the Turkish and English frames into one vocabulary table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with ignore_index=True yields the same rows with a fresh 0..n-1 index
# (equivalent to the former append(...).reset_index(drop=True)).
raw_vocab_df = pd.concat([turkish_words, english_words], ignore_index=True)
# Cast every entry to str so that missing values and numeric-looking tokens can
# be iterated character by character.
raw_vocab_df["Vocab"] = raw_vocab_df["Vocab"].astype(str)

In [None]:
raw_vocab_df.head()

#### Flag entries that contain whitespace in the raw data (removal is currently disabled)

In [None]:
# Mark every entry that contains at least one space character.
raw_vocab_df["IsThereWhiteSpace"] = raw_vocab_df["Vocab"].map(lambda entry: ' ' in entry)

In [None]:
#raw_vocab_df["Vocab"] = raw_vocab_df["Vocab"].apply(lambda x: x.replace(" ", ""))

In [None]:
raw_vocab_df = raw_vocab_df[["Vocab","Label"]]

In [None]:
"""Checking Data Balance"""

print("Toplam Türkçe Kelime Sayısı {} \n".format(len(turkish_words)), "Number of English Words {}".format(len(english_words)))

In [None]:
"""Checking Number Of Unique Words"""

print("Toplam Türkçe Kelime Sayısı {}, Toplam Eşsiz Türkçe Kelime Sayısı {} \n".format(len(turkish_words), len(set(turkish_words["Vocab"]))),
      
      "Number of English Words {}, Number of Unique English Words {}".format(len(english_words), len(set(english_words["Vocab"]))))

In [None]:
# Length (in characters) of the longest entry across both vocabularies.
max_length_of_turkish_and_english_words = max(
    max(len(str(kelime)) for kelime in turkish_words["Vocab"]),
    max(len(str(word)) for word in english_words["Vocab"]),
)

print("max_length_of_turkish_and_english_words = {} ".format(max_length_of_turkish_and_english_words))

### Character Embedding Model

In [None]:
# Alphabets that make up the character vocabulary. The trailing ", " in the
# English string makes the space character part of the vocabulary.
turkish_char_lookup_list = "a,b,c,ç,d,e,f,g,ğ,h,ı,i,j,k,l,m,n,o,ö,p,r,s,ş,t,u,ü,v,y,z".split(",")
english_char_lookup_list = "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z, ".split(",")

# Sort the union so the char -> index mapping is deterministic. Iterating a set
# of strings yields a different order in each interpreter session (string
# hashing is randomised), which would change what every embedding index means
# between runs and make saved model weights meaningless after a restart.
char_list = sorted(set(turkish_char_lookup_list) | set(english_char_lookup_list))

# Index 0 is reserved for characters outside both alphabets.
unknown_char = 'UNKNOWN'
char_list.insert(0, unknown_char)

num_of_char = len(char_list)
# Forward (char -> index) and reverse (index -> char) lookup tables.
char_indexes = {char: index for index, char in enumerate(char_list)}
index_chars = dict(enumerate(char_list))

In [None]:
# Class names; a label's position in this list is its integer class id.
labels = ["Turkish", "English"]

# Forward (name -> id) and reverse (id -> name) lookup tables.
label_indexes = {name: position for position, name in enumerate(labels)}
index_labels = dict(enumerate(labels))

In [None]:
label_indexes

#### Check if there is any char in both Turkish and English raw data that is not in char_list

In [None]:
# Every distinct character that occurs anywhere in the combined vocabulary.
turkish_and_english_unique_letters_set = {
    letter
    for entry in raw_vocab_df["Vocab"]
    for letter in str(entry)
}

In [None]:
turkish_and_english_unique_letters_set - set(char_list)

In [None]:
def preprocess_raw_data(raw_vocab_df, label_indexes, max_length_of_vocab, char_index_map=None):
    """Encode a vocabulary frame as integer character indices plus class ids.

    Parameters
    ----------
    raw_vocab_df : DataFrame with a "Vocab" column (the text) and a "Label"
        column (a key of `label_indexes`).
    label_indexes : dict mapping label name -> integer class id.
    max_length_of_vocab : int, number of character slots per entry. Shorter
        entries are right-padded with 0; longer entries are truncated.
    char_index_map : optional dict mapping character -> integer index. It must
        contain the key 'UNKNOWN'. Defaults to the notebook-level
        `char_indexes` table.

    Returns
    -------
    (X, y) where X is an int32 array of shape
    (len(raw_vocab_df), max_length_of_vocab, 1) and y is a 1-D array of class ids.

    Raises
    ------
    KeyError if a label is missing from `label_indexes`.
    """
    if char_index_map is None:
        char_index_map = char_indexes
    unknown_index = char_index_map['UNKNOWN']

    X_input_data = np.zeros((len(raw_vocab_df), max_length_of_vocab), np.int32)
    y_input_data = []

    # Enumerate rows by position rather than by index label, so the frame does
    # not need a clean 0..n-1 index (e.g. after filtering or a split).
    rows = zip(raw_vocab_df["Vocab"], raw_vocab_df["Label"])
    for row_position, (vocab, label) in enumerate(rows):
        # Truncate so an entry longer than the matrix width cannot overflow it.
        for char_position, char in enumerate(str(vocab)[:max_length_of_vocab]):
            # dict.get is a single O(1) lookup with the UNKNOWN fallback.
            X_input_data[row_position, char_position] = char_index_map.get(char, unknown_index)

        y_input_data.append(label_indexes[label])

    return X_input_data.reshape(-1, max_length_of_vocab, 1), np.array(y_input_data)

In [None]:
X_input_data, y_input_data = preprocess_raw_data(raw_vocab_df, label_indexes, 
                                            max_length_of_vocab = max_length_of_turkish_and_english_words)

#### Checking shape of Inputs

In [None]:
print("X_input_data shape = {}".format(X_input_data.shape))
print("y_input_data shape = {}".format(y_input_data.shape))

#### Stratified train/test split on the class label to keep the classes balanced

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_input_data, y_input_data, stratify = y_input_data, test_size=0.001)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify = y_train, test_size=0.2)

In [None]:
print("length of X_train = {}".format(len(X_train)), "length of X_val = {}".format(len(X_val)), "length of X_test = {}".format(len(X_test)))

#### Model Configs

In [None]:
hparams = {"HP_NUM_UNITS":64 ,
           "HP_DROPOUT":0.1 ,
           "HP_OPTIMIZER":"RMSprop"}

verbose = True

In [None]:
def get_model(X_input_data, num_of_unique_chars, embedding_dimension):
    """Build the character-embedding + LSTM binary classifier.

    Parameters
    ----------
    X_input_data : array whose `.shape[1]` and `.shape[2]` define the input
        shape (presumably (samples, sequence_length, 1) as produced by
        preprocess_raw_data — the Reshape below only works when the last
        dimension is 1).
    num_of_unique_chars : int, size of the character vocabulary (the
        Embedding layer's input dimension).
    embedding_dimension : int, size of each character embedding vector.

    Returns
    -------
    An uncompiled Keras `Model` with a single sigmoid output unit.

    Dense width and dropout rate are read from the notebook-level `hparams`.
    """
    sequence_length = X_input_data.shape[1]
    features_per_step = X_input_data.shape[2]

    word_embedding_input = Input(shape=(sequence_length, features_per_step), dtype='int32')
    # The Input layer already fixes the shape, so Embedding needs no input_shape.
    # (The previous code passed data rows X_input_data[1]/[2] there instead of
    # shape dimensions, and ignored num_of_unique_chars in favour of a global.)
    embedded_input = Embedding(num_of_unique_chars, embedding_dimension)(word_embedding_input)
    # Drop the singleton feature axis: (seq, 1, emb) -> (seq, emb).
    reshaped_embedding = Reshape((sequence_length, embedding_dimension), name="reshape_embedding")(embedded_input)

    lstm_layer = LSTM(64, return_sequences=False, return_state=False, dropout=0.1)(reshaped_embedding)

    dense_layer = Dense(hparams["HP_NUM_UNITS"], kernel_initializer='normal')(lstm_layer)
    # NOTE(review): two consecutive BatchNormalization layers look like an
    # accidental duplicate; both are kept so the architecture (and any weights
    # saved from it) stays unchanged.
    dense_layer = BatchNormalization()(dense_layer)
    dense_layer = BatchNormalization()(dense_layer)
    dense_layer = Activation('relu')(dense_layer)
    dense_layer = Dropout(hparams["HP_DROPOUT"])(dense_layer)
    output = Dense(1, activation='sigmoid')(dense_layer)

    model = Model(inputs=[word_embedding_input], outputs=[output])

    # summary() prints by itself and returns None; wrapping it in print() only
    # added a stray "None" line.
    model.summary()

    return model

In [None]:
model = get_model(X_train, num_of_unique_chars = num_of_char, embedding_dimension = 16)

#### Training

In [None]:
# NOTE(review): the network ends in a single sigmoid unit for a two-class
# problem; binary cross-entropy is the conventional loss for that. MSE/RMSE are
# kept here so reported numbers stay comparable with earlier runs.
model.compile(optimizer=hparams["HP_OPTIMIZER"], loss='mean_squared_error', metrics=[RootMeanSquaredError(name='rmse')])

log_dir_path = "lstm_model/logs"
model_path = "lstm_model"
model_weight_path = "lstm_model/model_weights"

# The TensorBoard callback is configured but, as before, not passed to fit();
# add it to `callbacks` below to actually write logs to log_dir_path.
tensor_board = TensorBoard(log_dir=log_dir_path, histogram_freq=1, write_graph=True, write_images=False)
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
# NOTE(review): same patience as EarlyStopping, so the learning-rate reduction
# fires on the epoch training stops — consider a smaller patience here.
reduce_lr_on_plateau = ReduceLROnPlateau(patience=3)

# One fit call for both verbosity modes. The previous non-verbose branch wrote
# to model_history[forecast_start_date][direction], none of which is defined in
# this notebook, and raised NameError. The History object is now always kept.
model_history = model.fit(
    [X_train], [y_train],
    epochs=100,
    batch_size=1024,
    verbose=1 if verbose else 2,
    shuffle=True,
    validation_data=([X_val], [y_val]),
    callbacks=[early_stopping, reduce_lr_on_plateau],
)

# Saving model parameters..
model_dir = os.path.join(os.getcwd(), model_path)
os.makedirs(model_dir, exist_ok=True)  # no separate exists() check needed
model_json = model.to_json()

with open(os.path.join(model_dir, "model.json"), "w") as json_file:
    json_file.write(model_json)

weights_dir = os.path.join(os.getcwd(), model_weight_path)
os.makedirs(weights_dir, exist_ok=True)

# serialize weights to HDF5
model.save_weights(os.path.join(weights_dir, "model_weights.h5"))

#### Testing

In [None]:
preds = model.predict(X_test)

In [None]:
# Side-by-side table of predicted vs. actual language names for the test set.
# `preds` is flattened and rounded to 0/1, then both columns are mapped from
# class id to label name through index_labels.
pred_df = pd.DataFrame({
                        "Predictions": np.vectorize(index_labels.get)(np.round(preds.reshape(-1,))), 
                        "GrandTruth": np.vectorize(index_labels.get)(np.array(y_test))
                       })

In [None]:
pred_df

### Junk Word Classifier

In [None]:
import math
import pickle
from collections import Counter

In [None]:
# Alphabets for the junk-word model. The trailing ", " in the Turkish string
# makes the space character part of the vocabulary.
turkish_char_lookup_list = "a,b,c,ç,d,e,f,g,ğ,h,ı,i,j,k,l,m,n,o,ö,p,r,s,ş,t,u,ü,v,y,z, ".split(",")
english_char_lookup_list = "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z".split(",")

# Sort the union so each character always gets the same index. The transition
# matrix is pickled to disk and indexed through char_indices when reloaded;
# with an unsorted set the order changes per interpreter session and a matrix
# loaded after a kernel restart would be read with the wrong rows and columns.
char_list = sorted(set(turkish_char_lookup_list) | set(english_char_lookup_list))

char_indices = {char: idx for idx, char in enumerate(char_list)}

print("char_indices = {}".format(char_indices))

In [None]:
# Length (in characters) of the longest entry across both vocabularies.
max_length_of_turkish_and_english_words = max(
    max(len(str(kelime)) for kelime in turkish_words["Vocab"]),
    max(len(str(word)) for word in english_words["Vocab"]),
)

print("max_length_of_turkish_and_english_words = {} ".format(max_length_of_turkish_and_english_words))

In [None]:
# Reload both word lists for the junk-word model; no language label is attached
# here. The mid-cell .head() calls were dropped: only a cell's last expression
# is displayed, so they showed nothing.
english_words = pd.read_csv("Armut_ML_Case_-_Eng.csv").rename(columns={"Words": "Vocab"})
turkish_words = pd.read_csv("Armut_ML_Case_-_Turkish.csv").rename(columns={"Kelimeler": "Vocab"})

# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# gives the same rows with a fresh 0..n-1 index.
raw_vocab_df = pd.concat([turkish_words, english_words], ignore_index=True)
# Cast to str so missing values and numeric-looking tokens can be iterated
# character by character.
raw_vocab_df["Vocab"] = raw_vocab_df["Vocab"].astype(str)

raw_vocab_df.head()

In [None]:
X_train, X_test = train_test_split(raw_vocab_df, test_size=0.001)

In [None]:
print("length of X_train = {}".format(len(X_train)), "length of X_test = {}".format(len(X_test)))

In [None]:
def convert_sentence_into_chars(line):
    """Return the lower-cased characters of `line` that appear in `char_list`.

    Characters whose lower-cased form is not in the notebook-level `char_list`
    are dropped; the order of the remaining characters is preserved.
    """
    kept_chars = []
    for raw_char in line:
        lowered = raw_char.lower()
        if lowered in char_list:
            kept_chars.append(lowered)
    return kept_chars

def ngram(n_gram, word):
    """Return all consecutive character n-grams of `word` as tuples.

    `word` is first normalised with convert_sentence_into_chars, so characters
    it drops never appear in an n-gram.

    Parameters
    ----------
    n_gram : int >= 1, number of characters per tuple.
    word : str, the text to split.

    Returns
    -------
    list of tuples of length `n_gram`, in order of occurrence; empty when the
    normalised word has fewer than `n_gram` characters.

    Raises
    ------
    ValueError if `n_gram` is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError("n_gram must be >= 1, got {}".format(n_gram))

    word_char_list = convert_sentence_into_chars(word)

    # Slice exactly n_gram characters per window. The previous version always
    # emitted (char[start], char[start + 1]) regardless of n_gram, which is only
    # correct for n_gram == 2 and raised IndexError for n_gram == 1.
    return [
        tuple(word_char_list[start:start + n_gram])
        for start in range(len(word_char_list) - n_gram + 1)
    ]
        
def avg_transition_prob(l, log_prob_matrix):
    """Return exp(mean log-probability) over the character bigrams of `l`.

    `log_prob_matrix[i][j]` is read as the log-probability of moving from the
    character with index i to the character with index j (indices come from the
    notebook-level `char_indices`). Text with no bigram at all scores 1.0.
    """
    transitions = ngram(2, l)
    if not transitions:
        # No transitions to average: exp(0.0) == 1.0.
        return math.exp(0.0)

    total_log_prob = 0.0
    for source_char, target_char in transitions:
        source_row = log_prob_matrix[char_indices[source_char]]
        total_log_prob += source_row[char_indices[target_char]]

    return math.exp(total_log_prob / len(transitions))
            

In [None]:
def markov_chain_model(word_list, output_path='junk_and_genuine_word_classifier_model.pkl'):
    """Fit a character-bigram Markov model and persist it with a junk threshold.

    Parameters
    ----------
    word_list : iterable of str. Pass the word column itself, not a whole
        DataFrame: iterating a DataFrame yields its column names.
    output_path : str, where the pickled model is written. Defaults to the
        file name this notebook reads back in its testing cell.

    Returns
    -------
    dict with keys 'log_prob_matrix' (list of lists of log transition
    probabilities, indexed through `char_indices`) and 'threshold' (float).
    The same dict is pickled to `output_path`.
    """
    alphabet_size = len(char_list)

    # Start every count at 1 (add-one smoothing) so that no transition ends up
    # with probability zero and log() is always defined.
    transition_counts = [[1] * alphabet_size for _ in range(alphabet_size)]

    for word in word_list:
        for first_char, second_char in ngram(2, word):
            transition_counts[char_indices[first_char]][char_indices[second_char]] += 1

    # Normalise each row to probabilities and move to log space.
    log_prob_matrix = []
    for row in transition_counts:
        total_occurences = float(sum(row))
        log_prob_matrix.append([math.log(count / total_occurences) for count in row])

    # They are selected from Armut_ML_Case_-_Eng.csv and Armut_ML_Case_-_Turkish.csv
    genuine_word_samples = ['aslihan','two models','buraya güzel bir şey yazmak istiyorum','I want to say something','a b c']
    junk_word_samples = ['asdfgh','cvbnmö','zxcvnadtruqe','ertyuıopğü','qwer <>zxcvb']

    # Score a few hand-picked genuine and junk phrases with the fitted matrix.
    good_probs = [avg_transition_prob(line, log_prob_matrix) for line in genuine_word_samples]
    bad_probs = [avg_transition_prob(line, log_prob_matrix) for line in junk_word_samples]

    # Threshold halfway between the worst genuine and the best junk score.
    threshold = (min(good_probs) + max(bad_probs)) / 2

    model = {'log_prob_matrix': log_prob_matrix, 'threshold': threshold}
    # Context manager guarantees the file is flushed and closed; the previous
    # bare open() left the handle to the garbage collector.
    with open(output_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    return model
    

In [None]:
# Train on the word strings themselves. X_train is a DataFrame, and iterating a
# DataFrame yields its column names, so passing it directly fitted the model on
# the single string "Vocab" instead of the vocabulary.
markov_chain_model(X_train["Vocab"])

### Testing

In [None]:
# Load the model written by markov_chain_model. This file is produced by this
# notebook; pickle.load must never be pointed at an untrusted file because
# unpickling can execute arbitrary code.
with open('junk_and_genuine_word_classifier_model.pkl', 'rb') as model_file:
    junk_and_genuine_word_classifier_model = pickle.load(model_file)

log_prob_matrix = junk_and_genuine_word_classifier_model['log_prob_matrix']
threshold = junk_and_genuine_word_classifier_model['threshold']

# 0 = average transition probability above the threshold, 1 = at or below it.
preds = [
    0 if avg_transition_prob(test_word, log_prob_matrix) > threshold else 1
    for test_word in X_test["Vocab"]
]

# NOTE(review): the "GrandTruth" column holds the tested word itself, not a
# junk/genuine label — the source data carries no such label.
preds_df = pd.DataFrame({"Predictions":preds, "GrandTruth": X_test["Vocab"]})

In [None]:
preds_df 

### Simulation (Brownian Motion)

In [None]:
"""
Bt2 − Bt1 = N(0,t2 − t1 ), where N(0,t2 − t1) is a normal distribution with variance t2 − t1
"""

In [None]:
import math
import random
def brownianMotion(timePoints, start_point=(1, 1)):
    """Simulate a 2-D Brownian trajectory sampled at the given time points.

    Each coordinate increment between consecutive time points t1 < t2 is drawn
    independently from a normal distribution with mean 0 and variance t2 - t1.

    Parameters
    ----------
    timePoints : sequence of non-decreasing numbers.
    start_point : (x, y) position at timePoints[0]; defaults to (1, 1).

    Returns
    -------
    list of (x, y) tuples, one per time point (a single start point when
    `timePoints` has fewer than two entries).

    Raises
    ------
    ValueError if the time points decrease at any step.
    """
    brownianTrajectory = [tuple(start_point)]

    for previous_time, next_time in zip(timePoints, timePoints[1:]):
        elapsed = next_time - previous_time
        if elapsed < 0:
            raise ValueError("timePoints must be non-decreasing")

        # random.gauss takes the STANDARD DEVIATION, not the variance, so the
        # variance (t2 - t1) has to be square-rooted first.
        step_std = math.sqrt(elapsed)

        last_x, last_y = brownianTrajectory[-1]
        random_number_x = random.gauss(0, step_std)
        random_number_y = random.gauss(0, step_std)
        brownianTrajectory.append((last_x + random_number_x, last_y + random_number_y))

    return brownianTrajectory

In [None]:
# Sampling grid for the simulation: from 1 up to (not including) 100 in steps of 0.01.
timePoints = list(np.arange(1, 100, 0.01))

In [None]:
brownianTrajectory = brownianMotion(timePoints)