In [1]:
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time, os


In [2]:

# Pickle files read with pd.read_pickle; each one is run through the whole
# pipeline and is expected to hold a DataFrame with question / good-answer /
# bad-answer embedding columns (see find_embedding_columns).
EMBEDDING_PICKLES = [
	"soru_cevap_embeddings.pkl",
	"e5_large_embeddings.pkl"
]

# Directory every figure is saved into.
IMGS_DIR = "imgs"

# Number of repeated runs (fresh random initial weights each) in the
# single-layer optimizer comparison.
NUM_SEEDS = 5
# Epochs for the optimizer comparisons (default of run_pipeline_for_pickle).
EPOCHS_MAIN = 200
# Epochs for the MLP in the dataset-size experiment.
EPOCHS_MLP_SMALL = 30
# Epochs for the "MLP vs single-layer model" experiment.
EPOCHS_MLP_COMPARE = 50
# Base learning rate; the pipeline scales it per optimizer
# (x0.1 for SGD, x0.5 for AdaGrad, x0.01 for RMSProp).
LEARNING_RATE = 0.01
# Fractions of the training split used in the dataset-size experiment.
SIZES = [0.05, 0.1, 0.2, 0.5, 0.8, 1.0]


In [3]:

class actualModel:
	"""Single-layer model ``tanh(X . w)`` scored with mean squared error.

	The sign of the output is used as the predicted class in ``accuracy``.
	"""

	def __init__(self, input_dim, initial_w=None) -> None:
		"""Create the model.

		Args:
			input_dim: number of input features; only used when the weights
				are drawn randomly, giving ``w`` the shape ``(input_dim, 1)``.
			initial_w: optional starting weights. They are copied, so the
				caller's array is never modified by training.
		"""
		self.name = "Normal"
		self.w = np.random.randn(input_dim, 1) * 0.01 if initial_w is None else initial_w.copy()

	def forward(self, X):
		"""Return ``tanh`` of the linear response ``X . w``."""
		return np.tanh(np.dot(X, self.w))

	def loss(self, y_true, y_pred):
		"""Return the mean squared error between targets and predictions."""
		residual = y_true - y_pred
		return np.mean(residual**2)

	def accuracy(self, y_true, y_pred):
		"""Return the fraction of entries where ``sign(y_pred)`` equals ``y_true``."""
		return np.mean(np.sign(y_pred) == y_true)

	def get_gradients(self, X_batch, y_batch, y_pred):
		"""Return the gradient of the MSE loss with respect to ``w`` for one batch.

		``y_pred`` must be the output of ``forward(X_batch)``; it is reused to
		evaluate the tanh derivative ``1 - tanh(z)**2``.
		"""
		batch_len = X_batch.shape[0]
		# Error signal pushed back through the tanh non-linearity.
		delta = (y_pred - y_batch) * (1 - y_pred**2)
		return np.dot(X_batch.T, delta) * 2 / batch_len

class TwoLayerMLP:
	"""Two-layer tanh network without bias terms: ``tanh(tanh(X . W1) . W2)``.

	Weights live in ``self.params`` (keys ``'W1'`` and ``'W2'``) so that the
	optimizer classes can update them in place.
	"""

	def __init__(self, input_dim, hidden_dim=64):
		"""Draw both weight matrices from a standard normal scaled by 0.05."""
		self.name = f"MLP-{hidden_dim}"
		# W1 is drawn before W2 so the global RNG is consumed in a fixed order.
		first_layer = np.random.randn(input_dim, hidden_dim) * 0.05
		second_layer = np.random.randn(hidden_dim, 1) * 0.05
		self.params = {'W1': first_layer, 'W2': second_layer}
		self.cache = {}

	def forward(self, X):
		"""Return the network output and cache the activations for backprop."""
		hidden = np.tanh(np.dot(X, self.params['W1']))
		output = np.tanh(np.dot(hidden, self.params['W2']))
		self.cache = {'X': X, 'a1': hidden, 'a2': output}
		return output

	def get_gradients(self, X, y_true, y_pred):
		"""Return MSE gradients for both layers as ``{'W1': ..., 'W2': ...}``.

		Relies on the hidden activations cached by the most recent ``forward``
		call, so ``forward(X)`` must have been called on the same ``X`` first.
		"""
		n_rows = X.shape[0]
		hidden = self.cache['a1']
		# Output-layer error, already averaged over the batch.
		out_delta = 2 * (y_pred - y_true) * (1 - y_pred**2) / n_rows
		# Error propagated back through W2 and the hidden tanh.
		hidden_delta = np.dot(out_delta, self.params['W2'].T) * (1 - hidden**2)
		return {'W1': np.dot(X.T, hidden_delta), 'W2': np.dot(hidden.T, out_delta)}


In [4]:

class Optimizer:
	"""Interface for update rules: subclasses modify ``params`` in place."""

	def update(self, params, grads):
		"""Apply one optimization step; must be overridden by subclasses."""
		raise NotImplementedError

class SGDOpt(Optimizer):
	"""Plain gradient step: ``param -= lr * grad``."""

	def __init__(self, lr=0.01):
		"""Store the learning rate ``lr``."""
		self.lr = lr

	def update(self, params, grads):
		"""Update every entry of ``params`` using the same-named entry of ``grads``."""
		for name in params:
			step = self.lr * grads[name]
			params[name] -= step

class AdaGradOpt(Optimizer):
    """AdaGrad: each step is divided by the root of the summed squared gradients."""

    def __init__(self, lr=0.01, epsilon=1e-8):
        """Store the hyper-parameters and start with an empty accumulator."""
        self.lr = lr
        self.eps = epsilon
        # Running sum of squared gradients, keyed like ``params``.
        self.G = {}

    def update(self, params, grads):
        """Accumulate squared gradients and update ``params`` in place."""
        for name in params:
            # Accumulators are created lazily, the first time a key is seen.
            if name not in self.G:
                self.G[name] = np.zeros_like(params[name])
            grad = grads[name]
            self.G[name] += grad**2
            params[name] -= self.lr * grad / (np.sqrt(self.G[name]) + self.eps)

class RMSPropOpt(Optimizer):
    """RMSProp: steps are divided by the root of a decaying mean of squared gradients."""

    def __init__(self, lr=0.001, decay_rate=0.9, epsilon=1e-8):
        """Store the hyper-parameters and start with an empty running average."""
        self.lr = lr
        self.dr = decay_rate
        self.eps = epsilon
        # Exponential moving average of squared gradients, keyed like ``params``.
        self.E_g2 = {}

    def update(self, params, grads):
        """Refresh the running average and update ``params`` in place."""
        for name in params:
            # Averages are created lazily, the first time a key is seen.
            if name not in self.E_g2:
                self.E_g2[name] = np.zeros_like(params[name])
            self.E_g2[name] = self.dr * self.E_g2[name] + (1 - self.dr) * (grads[name]**2)
            scale = np.sqrt(self.E_g2[name]) + self.eps
            params[name] -= self.lr * grads[name] / scale

class AdamOpt(Optimizer):
	"""Adam: bias-corrected first and second moment estimates of the gradient."""

	def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
		"""Store the hyper-parameters; moments start empty and the step count at 0."""
		self.lr = lr
		self.beta1 = beta1
		self.beta2 = beta2
		self.eps = epsilon
		# First / second moment estimates keyed like ``params``, plus step counter.
		self.m = {}
		self.v = {}
		self.t = 0

	def update(self, params, grads):
		"""Advance the step counter once, then update every entry of ``params`` in place."""
		self.t += 1
		for name in params:
			# Moment buffers are created lazily, the first time a key is seen.
			if name not in self.m:
				self.m[name] = np.zeros_like(params[name])
				self.v[name] = np.zeros_like(params[name])
			self.m[name] = self.beta1 * self.m[name] + (1 - self.beta1) * grads[name]
			self.v[name] = self.beta2 * self.v[name] + (1 - self.beta2) * (grads[name]**2)
			# Bias correction compensates for the zero-initialized moments.
			first_moment = self.m[name] / (1 - self.beta1**self.t)
			second_moment = self.v[name] / (1 - self.beta2**self.t)
			params[name] -= self.lr * first_moment / (np.sqrt(second_moment) + self.eps)


In [5]:

def train_generic(model, optimizer, X_tr, y_tr, X_te, y_te, epochs=30, batch_size=32):
	"""Train ``model`` with mini-batches, delegating the update rule to ``optimizer``.

	Args:
		model: object exposing ``forward``, ``get_gradients`` and a ``params`` dict.
		optimizer: object whose ``update(params, grads)`` modifies ``params``.
		X_tr, y_tr: training inputs and targets.
		X_te, y_te: held-out inputs and targets, only used for evaluation.
		epochs: number of passes over the training data.
		batch_size: rows per update; ``None`` or any value >= the number of
			training rows means one full-batch update per epoch.

	Returns:
		dict with per-epoch lists ``'loss'``, ``'acc'``, ``'val_loss'`` and
		``'val_acc'``, each measured after the epoch's updates.
	"""
	n_train = X_tr.shape[0]
	step = n_train if (batch_size is None or batch_size >= n_train) else batch_size
	history = {'loss': [], 'acc': [], 'val_loss': [], 'val_acc': []}

	for _ in range(epochs):
		# Reshuffle the rows at the start of every epoch.
		order = np.random.permutation(n_train)
		X_shuf = X_tr[order]
		y_shuf = y_tr[order]
		for start in range(0, n_train, step):
			stop = start + step
			X_b, y_b = X_shuf[start:stop], y_shuf[start:stop]
			batch_pred = model.forward(X_b)
			grads = model.get_gradients(X_b, y_b, batch_pred)
			optimizer.update(model.params, grads)

		train_pred = model.forward(X_tr)
		test_pred = model.forward(X_te)
		epoch_metrics = (
			('loss', np.mean((y_tr - train_pred)**2)),
			('acc', np.mean(np.sign(train_pred) == y_tr)),
			('val_loss', np.mean((y_te - test_pred)**2)),
			('val_acc', np.mean(np.sign(test_pred) == y_te)),
		)
		for key, value in epoch_metrics:
			history[key].append(value)
	return history

def train_gd(model, X, y, X_val, y_val, lr=0.01, epochs=100):
	"""Train ``model`` with full-batch gradient descent.

	Every epoch performs one update ``model.w -= lr * grad`` computed on the
	whole training set, then evaluates both splits with the updated weights.

	Args:
		model: object exposing ``forward``, ``get_gradients``, ``loss``,
			``accuracy`` and a weight array ``w`` that is updated in place.
		X, y: training inputs and targets.
		X_val, y_val: held-out inputs and targets, only used for evaluation.
		lr: learning rate.
		epochs: number of full-batch updates.

	Returns:
		dict with per-epoch lists ``'loss'``, ``'acc'``, ``'val_loss'``,
		``'val_acc'``, ``'time'`` (seconds since the start of training) and
		``'weights'`` (flattened copy of ``model.w``).
	"""
	history = {'loss': [], 'acc': [], 'val_loss': [], 'val_acc': [], 'time': [], 'weights': []}
	m0 = time.time()
	for _ in range(epochs):
		y_pred = model.forward(X)
		grad = model.get_gradients(X, y, y_pred)
		model.w -= lr * grad

		# Evaluate the training split with the *updated* weights. Reusing the
		# pre-update prediction would log a training loss one step behind the
		# validation metrics and the stored weights, and out of step with the
		# other train_* functions, which all measure after the update.
		train_pred = model.forward(X)
		val_pred = model.forward(X_val)
		history['loss'].append(model.loss(y, train_pred))
		history['acc'].append(model.accuracy(y, train_pred))
		history['val_loss'].append(model.loss(y_val, val_pred))
		history['val_acc'].append(model.accuracy(y_val, val_pred))
		history['time'].append(time.time() - m0)
		history['weights'].append(model.w.flatten().copy())
	return history

def train_sgd(model, X, y, X_val, y_val, lr=0.01, epochs=100):
	"""Train ``model`` with stochastic gradient descent, one sample per update.

	Each epoch visits every training row exactly once in a freshly shuffled
	order, then evaluates both splits with the resulting weights.

	Args:
		model: object exposing ``forward``, ``get_gradients``, ``loss``,
			``accuracy`` and a weight array ``w`` that is updated in place.
		X, y: training inputs and targets.
		X_val, y_val: held-out inputs and targets, only used for evaluation.
		lr: learning rate.
		epochs: number of passes over the training data.

	Returns:
		dict with per-epoch lists ``'loss'``, ``'acc'``, ``'val_loss'``,
		``'val_acc'``, ``'time'`` (seconds since the start of training) and
		``'weights'`` (flattened copy of ``model.w``).
	"""
	history = {'loss': [], 'acc': [], 'val_loss': [], 'val_acc': [], 'time': [], 'weights': []}
	started = time.time()
	n_samples = X.shape[0]

	for _ in range(epochs):
		for row in np.random.permutation(n_samples):
			# Slicing (rather than indexing) keeps the leading batch axis.
			x_one, y_one = X[row:row+1], y[row:row+1]
			pred_one = model.forward(x_one)
			grad = model.get_gradients(x_one, y_one, pred_one)
			model.w -= lr * grad

		fit_pred = model.forward(X)
		held_out_pred = model.forward(X_val)
		history['loss'].append(model.loss(y, fit_pred))
		history['acc'].append(model.accuracy(y, fit_pred))
		history['val_loss'].append(model.loss(y_val, held_out_pred))
		history['val_acc'].append(model.accuracy(y_val, held_out_pred))
		history['time'].append(time.time() - started)
		history['weights'].append(model.w.flatten().copy())
	return history

def train_adagrad(model, X, y, X_val, y_val, lr=0.01, epochs=100, eps=1e-8):
    """Train ``model`` with full-batch AdaGrad.

    The squared gradients are summed over all epochs and every step is
    divided element-wise by the square root of that sum (plus ``eps``).

    Args:
        model: object exposing ``forward``, ``get_gradients``, ``loss``,
            ``accuracy`` and a weight array ``w`` that is updated in place.
        X, y: training inputs and targets.
        X_val, y_val: held-out inputs and targets, only used for evaluation.
        lr: learning rate.
        epochs: number of full-batch updates.
        eps: small constant added to the denominator.

    Returns:
        dict with per-epoch lists ``'loss'``, ``'acc'``, ``'val_loss'``,
        ``'val_acc'``, ``'time'`` (seconds since the start of training) and
        ``'weights'`` (flattened copy of ``model.w``), all measured after
        the epoch's update.
    """
    history = {'loss': [], 'acc': [], 'val_loss': [], 'val_acc': [], 'time': [], 'weights': []}
    started = time.time()
    sq_grad_sum = np.zeros_like(model.w)

    for _ in range(epochs):
        batch_pred = model.forward(X)
        grad = model.get_gradients(X, y, batch_pred)

        sq_grad_sum += grad**2
        model.w -= lr * grad / (np.sqrt(sq_grad_sum) + eps)

        fit_pred = model.forward(X)
        held_out_pred = model.forward(X_val)
        history['loss'].append(model.loss(y, fit_pred))
        history['acc'].append(model.accuracy(y, fit_pred))
        history['val_loss'].append(model.loss(y_val, held_out_pred))
        history['val_acc'].append(model.accuracy(y_val, held_out_pred))
        history['time'].append(time.time() - started)
        history['weights'].append(model.w.flatten().copy())
    return history

def train_rmsprop(model, X, y, X_val, y_val, lr=0.001, epochs=100, decay_rate=0.9, eps=1e-8):
    """Train ``model`` with full-batch RMSProp.

    An exponential moving average of the squared gradients is maintained and
    every step is divided element-wise by its square root (plus ``eps``).

    Args:
        model: object exposing ``forward``, ``get_gradients``, ``loss``,
            ``accuracy`` and a weight array ``w`` that is updated in place.
        X, y: training inputs and targets.
        X_val, y_val: held-out inputs and targets, only used for evaluation.
        lr: learning rate.
        epochs: number of full-batch updates.
        decay_rate: weight of the previous average in the moving average.
        eps: small constant added to the denominator.

    Returns:
        dict with per-epoch lists ``'loss'``, ``'acc'``, ``'val_loss'``,
        ``'val_acc'``, ``'time'`` (seconds since the start of training) and
        ``'weights'`` (flattened copy of ``model.w``), all measured after
        the epoch's update.
    """
    history = {'loss': [], 'acc': [], 'val_loss': [], 'val_acc': [], 'time': [], 'weights': []}
    started = time.time()
    sq_grad_avg = np.zeros_like(model.w)

    for _ in range(epochs):
        batch_pred = model.forward(X)
        grad = model.get_gradients(X, y, batch_pred)

        sq_grad_avg = decay_rate * sq_grad_avg + (1 - decay_rate) * (grad**2)
        model.w -= lr * grad / (np.sqrt(sq_grad_avg) + eps)

        fit_pred = model.forward(X)
        held_out_pred = model.forward(X_val)
        history['loss'].append(model.loss(y, fit_pred))
        history['acc'].append(model.accuracy(y, fit_pred))
        history['val_loss'].append(model.loss(y_val, held_out_pred))
        history['val_acc'].append(model.accuracy(y_val, held_out_pred))
        history['time'].append(time.time() - started)
        history['weights'].append(model.w.flatten().copy())
    return history

def train_adam(model, X, y, X_val, y_val, lr=0.001, epochs=100, beta1=0.9, beta2=0.999, eps=1e-8):
	"""Train ``model`` with full-batch Adam.

	Keeps exponential moving averages of the gradient and of its square,
	applies the usual bias correction and updates ``model.w`` in place.

	Args:
		model: object exposing ``forward``, ``get_gradients``, ``loss``,
			``accuracy`` and a weight array ``w`` that is updated in place.
		X, y: training inputs and targets.
		X_val, y_val: held-out inputs and targets, only used for evaluation.
		lr: learning rate.
		epochs: number of full-batch updates.
		beta1, beta2: decay rates of the first / second moment averages.
		eps: small constant added to the denominator.

	Returns:
		dict with per-epoch lists ``'loss'``, ``'acc'``, ``'val_loss'``,
		``'val_acc'``, ``'time'`` (seconds since the start of training) and
		``'weights'`` (flattened copy of ``model.w``), all measured after
		the epoch's update.
	"""
	history = {'loss': [], 'acc': [], 'val_loss': [], 'val_acc': [], 'time': [], 'weights': []}
	started = time.time()
	first_moment = np.zeros_like(model.w)
	second_moment = np.zeros_like(model.w)

	# The step number starts at 1 so the bias-correction denominators are non-zero.
	for step in range(1, epochs + 1):
		batch_pred = model.forward(X)
		grad = model.get_gradients(X, y, batch_pred)

		first_moment = beta1*first_moment + (1-beta1)*grad
		second_moment = beta2*second_moment + (1-beta2)*(grad**2)
		first_hat = first_moment / (1-beta1**step)
		second_hat = second_moment / (1-beta2**step)
		model.w -= lr * first_hat / (np.sqrt(second_hat) + eps)

		fit_pred = model.forward(X)
		held_out_pred = model.forward(X_val)
		history['loss'].append(model.loss(y, fit_pred))
		history['acc'].append(model.accuracy(y, fit_pred))
		history['val_loss'].append(model.loss(y_val, held_out_pred))
		history['val_acc'].append(model.accuracy(y_val, held_out_pred))
		history['time'].append(time.time() - started)
		history['weights'].append(model.w.flatten().copy())
	return history


In [6]:

def find_embedding_columns(df):
	"""Locate the question, good-answer and bad-answer embedding columns of ``df``.

	A column qualifies for a role when its lower-cased name contains ``'emb'``
	and one of the role's keywords (``'soru'``; ``'iyi'``/``'good'``;
	``'kotu'``/``'bad'``). If several columns qualify, the last one in column
	order wins. When nothing qualifies, the canonical column name for that
	role is used if it exists.

	Returns:
		tuple ``(q_col, good_col, bad_col)``; an entry is ``None`` when no
		column was found for that role.
	"""
	# (keywords identifying the role, canonical fallback column name)
	roles = (
		(('soru',), 'Soru_Embedding'),
		(('iyi', 'good'), 'Iyi_Cevap_Embedding'),
		(('kotu', 'bad'), 'Kotu_Cevap_Embedding'),
	)
	found = []
	for keywords, fallback in roles:
		match = None
		for col in df.columns:
			lowered = col.lower()
			if 'emb' in lowered and any(word in lowered for word in keywords):
				# Keep scanning: a later matching column overrides this one.
				match = col
		if match is None and fallback in df.columns:
			match = fallback
		found.append(match)
	return tuple(found)


In [7]:

def run_pipeline_for_pickle(pkl_path, epochs_main=EPOCHS_MAIN):
	"""Run every experiment for one embedding pickle and save the figures.

	Steps, in order:
	  1. Load the DataFrame at ``pkl_path``. Each row contributes two samples:
	     concat(question, good answer) labelled +1 and concat(question, bad
	     answer) labelled -1. A constant column of ones is appended and the
	     data is split 50/50 into train and test.
	  2. Train ``actualModel`` with GD, SGD, Adam, AdaGrad and RMSProp from a
	     shared random initialization, ``NUM_SEEDS`` times, and plot the
	     metrics averaged over those runs.
	  3. Embed every recorded weight vector in 2-D with t-SNE and plot the
	     optimization trajectories ('x' = first point, 'o' = last point).
	  4. Train a 32-unit MLP on growing fractions (``SIZES``) of the training set.
	  5. Compare a 64-unit MLP with the single-layer model, both using Adam.
	  6. Repeat the optimizer comparison with the 64-unit MLP (3 runs).

	All figures are written to ``IMGS_DIR`` and prefixed with the pickle's
	file name without extension. The global NumPy RNG is used without being
	seeded here, so results vary between executions unless the caller seeds it.

	Args:
		pkl_path: path of a pickled DataFrame holding the embedding columns.
		epochs_main: epochs used for both optimizer comparisons.

	Returns:
		dict with ``'tag'``, ``'results_normal'`` and ``'results_mlp'`` (lists
		of history dicts per optimizer name) and ``'avg_val_acc_normal'``
		(per-epoch test accuracy averaged over runs, per optimizer name).

	Raises:
		ValueError: if one of the three embedding columns cannot be found.
	"""
	tag = os.path.splitext(os.path.basename(pkl_path))[0]
	# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
	df = pd.read_pickle(pkl_path)
	q_col, good_col, bad_col = find_embedding_columns(df)
	if q_col is None or good_col is None or bad_col is None:
		# Fail early with a readable message instead of an opaque lookup
		# error inside the row loop below.
		raise ValueError(
			f"Could not find question/good/bad embedding columns in {pkl_path!r}; "
			f"available columns: {list(df.columns)}"
		)
	# savefig does not create missing directories, so make sure it exists.
	os.makedirs(IMGS_DIR, exist_ok=True)

	# ---- dataset -----------------------------------------------------------
	X_list = []
	y_list = []
	for _, row in df.iterrows():
		q_emb = np.array(row[q_col])
		X_list.append(np.concatenate([q_emb, np.array(row[good_col])]))
		y_list.append(1.0)
		X_list.append(np.concatenate([q_emb, np.array(row[bad_col])]))
		y_list.append(-1.0)

	X = np.array(X_list)
	y = np.array(y_list).reshape(-1, 1)
	# Constant feature so the last weight acts as a bias term.
	X = np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)

	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
	input_dim = X_train.shape[1]

	algos = ['GD', 'SGD', 'Adam', 'AdaGrad', 'RMSProp']
	colors = {'GD': 'blue', 'SGD': 'orange', 'Adam': 'green', 'AdaGrad': 'red', 'RMSProp': 'purple'}

	def get_avg_history(algo_results, key):
		# Average one metric over all runs of an optimizer, epoch by epoch.
		return np.mean(np.array([r[key] for r in algo_results]), axis=0)

	def save_metric_grid(results_by_algo, panels, label_fmt, file_name):
		# Draw a 2x2 grid (one panel per metric, one curve per optimizer) and save it.
		fig, axes = plt.subplots(2, 2, figsize=(15, 12))
		for ax, (metric, title) in zip(axes.flat, panels):
			for algo in algos:
				ax.plot(get_avg_history(results_by_algo[algo], metric),
						label=label_fmt.format(algo), color=colors[algo])
			ax.set_title(title); ax.grid(); ax.legend()
		plt.tight_layout()
		plt.savefig(os.path.join(IMGS_DIR, file_name))
		plt.close(fig)

	# ---- single-layer model: optimizer comparison ---------------------------
	trainers = [
		('GD', train_gd, LEARNING_RATE),
		('SGD', train_sgd, LEARNING_RATE*0.1),
		('Adam', train_adam, LEARNING_RATE),
		('AdaGrad', train_adagrad, LEARNING_RATE*0.5),
		('RMSProp', train_rmsprop, LEARNING_RATE*0.01),
	]
	results = {algo: [] for algo in algos}
	weight_trajectories = []

	for seed_idx in range(NUM_SEEDS):
		# Every optimizer starts from the same weights within one run.
		initial_w = np.random.randn(input_dim, 1) * 0.05
		for algo, train_fn, lr in trainers:
			model = actualModel(input_dim, initial_w)
			hist = train_fn(model, X_train, y_train, X_test, y_test, lr=lr, epochs=epochs_main)
			results[algo].append(hist)
			weight_trajectories.append((algo, seed_idx, hist['weights']))

	save_metric_grid(
		results,
		[("loss", "Average Training Loss"), ("acc", "Average Training Accuracy"),
			("val_loss", "Average Test Loss"), ("val_acc", "Average Test Accuracy")],
		"{}",
		f"{tag}_optimization_metrics_combined.png",
	)

	# ---- t-SNE view of the weight trajectories ------------------------------
	flat_weights = np.array([w for _, _, w_list in weight_trajectories for w in w_list])
	perplexity = min(30, flat_weights.shape[0] // 10)
	perplexity = max(5, perplexity)
	tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
	W2D = tsne.fit_transform(flat_weights)

	plt.figure(figsize=(12,10))
	points_per_run = len(weight_trajectories[0][2])
	labelled = set()
	for idx, (algo, seed, _) in enumerate(weight_trajectories):
		start = idx * points_per_run
		end = start + points_per_run
		pts = W2D[start:end]
		# Label only the first trajectory of each optimizer. Passing the bare
		# name list to plt.legend would attach the names to the first five
		# artists drawn (lines and markers of GD/SGD), mislabelling the colours.
		line_label = algo if algo not in labelled else '_nolegend_'
		labelled.add(algo)
		plt.plot(pts[:,0], pts[:,1], color=colors[algo], alpha=0.5, label=line_label)
		plt.scatter(pts[0,0], pts[0,1], color=colors[algo], marker='x', s=50)
		plt.scatter(pts[-1,0], pts[-1,1], color=colors[algo], marker='o', s=50)
	plt.title("t-SNE Optimization Trajectories")
	plt.grid(); plt.legend()
	out_path = os.path.join(IMGS_DIR, f"{tag}_optimization_tsne_trajectories.png")
	plt.savefig(out_path)
	plt.close()

	# ---- dataset size vs. test accuracy -------------------------------------
	size_results = []
	sizes_used = []
	for s in SIZES:
		subset = int(len(X_train) * s)
		if subset < 10: continue
		idx = np.random.permutation(len(X_train))
		X_sub = X_train[idx[:subset]]
		y_sub = y_train[idx[:subset]]
		mlp = TwoLayerMLP(input_dim, hidden_dim=32)
		opt = AdamOpt(lr=0.001)
		hist = train_generic(mlp, opt, X_sub, y_sub, X_test, y_test, epochs=EPOCHS_MLP_SMALL)
		size_results.append(hist['val_acc'][-1])
		# Remember which fraction produced this point so that skipped sizes
		# cannot shift the x axis.
		sizes_used.append(s)
	plt.figure(figsize=(8,5))
	plt.plot([s*100 for s in sizes_used], size_results, marker='o')
	plt.title("Dataset Size vs Test Accuracy")
	plt.xlabel("Dataset Percentage")
	plt.ylabel("Accuracy")
	plt.grid()
	out_path = os.path.join(IMGS_DIR, f"{tag}_exp_dataset_size.png")
	plt.savefig(out_path)
	plt.close()

	# ---- MLP vs. single-layer model -----------------------------------------
	mlp_big = TwoLayerMLP(input_dim, hidden_dim=64)
	perc = actualModel(input_dim)
	hist_mlp = train_generic(mlp_big, AdamOpt(lr=0.001), X_train, y_train, X_test, y_test, epochs=EPOCHS_MLP_COMPARE)
	hist_perc = train_adam(perc, X_train, y_train, X_test, y_test, lr=0.001, epochs=EPOCHS_MLP_COMPARE)
	plt.figure(figsize=(8,5))
	plt.plot(hist_mlp['val_acc'], label="MLP", color="red")
	plt.plot(hist_perc['val_acc'], label="Normal Model", color="blue")
	plt.title("MLP vs Normal Model Test Accuracy")
	plt.grid()
	plt.legend()
	out_path = os.path.join(IMGS_DIR, f"{tag}_exp_mlp_vs_normal.png")
	plt.savefig(out_path)
	plt.close()

	# ---- MLP: optimizer comparison ------------------------------------------
	# (name, optimizer factory, batch size); batch_size=None means full batch.
	mlp_setups = [
		('GD', lambda: SGDOpt(lr=LEARNING_RATE), None),
		('SGD', lambda: SGDOpt(lr=LEARNING_RATE*0.1), 1),
		('Adam', lambda: AdamOpt(lr=0.001), 32),
		('AdaGrad', lambda: AdaGradOpt(lr=LEARNING_RATE*0.5), 32),
		('RMSProp', lambda: RMSPropOpt(lr=LEARNING_RATE*0.01), 32),
	]
	results_mlp = {algo: [] for algo in algos}
	num_seeds_comp = 3
	for _ in range(num_seeds_comp):
		for algo, make_opt, batch_size in mlp_setups:
			# A fresh model and a fresh optimizer state for every run.
			mlp = TwoLayerMLP(input_dim, hidden_dim=64)
			hist = train_generic(mlp, make_opt(), X_train, y_train, X_test, y_test,
								epochs=epochs_main, batch_size=batch_size)
			results_mlp[algo].append(hist)

	save_metric_grid(
		results_mlp,
		[("loss", "Avg Training Loss"), ("acc", "Avg Training Accuracy"),
			("val_loss", "Avg Test Loss"), ("val_acc", "Avg Test Accuracy")],
		"MLP-{}",
		f"{tag}_mlp_metrics_combined.png",
	)

	return {
		"tag": tag,
		"results_normal": results,
		"results_mlp": results_mlp,
		"avg_val_acc_normal": {a: get_avg_history(results[a], 'val_acc') for a in algos}
	}


In [8]:

# Run the full pipeline once per embedding pickle and keep each summary dict.
all_runs = []
# The pipeline saves its figures into IMGS_DIR, so create it up front.
if not os.path.exists(IMGS_DIR): os.makedirs(IMGS_DIR)
for p in EMBEDDING_PICKLES:
	run_summary = run_pipeline_for_pickle(p, epochs_main=EPOCHS_MAIN)
	all_runs.append(run_summary)
# Cross-model comparison figures only make sense with at least two embedding models.
if len(all_runs) >= 2:
	algos = ['GD', 'SGD', 'Adam', 'AdaGrad', 'RMSProp']
	# Figure 1: one panel per optimizer, one test-accuracy curve per embedding model.
	plt.figure(figsize=(15, 4))
	for i, algo in enumerate(algos):
		plt.subplot(1, len(algos), i+1)
		for run in all_runs:
			avg = run['avg_val_acc_normal'][algo]
			label = run['tag']
			plt.plot(avg, label=label)
		plt.title(f"{algo}: Normal Test Accuracy by Embedding Model")
		plt.xlabel("Epoch")
		plt.ylabel("Accuracy")
		plt.grid()
		plt.legend()
	plt.tight_layout()
	plt.savefig(os.path.join(IMGS_DIR, "crossmodel_normal_valacc_comparison.png"))
	plt.close()

	# Figure 2: grouped bars of the last-epoch test accuracy, one group per
	# optimizer and one bar per embedding model.
	plt.figure(figsize=(15, 5))
	width = 0.15
	x = np.arange(len(algos))
	for i, run in enumerate(all_runs):
		finals = [ run['avg_val_acc_normal'][a][-1] for a in algos ]
		# Shift each model's bars by one bar width so they sit side by side.
		plt.bar(x + i*width, finals, width=width, label=run['tag'])
	# Put each tick under the middle of its group of bars.
	plt.xticks(x + width*(len(all_runs)-1)/2, algos)
	plt.ylabel("Final Test Accuracy")
	plt.title("Final Normal Test Accuracy by Optimizer and Embedding Model")
	plt.legend()
	plt.grid(axis='y')
	plt.tight_layout()
	plt.savefig(os.path.join(IMGS_DIR, "crossmodel_normal_final_acc_bars.png"))
	plt.close()