# Load data

In [None]:
import pandas as pd

In [None]:
from pathlib import Path

def load_data(path: str) -> pd.DataFrame:
    """Read the CSV file at *path* and return its contents as a DataFrame."""
    csv_file = Path(path)
    return pd.read_csv(csv_file)

In [None]:
data = load_data("../data/data.csv")

In [None]:
data.info()

# Preprocessing

In [None]:
from warnings import filterwarnings
filterwarnings("ignore")

In [None]:
from sentence_transformers import SentenceTransformer

# intfloat/e5-small-v2 or intfloat/e5-base-v2 or intfloat/e5-large-v2
model = SentenceTransformer("intfloat/e5-small-v2")

In [None]:
# Hand `encode` a plain list of strings rather than a pandas Series: integer
# lookups on a Series are label-based, so a Series only behaves like a
# sequence while its index is exactly 0..n-1 (it stops working after any row
# filtering).
# E5 checkpoints are trained with an input prefix; the model card prescribes
# "query: " when the embeddings are used as features for classification.
texts = ["query: " + text for text in data['text'].tolist()]
embeddings = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)

In [None]:
nb_comp = len(embeddings[0])
nb_data = len(embeddings)
print(f"embedding size : {nb_comp}")

In [None]:
# Attach one feature column per embedding dimension (x_0 ... x_{nb_comp-1}).
# The whole matrix is wrapped in a DataFrame and concatenated in a single
# step instead of inserting columns one by one from Python lists, which is
# slow and fragments the frame.
embedding_columns = pd.DataFrame(
    embeddings,
    columns=[f"x_{num_comp}" for num_comp in range(nb_comp)],
    index=data.index,  # keep rows aligned with `data`
)
# Drop any x_* columns left by a previous run of this cell so that re-running
# it overwrites them (as the column-by-column assignment did) rather than
# creating duplicates.
data = pd.concat(
    [data.drop(columns=embedding_columns.columns, errors="ignore"), embedding_columns],
    axis=1,
)

In [None]:
data.head(10)

In [None]:
data_keywords = data['keyword']

In [None]:
data_keywords.size

In [None]:
data_keywords.value_counts()

In [None]:
# One-hot encode the `keyword` column, dropping the first category.
columns = ['keyword']
# `dtype=int` makes get_dummies emit 0/1 integer indicator columns directly,
# so no frame-wide `replace({False: 0, True: 1})` pass over every column
# (text and embedding columns included) is needed afterwards.
data_dummy = pd.get_dummies(data, columns=columns, drop_first=True, dtype=int)

In [None]:
data_dummy.head(10)

In [None]:
data_dummy.info()

In [None]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="my_app")
data_dummy_wNan = data_dummy.dropna()

def get_coordinates(location):
    """Geocode *location* with the module-level ``geolocator``.

    Args:
        location: Query passed to ``geolocator.geocode``.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``None`` when the geocoder
        finds no match or the lookup fails with a geopy error.
    """
    # Local import so this cell stays self-contained.
    from geopy.exc import GeopyError

    try:
        match = geolocator.geocode(location)
    except GeopyError:
        # Best-effort lookup: service errors (timeouts, rate limiting, ...)
        # yield None. Unlike a bare `except:`, this lets KeyboardInterrupt
        # through so a long geocoding loop can still be stopped.
        return None

    # `geocode` returns None when nothing matches the query.
    if match is None:
        return None
    return match.latitude, match.longitude

data_tmp = data_dummy_wNan.head(10)
for location in data_tmp['location']:
    print(location, get_coordinates(location))
# data_tmp['coordinates'] = data1['location'].apply(get_coordinates)


In [None]:
# data_dummy_wNan.apply(lambda x: get_coordinates(x['location']), axis=1)

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# data_dummy_wNan = data_dummy.dropna()
# corr = data_dummy_wNan.corr()
# sns.heatmap(corr, annot=False, cmap='coolwarm', linewidths=0.5)


# Training

## K-NN

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# FIXME : use location
# Feature matrix: every column of `data_dummy` except the raw text, the
# `target` column and the `location` column.
# NOTE(review): `input` shadows the Python builtin of the same name; renaming
# it also requires updating the later cells that read it.
input = data_dummy.drop(columns=['text', 'target', 'location'])
# Values to predict: the `target` column.
output = data_dummy['target']

In [None]:
input.head(5)

In [None]:
input_train, input_test, output_train, output_test = train_test_split(input, output, test_size=0.2, random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
clf = KNeighborsClassifier(n_neighbors=5)

In [None]:
clf.fit(input_train, output_train)
print(f"train accuracy : {clf.score(input_train, output_train)}")
print(f"test accuracy : {clf.score(input_test, output_test)}")

In [None]:
# Candidate neighbourhood sizes (5, 10, ..., 95), keyed by the estimator
# parameter name.
grid_neighborns = {'n_neighbors': list(range(5, 100, 5))}


In [None]:
from tqdm import tqdm

# Fit one K-NN classifier per candidate `n_neighbors` and record its accuracy
# on the training and test splits.
training_accuracy = []
test_accuracy = []

# `grid_neighborns` is a dict keyed by parameter name; iterating the dict
# itself yields the key 'n_neighbors' instead of the candidate values, so
# loop over the list stored under that key.
for n_neighbors in tqdm(grid_neighborns['n_neighbors']):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(input_train, output_train)
    training_accuracy.append(clf.score(input_train, output_train))
    test_accuracy.append(clf.score(input_test, output_test))

In [None]:
import matplotlib.pyplot as plt

# Plot accuracy against the candidate `n_neighbors` values. The x-axis data
# must be the list of candidates, not the `grid_neighborns` dict that holds it.
neighbor_values = grid_neighborns['n_neighbors']
plt.plot(neighbor_values, training_accuracy, label="Training accuracy")
plt.plot(neighbor_values, test_accuracy, label="Test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [None]:
grid_search = GridSearchCV(KNeighborsClassifier(), grid_neighborns, cv=10, n_jobs=-1)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
grid_search.fit(input_train, output_train)

In [None]:
print("Test set score: {:.2f}".format(grid_search.score(input_test, output_test)))

In [None]:
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))