In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Download data from kaggle

In [5]:
! pip install -q kaggle
from google.colab import files
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download geraygench/mountain-ner-dataset
! unzip mountain-ner-dataset.zip

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading mountain-ner-dataset.zip to /content
  0% 0.00/56.9k [00:00<?, ?B/s]
100% 56.9k/56.9k [00:00<00:00, 96.7MB/s]
Archive:  mountain-ner-dataset.zip
  inflating: mountain_dataset_with_markup.csv  


# Preprocesing

In [6]:
df = pd.read_csv('mountain_dataset_with_markup.csv')

In [7]:
df.head()

Unnamed: 0,text,marker
0,A visit to a science museum for hands-on learn...,[]
1,Voice surface coach set democratic time year. ...,[]
2,Parent according maybe activity activity finis...,[]
3,A visit to a sculpture garden with intriguing ...,[]
4,The Julian Alps in Slovenia offer pristine lak...,"[(11, 15)]"


## Make the dependent variable binary

text: "{word} token_of_sentence {context}"

label: is_word_in_markers


In [28]:
df_list = []
for index, row in df.iterrows():
    text = row['text']
    markers = eval(row['marker'])
    text2 = text
    tokens = text.split()
    punctuation = [',', '.',  '!', '?', ';', ':', '(', ')', '[', ']', '{', '}']
    stop_words = ['and', 'or', 'a', 'the', 'in']
    tokens = [t.replace(p, '') for t in tokens if t not in stop_words for p in punctuation]
    for i, token in enumerate(tokens, start=1):
        data = f"{token} token_of_sentence {text}"
        ind = text2.find(token)
        is_token_in_markers = False
        if len(markers):
          is_token_in_markers = any(start <= ind <= end for start, end in markers)
          text2.replace(token, "#"*len(token), 1)
        df_list.append({"text": data, "token":token, "text_id":index, "label": 1 if is_token_in_markers else 0})

new_df = pd.DataFrame(df_list, columns=["text","token", "text_id", "label"])

## Fight against data imbalance

Now we have many similar items and very unbalanced data. So I decided to remove most of it

In [29]:
len(new_df[new_df["label"] == 1]) / len(new_df)

0.019067328918322295

In [30]:
label_0_rows = new_df[new_df['label'] == 0]

percentage_to_remove = 0.98
rows_to_remove = int(len(label_0_rows) * percentage_to_remove)

random_rows_to_remove = label_0_rows.sample(n=rows_to_remove, random_state=42)
new_df = new_df.drop(random_rows_to_remove.index)

In [31]:
len(new_df[new_df["label"] == 1]) / len(new_df)

0.492867332382311

# Train-test split

In [32]:
train_ids, test_ids = train_test_split(new_df["text_id"].unique(), test_size=0.3)

In [33]:
train = new_df[new_df['text_id'].isin(train_ids)]
test = new_df[new_df['text_id'].isin(test_ids)]
print(f"train {len(train)}\ntest {len(test)}")

train 6989
test 2825


In [37]:
train[train["label"]==1]

Unnamed: 0,text,token,text_id,label
624,Alps token_of_sentence The Julian Alps in Slov...,Alps,4,1
625,Alps token_of_sentence The Julian Alps in Slov...,Alps,4,1
626,Alps token_of_sentence The Julian Alps in Slov...,Alps,4,1
627,Alps token_of_sentence The Julian Alps in Slov...,Alps,4,1
628,Alps token_of_sentence The Julian Alps in Slov...,Alps,4,1
...,...,...,...,...
253183,Mountains token_of_sentence Witnessing the mes...,Mountains,1580,1
253184,Mountains token_of_sentence Witnessing the mes...,Mountains,1580,1
253185,Mountains token_of_sentence Witnessing the mes...,Mountains,1580,1
253186,Mountains token_of_sentence Witnessing the mes...,Mountains,1580,1


In [14]:
train.to_csv('/content/drive/MyDrive/Colab Notebooks/task1/train.csv', index=False)
test.to_csv('/content/drive/MyDrive/Colab Notebooks/task1/test.csv', index=False)