In [1]:
import pandas as pd

# Data

In [2]:
def read_bioes_file(file_path):
    sentences, labels = [], []
    with open(file_path, 'r') as file:
        sentence, label = [], []
        for line in file:
            if line.strip():
                word, tag = line.strip().split()
                sentence.append(word)
                label.append(tag)
            else:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
    return sentences, labels

train_sentences, train_labels = read_bioes_file('/kaggle/input/zindi-learn-location-mention-recognition-challenge/merged_dev_bioes.txt')
dev_sentences, dev_labels = read_bioes_file('/kaggle/input/zindi-learn-location-mention-recognition-challenge/merged_train_bioes.txt')

In [3]:
print("Train sentences: ", len(train_sentences))
print("Train labels: ", len(train_labels))
print("Dev sentences: ", len(dev_sentences))
print("Dev labels: ", len(dev_labels))
print()

print("Train sentences: \n", train_sentences[0])
print("Train labels: \n", train_labels[0])
print("Dev sentences: \n", dev_sentences[0])
print("Dev labels: \n", dev_labels[0])

Train sentences:  2056
Train labels:  2056
Dev sentences:  14392
Dev labels:  14392

Train sentences: 
 ['If', 'you', 're', 'looking', 'for', 'legitimate', 'relief', 'organizations', 'to', 'help', 'those', 'affected', 'by', 'the', 'CA', 'fires', ',', 'I', 'found', 'this', 'link', ':', 'How', 'to', 'Help', 'Those', 'Affected', 'by', 'California', 'Wildfires', '-', 'Consumer', 'Reports']
Train labels: 
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'S-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'S-LOC', 'O', 'O', 'O', 'O']
Dev sentences: 
 ['Please', 'read', 'below', '!', '!', 'Another', 'devastating', 'fire', 'has', 'hit', 'Northern', 'California', ',', 'people', 'need', 'help', ',', 'whatever', 'you', 'can', 'give', ',', 'or', 'anyway', 'you', 'can', 'help', ',', 'please', 'doὤF', '!', '!']
Dev labels: 
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'E-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

# Set up the label mapping

In [4]:
# Set up label mapping
all_labels = set()

for labels in train_labels + dev_labels:
    all_labels.update(labels)

label_list = sorted(list(all_labels))

label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

print(label2id)
print(id2label)

{'B-LOC': 0, 'E-LOC': 1, 'I-LOC': 2, 'O': 3, 'S-LOC': 4}
{0: 'B-LOC', 1: 'E-LOC', 2: 'I-LOC', 3: 'O', 4: 'S-LOC'}


In [5]:
# Convert train_labels to IDs
train_labels_ids = [[label2id[label] for label in sentence_labels] for sentence_labels in train_labels]
dev_labels_ids = [[label2id[label] for label in sentence_labels] for sentence_labels in dev_labels]

# Example usage
print("Original first sentence labels:", train_labels[0])
print("Converted first sentence label IDs:", train_labels_ids[0])

print("Valid length: ", len(train_labels) == len(train_labels_ids))

Original first sentence labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'S-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'S-LOC', 'O', 'O', 'O', 'O']
Converted first sentence label IDs: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3]
Valid length:  True


### Preprocess sentence and label from BIOES to index

In [6]:
import re
import unicodedata

def preprocess_sentence_and_labels(sentence, labels):
    processed_sentence = []
    processed_labels = []

    for word, label in zip(sentence, labels):
        # Remove words with special characters or numbers
        if not re.match(r'^[a-zA-Z]+$', word):
            continue

        # If the word is not empty after processing, keep it and its label
        if word:
            processed_sentence.append(word)
            processed_labels.append(label)

    return processed_sentence, processed_labels

# Process the training data
processed_train_sentences = []
processed_train_labels_ids = []

# Process the training data
processed_dev_sentences = []
processed_dev_labels_ids = []

for sentence, labels in zip(train_sentences, train_labels_ids):
    proc_sentence, proc_labels = preprocess_sentence_and_labels(sentence, labels)
    processed_train_sentences.append(proc_sentence)
    processed_train_labels_ids.append(proc_labels)

for sentence, labels in zip(dev_sentences, dev_labels_ids):
    proc_sentence, proc_labels = preprocess_sentence_and_labels(sentence, labels)
    processed_dev_sentences.append(proc_sentence)
    processed_dev_labels_ids.append(proc_labels)

In [7]:
# Print an example to compare
print("Original sentence:", train_sentences[2])
print("Original labels:", train_labels_ids[2])
print("\nProcessed sentence:", processed_train_sentences[2])
print(len(processed_train_sentences[2]))
print("Processed labels:", processed_train_labels_ids[2])
print(len(processed_train_labels_ids[2]))

# Print some statistics
original_word_count = sum(len(sentence) for sentence in train_sentences)
processed_word_count = sum(len(sentence) for sentence in processed_train_sentences)
print(f"\nOriginal word count: {original_word_count}")
print(f"Processed word count: {processed_word_count}")
print(f"Removed {original_word_count - processed_word_count} words")

Original sentence: ['Officials', 'say', '563', 'people', 'are', 'still', 'unaccounted', 'for', 'and', 'at', 'least', '84', 'people', 'have', 'died', '.']
Original labels: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

Processed sentence: ['Officials', 'say', 'people', 'are', 'still', 'unaccounted', 'for', 'and', 'at', 'least', 'people', 'have', 'died']
13
Processed labels: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
13

Original word count: 52038
Processed word count: 40840
Removed 11198 words


In [8]:
from typing import List, Tuple
from transformers import AutoTokenizer

CT_M3_Complete_tokenizer = AutoTokenizer.from_pretrained("crisistransformers/CT-M3-Complete")

def tokenize_and_adjust_labels(sentence: List[str], labels: List[int], tokenizer) -> Tuple[List[int], List[int]]:
    tokenized_input = tokenizer(sentence, is_split_into_words=True)
    tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

    updated_labels = []
    current_label_idx = 0

    for token in tokens:
        if token in ['<s>', '</s>', '<unk>']:
            updated_labels.append(-100)
        elif token.endswith('@@'):  # Handle word pieces ending with @@
            updated_labels.append(labels[current_label_idx])
        else:
            updated_labels.append(labels[current_label_idx])
            current_label_idx += 1

    return tokenized_input["input_ids"], updated_labels

# Apply the function to all sentences and labels
tokenized_train_inputs = []
adjusted_train_labels = []

tokenized_dev_inputs = []
adjusted_dev_labels = []

for sentence, labels in zip(processed_train_sentences, processed_train_labels_ids):
    input_ids, adjusted_labels = tokenize_and_adjust_labels(sentence, labels, CT_M3_Complete_tokenizer)
    tokenized_train_inputs.append(input_ids)
    adjusted_train_labels.append(adjusted_labels)

for sentence, labels in zip(processed_dev_sentences, processed_dev_labels_ids):
    input_ids, adjusted_labels = tokenize_and_adjust_labels(sentence, labels, CT_M3_Complete_tokenizer)
    tokenized_dev_inputs.append(input_ids)
    adjusted_dev_labels.append(adjusted_labels)

config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]



In [9]:
# Print an example to verify
print("Original sentence:", processed_train_sentences[2])
print("Original labels:", processed_train_labels_ids[2])
print("\nTokenized input:", tokenized_train_inputs[2])
print("Adjusted labels:", adjusted_train_labels[2])

# Verify lengths
print("\nLength of tokenized input:", len(tokenized_train_inputs[2]))
print("Length of adjusted labels:", len(adjusted_train_labels[2]))

# Print some statistics
original_sentence_count = len(input_ids)
tokenized_sentence_count = len(tokenized_train_inputs)
print(f"\nNumber of original sentences: {original_sentence_count}")
print(f"Number of tokenized sentences: {tokenized_sentence_count}")

average_original_length = sum(len(s) for s in processed_train_sentences) / original_sentence_count
average_tokenized_length = sum(len(s) for s in tokenized_train_inputs) / tokenized_sentence_count
print(f"\nAverage original sentence length: {average_original_length:.2f}")
print(f"Average tokenized sentence length: {average_tokenized_length:.2f}")

Original sentence: ['Officials', 'say', 'people', 'are', 'still', 'unaccounted', 'for', 'and', 'at', 'least', 'people', 'have', 'died']
Original labels: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

Tokenized input: [0, 18164, 140, 83, 41, 135, 439, 47105, 19, 13, 35, 538, 83, 36, 1318, 2]
Adjusted labels: [-100, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -100]

Length of tokenized input: 16
Length of adjusted labels: 16

Number of original sentences: 23
Number of tokenized sentences: 2056

Average original sentence length: 1775.65
Average tokenized sentence length: 25.12


### Load the dataset

In [10]:
from datasets import Dataset

# Convert to datasets
tokenized_train = Dataset.from_dict({
    "input_ids": tokenized_train_inputs,
    "labels": adjusted_train_labels
})
tokenized_dev = Dataset.from_dict({
    "input_ids": tokenized_dev_inputs,
    "labels": adjusted_dev_labels
})

# Set up label mapping
all_labels = set()

for labels in train_labels + dev_labels:
    all_labels.update(labels)

label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

labels = sorted(list(all_labels))

label_list = sorted(list(all_labels))

# Model

### Model configuration

In [11]:
from transformers import AutoConfig, AutoModel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoModelForMaskedLM

model_name = "crisistransformers/CT-M3-Complete"

# Update model configuration
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(label_list)
config.id2label = id2label
config.label2id = label2id

CT_M3_Complete_model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)
CT_M3_Complete_tokenizer = AutoTokenizer.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at crisistransformers/CT-M3-Complete and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = precision_recall_fscore_support(sum(true_labels, []), sum(true_predictions, []), average='weighted')
    return {
        "precision": results[0],
        "recall": results[1],
        "f1": results[2],
    }

### Training

In [13]:
import numpy as np
import pandas as pd

import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set up model and tokenizer
model_name = "crisistransformers/CT-M3-Complete"
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(label_list)
config.id2label = id2label
config.label2id = label2id

Using device: cuda


In [14]:
# Set up data collator
data_collator = DataCollatorForTokenClassification(tokenizer=CT_M3_Complete_tokenizer)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="content/drive/MyDrive/CrisisTransformers",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Accumulate gradients
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    optim="adamw_torch",  # Use PyTorch's AdamW implementation
    logging_steps=100,  # Reduce logging frequency
    save_total_limit=2,  # Keep only the last 2 checkpoints
    report_to='none',  # Disable logging to wandb
)

# Set up trainer
trainer = Trainer(
    model=CT_M3_Complete_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=CT_M3_Complete_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [15]:
# Start training
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
0,No log,0.182221,0.944822,0.954226,0.947915
2,0.345600,0.114172,0.973293,0.973112,0.973009
4,0.082500,0.097796,0.975383,0.975086,0.975069
6,0.066400,0.097087,0.975819,0.975969,0.975773
8,0.056400,0.098451,0.975894,0.976066,0.975878
9,0.050400,0.097979,0.975884,0.976012,0.975837


  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device),

TrainOutput(global_step=640, training_loss=0.11569562945514918, metrics={'train_runtime': 1223.61, 'train_samples_per_second': 16.803, 'train_steps_per_second': 0.523, 'total_flos': 521337914313840.0, 'train_loss': 0.11569562945514918, 'epoch': 9.922480620155039})

# Evaluate the model

In [16]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 0.09797942638397217, 'eval_precision': 0.975884383318524, 'eval_recall': 0.9760115389816247, 'eval_f1': 0.9758373210371024, 'eval_runtime': 94.0334, 'eval_samples_per_second': 153.052, 'eval_steps_per_second': 9.571, 'epoch': 9.922480620155039}


  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
import json
import pandas as pd
import torch
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification

# After training
output_dir = "/kaggle/working/results"

# Save the model
trainer.save_model(output_dir)

# Save the tokenizer
CT_M3_Complete_tokenizer.save_pretrained(output_dir)

# Save training arguments
with open(f"{output_dir}/training_args.json", 'w') as f:
    json.dump(training_args.to_dict(), f)

# Save label mappings
with open(f"{output_dir}/label_mappings.json", 'w') as f:
    json.dump({"label2id": label2id, "id2label": id2label}, f)

print(f"Model and associated files saved to {output_dir}")

Model and associated files saved to /kaggle/working/results


In [18]:
# Load the model and tokenizer
model = AutoModelForTokenClassification.from_pretrained("/kaggle/working/results")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/results")

# Load label mappings
with open("/kaggle/working/results/label_mappings.json", 'r') as f:
    label_mappings = json.load(f)

id2label = label_mappings["id2label"]
print(id2label)

{'0': 'B-LOC', '1': 'E-LOC', '2': 'I-LOC', '3': 'O', '4': 'S-LOC'}


# Submission

In [19]:
def merge_subwords_and_locations(tokens_and_labels):
    merged_words = []
    merged_labels = []
    current_word = []
    current_labels = []
    location_buffer = []

    for token, label in tokens_and_labels:
        if token.endswith('@@'):
            current_word.append(token[:-2])  # Remove '@@'
            current_labels.append(label)
        else:
            current_word.append(token)
            current_labels.append(label)

            # Merge subwords
            merged_word = ''.join(current_word)

            # Voting for the label
            if len(set(current_labels)) == 1:
                merged_label = current_labels[0]
            else:
                priority_order = ['B-LOC', 'I-LOC', 'E-LOC', 'S-LOC', 'O']
                merged_label = next(label for label in priority_order if label in current_labels)

            # Handle location merging
            if merged_label.endswith('-LOC'):
                if merged_label == 'B-LOC' or merged_label == 'S-LOC':
                    if location_buffer:
                        merged_words.append(' '.join(location_buffer))
                        merged_labels.append('B-LOC')
                        location_buffer = []
                    location_buffer.append(merged_word)
                elif merged_label == 'I-LOC' or merged_label == 'E-LOC':
                    location_buffer.append(merged_word)
                    if merged_label == 'E-LOC':
                        merged_words.append(' '.join(location_buffer))
                        merged_labels.append('B-LOC')
                        location_buffer = []
            else:
                if location_buffer:
                    merged_words.append(' '.join(location_buffer))
                    merged_labels.append('B-LOC')
                    location_buffer = []
                merged_words.append(merged_word)
                merged_labels.append(merged_label)

            # Reset for next word
            current_word = []
            current_labels = []

    # Handle any remaining location in the buffer
    if location_buffer:
        merged_words.append(' '.join(location_buffer))
        merged_labels.append('B-LOC')

    return list(zip(merged_words, merged_labels))

# # Usage
# merged_result = merge_subwords_and_locations(predicted_tokens)

# # Extract locations
# locations = [word for word, label in merged_result if label == 'B-LOC']
# print("\nExtracted locations:", locations)

In [20]:
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    predictions = torch.argmax(logits, dim=2)
    predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]

    tokens = []
    predicted_tokens = []

    locations = []
    current_location = []

    for token, prediction in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), predictions[0]):
        if int(prediction) == 0:  # Beginning of a new location
            current_location = [token]
        elif int(prediction) == 2:  # Inside a location
            if current_location:  # Make sure we started a location
                current_location.append(token)
        elif int(prediction) == 1:  # End of a location
            if current_location:  # Make sure we're inside a location
                current_location.append(token)
                locations.append(" ".join(current_location))
                current_location = []
        elif int(prediction) == 4:  # Single token location
            locations.append(token)
        else:
            current_location = []  # Reset if prediction is 'O' or anything else

        # Remove special tokens and clean up the text
        if token not in ['<s>', '</s>', '<unk>']:
            cleaned_token = token[1:] if token.startswith('Ġ') else token

            if token.startswith('##'):
                if predicted_tokens:
                    predicted_tokens[-1] = (predicted_tokens[-1][0] + cleaned_token, predicted_tokens[-1][1])
                continue

            tokens.append(cleaned_token)
            predicted_tokens.append((cleaned_token, id2label[str(prediction.item())]))

    # Usage
    merged_result = merge_subwords_and_locations(predicted_tokens)

    # Extract locations
    locations = [word for word, label in merged_result if label == 'B-LOC']

    # Extract unique locations and sort alphabetically
    unique_locations = sorted(set(locations))

    return unique_locations, tokens, predictions, predicted_tokens

In [21]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

test = pd.read_csv("/kaggle/input/zindi-learn-location-mention-recognition-challenge/Test.csv")

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import nltk
nltk.data.path.append('/usr/share/nltk_data/')

def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '<URL>', text, flags=re.MULTILINE)

    # Remove user mentions
    text = re.sub(r'@\w+', '', text)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z0-9\s\./\-_]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

#     # Lemmatize
#     lemmatizer = WordNetLemmatizer()
#     tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a string
    processed_text = ' '.join(tokens)

    return processed_text

# Apply preprocessing to each text in your dataset
test['processed_text'] = test['text'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
import pandas as pd

submission = []

for index, row in test.iterrows():
    if index % 100 == 0:
        print(f"Processing row {index}")

    id = row['tweet_id']
    processed_text = row['processed_text']

    unique_locations, tokens, predictions, predicted_tokens = predict(processed_text)

    # Join locations with space, or use a single space if no locations
    locations_string = ' '.join(unique_locations) if unique_locations else ' '

    submission.append({'ID': id, 'Locations': locations_string})

# Create DataFrame from submission list
submission_df = pd.DataFrame(submission)

# Save to CSV
submission_df.to_csv('submission.csv', index=False)

Processing row 0
Processing row 100
Processing row 200
Processing row 300
Processing row 400
Processing row 500
Processing row 600
Processing row 700
Processing row 800
Processing row 900
Processing row 1000
Processing row 1100
Processing row 1200
Processing row 1300
Processing row 1400
Processing row 1500
Processing row 1600
Processing row 1700
Processing row 1800
Processing row 1900
Processing row 2000
Processing row 2100
Processing row 2200
Processing row 2300
Processing row 2400
Processing row 2500
Processing row 2600
Processing row 2700
Processing row 2800
Processing row 2900


In [23]:
submission_df

Unnamed: 0,ID,Locations
0,ID_1001154804658286592,New England New Orleans
1,ID_1001155505459486720,ELLICOTT CITY MARYLAND
2,ID_1001155756371136512,Ellicott City Maryland
3,ID_1001159445194399744,Ellicott City Maryland Md
4,ID_1001164907587538944,Ellicott City Maryland
...,...,...
2937,ID_915017703055749120,Mexico
2938,ID_915026957758328832,Las Vegas Mexico
2939,ID_915253441726889984,Calgary Mexico City
2940,ID_915971980859400192,Chiapas Mexicos Oaxaca


In [24]:
!head -n 5 /kaggle/working/submission.csv

  pid, fd = os.forkpty()


ID,Locations
ID_1001154804658286592,New England New Orleans
ID_1001155505459486720,ELLICOTT CITY MARYLAND
ID_1001155756371136512,Ellicott City Maryland
ID_1001159445194399744,Ellicott City Maryland Md
