# Competition Description
Twitter has become an important communication channel in times of emergency.
The ubiquitousness of smartphones enables people to announce an emergency they’re observing in real-time. Because of this, more agencies are interested in programatically monitoring Twitter (i.e. disaster relief organizations and news agencies).\
Data from : https://www.kaggle.com/c/nlp-getting-started/data


In [18]:
#imports
import spacy
from spacy.util import minibatch
from spacy.training.example import Example
from spacy.matcher import PhraseMatcher
import numpy as np
import seaborn as sns
import pandas as pd
import random
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


# Choices on what to do next
- Simplest: only use text and train a text categorizer model
- See if we can use the location along with text to model, plot locations with folium and see if there is a relation visually
- Keyword+ Text , since keyword has fewer NAN values, we  can drop the NAN rows and train the textcat
- Toughest : use all together

## Lets try the simplest way first

In [None]:
train.keyword.isna().sum()

61

In [8]:
train.location.isna().sum()


2533

In [9]:
len(train.index)

7613

In [17]:
nlp = spacy.blank('en')
textcat = nlp.add_pipe('textcat')
textcat.add_label('POSITIVE')
textcat.add_label('NEGATIVE')
train_texts = train['text'].values
train_labels= [{'cats': {'POSITIVE': label == 1,
                          'NEGATIVE': label == 0}} 
                for label in train['target']]
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
  {'cats': {'POSITIVE': True, 'NEGATIVE': False}}),
 ('Forest fire near La Ronge Sask. Canada',
  {'cats': {'POSITIVE': True, 'NEGATIVE': False}}),
 ("All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
  {'cats': {'POSITIVE': True, 'NEGATIVE': False}})]

In [19]:
random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        for text, labels in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, labels)
            nlp.update([example], sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 3353.3135722425004}
{'textcat': 5880.140361419722}
{'textcat': 7831.173640969229}
{'textcat': 9316.02437401283}
{'textcat': 10358.054948272966}
{'textcat': 11150.57298633356}
{'textcat': 11773.657200593458}
{'textcat': 12282.492342691965}
{'textcat': 12713.695225671869}
{'textcat': 13139.337311646255}


In [20]:
def evaluate(model, texts, labels):
    """ Returns the accuracy of a TextCategorizer model. 
    
        Arguments
        ---------
        model: ScaPy model with a TextCategorizer
        texts: Text samples, from load_data function
        labels: True labels, from load_data function
    
    """
    # Get predictions from textcat model (using your predict method)
    predicted_class = predict(model,texts)
    
    # From labels, get the true class as a list of integers (POSITIVE -> 1, NEGATIVE -> 0)
    true_class = [int(each['cats']['POSITIVE']) for each in labels]
    
    # A boolean or int array indicating correct predictions
    correct_predictions = predicted_class == true_class
    
    # The accuracy, number of correct predictions divided by all predictions
    accuracy = correct_predictions.mean()
    
    return accuracy




In [24]:
def predict(nlp, texts): 
    # Use the model's tokenizer to tokenize each input text
    docs = [nlp.tokenizer(text) for text in texts]
    
    # Use textcat to get the scores for each doc
    textcat = nlp.get_pipe('textcat')
    scores = textcat.predict(docs)
    # From the scores, find the class with the highest score/probability
    return scores


In [25]:
test = pd.read_csv('test.csv')

In [26]:
predictions = predict(nlp, test['text'])


In [28]:
predictions

array([[9.9800354e-01, 1.9964848e-03],
       [9.9998617e-01, 1.3815589e-05],
       [8.2105666e-01, 1.7894329e-01],
       ...,
       [9.4556165e-01, 5.4438304e-02],
       [9.9914706e-01, 8.5290225e-04],
       [9.9983609e-01, 1.6384826e-04]], dtype=float32)

In [29]:
target=[]
for value1, value2 in predictions:
    if value1>value2:
        target.append(1)
    else:
        target.append(0)

In [30]:
submission_data = {'id': test.id,'target':target}
submission_df = pd.DataFrame(submission_data)
submission_df.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [32]:
submission_df.to_csv('submission.csv', index = False)

## Got a score of 0.74134 on kaggle , so now lets try to improve it