# Real or Not? NLP with Disaster Tweets

Kaggle competition. Link: https://www.kaggle.com/c/nlp-getting-started/overview

## Table of Contents

1. [Set-up](#Set-up)
2. [Data Analysis](#Data-Analysis)
2. [Feature Engineering Functions](#Feature-Engineering-Functions)
4. [Data Preparation and Machine Learning](#Data-Preparation-and-Machine-Learning)

## Set-up

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# import nltk
# nltk.download('vader_lexicon')

In [3]:
import xgboost as xgb

In [29]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', -1)

In [5]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

In [6]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


## Data Analysis

In [7]:
df_test = train_df.copy()

In [8]:
df_test.isnull().sum()

id          0   
keyword     61  
location    2533
text        0   
target      0   
dtype: int64

In [9]:
print(df_test['target'].value_counts())
print(df_test['target'].value_counts(1))

0    4342
1    3271
Name: target, dtype: int64
0    0.57034
1    0.42966
Name: target, dtype: float64


Let's look at hastags

In [10]:
df_test['hashtag'] = df_test['text'].apply(lambda tweet: '#' in tweet)
df_test.head()

Unnamed: 0,id,keyword,location,text,target,hashtag
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,True
1,4,,,Forest fire near La Ronge Sask. Canada,1,False
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,False
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,True
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,True


In [11]:
print(df_test[df_test['hashtag']]['target'].value_counts(1))
print(df_test[~df_test['hashtag']]['target'].value_counts(1))

0    0.503123
1    0.496877
Name: target, dtype: float64
0    0.590567
1    0.409433
Name: target, dtype: float64


Let's look at if there's any punctuation (right now just . , ' " ; :)

In [12]:
df_test['punctuation'] = df_test['text'].apply(lambda tweet: '.' in tweet or ',' in tweet or "," in tweet or '"' in tweet or ';' in tweet or ':' in tweet)
df_test.head()

Unnamed: 0,id,keyword,location,text,target,hashtag,punctuation
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,True,False
1,4,,,Forest fire near La Ronge Sask. Canada,1,False,True
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,False,True
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,True,True
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,True,False


In [13]:
df_test['text'].iloc[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [14]:
print(df_test[df_test['punctuation']]['target'].value_counts(1))
print(df_test[~df_test['punctuation']]['target'].value_counts(1))

0    0.519102
1    0.480898
Name: target, dtype: float64
0    0.756231
1    0.243769
Name: target, dtype: float64


Let's look at some numerical stuff

In [15]:
df_test['tweet length'] = df_test['text'].apply(lambda tweet: len(tweet))
df_test.head()

Unnamed: 0,id,keyword,location,text,target,hashtag,punctuation,tweet length
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,True,False,69
1,4,,,Forest fire near La Ronge Sask. Canada,1,False,True,38
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,False,True,133
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,True,True,65
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,True,False,88


In [16]:
df_test['number of words'] = df_test['text'].apply(lambda tweet: len(tweet.split(' ')))
df_test.head()

Unnamed: 0,id,keyword,location,text,target,hashtag,punctuation,tweet length,number of words
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,True,False,69,13
1,4,,,Forest fire near La Ronge Sask. Canada,1,False,True,38,7
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,False,True,133,22
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,True,True,65,9
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,True,False,88,17


In [17]:
df_test['number of sentences'] = df_test['text'].apply(lambda tweet: len(tweet.split('.')))
df_test.head()

Unnamed: 0,id,keyword,location,text,target,hashtag,punctuation,tweet length,number of words,number of sentences
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,True,False,69,13,1
1,4,,,Forest fire near La Ronge Sask. Canada,1,False,True,38,7,2
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,False,True,133,22,2
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,True,True,65,9,1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,True,False,88,17,1


In [18]:
df_test['text'].iloc[1]

'Forest fire near La Ronge Sask. Canada'

In [19]:
df_test.corr()['target']

id                     0.060781
target                 1.000000
hashtag                0.074486
punctuation            0.197150
tweet length           0.181817
number of words        0.040862
number of sentences    0.156265
Name: target, dtype: float64

In [20]:
df_test.corr()

Unnamed: 0,id,target,hashtag,punctuation,tweet length,number of words,number of sentences
id,1.0,0.060781,-0.008026,0.012285,0.017393,0.005267,0.018247
target,0.060781,1.0,0.074486,0.19715,0.181817,0.040862,0.156265
hashtag,-0.008026,0.074486,1.0,0.12603,0.208775,0.055172,0.070796
punctuation,0.012285,0.19715,0.12603,1.0,0.40342,0.16868,0.517039
tweet length,0.017393,0.181817,0.208775,0.40342,1.0,0.831328,0.401347
number of words,0.005267,0.040862,0.055172,0.16868,0.831328,1.0,0.223232
number of sentences,0.018247,0.156265,0.070796,0.517039,0.401347,0.223232,1.0


In [35]:
# df_test[df_test['target']==0]['text'].sample(100)

In [36]:
# df_test[df_test['target']==1]['text'].sample(100)

## Feature Engineering Functions

In [23]:
def add_substring_in_tweet_column(substring, df):
    """Function will add new column to df.
    New column will be of boolean type: True if substring is in that row's tweet, False otherwise.
    Note: if substring is all lowercase, will assume case does not matter and so will search if substring is in
    lowercase version of tweet. If case matters, then substring must contain at least one capital. This is an assumption."""
    
    new_column_name = "'" + substring + "' in tweet?" # can tell if case matters if substring has capitals
    match_case = not substring == substring.lower()
    
    if match_case:
        df[new_column_name] = df['text'].apply(lambda tweet: substring in tweet)
    
    else:
        df[new_column_name] = df['text'].apply(lambda tweet: substring in tweet.lower())
        
# add_substring_in_tweet_column('.', df_test)
# add_substring_in_tweet_column('our', df_test)
# add_substring_in_tweet_column('fire', df_test)
# display(df_test.head())

In [24]:
def add_sentiment_analysis_column(df, method):
    """Takes in a dataframe, df, and adds sentiment analysis columns to it using a specified method.
    If no correct method is specified, nothing will be added to the dataframe.
    Columns add are: 'neg', 'neu', 'pos' and 'compound'.
    Possible methods: nltk_vader
    """
    
    if method == 'nltk_vader':
        sid = SentimentIntensityAnalyzer()
        df['sentiment'] = df['text'].apply(sid.polarity_scores)
        df['neg'] = df['sentiment'].apply(lambda sent: sent['neg'])
        df['neu'] = df['sentiment'].apply(lambda sent: sent['neu'])
        df['pos'] = df['sentiment'].apply(lambda sent: sent['pos'])
        df['compound'] = df['sentiment'].apply(lambda sent: sent['compound'])
        df.drop('sentiment', axis=1, inplace=True)
        
    else:
        print('No valid method entered - no new columns will be added to the dataframe')

In [165]:
def add_spacy_and_other_numeric_columns(df):
    """Adds columns to a dataframe, df, that use the library spacy.
    Since spacy provides some interesting numeric results which will be added as columns,
    this function will also add the other numeric columns as well, since they may depend on these results."""
    
    # First, create a doc column for each tweet (i.e. tokenize each tweet)
    #df['doc'] = df['text'].apply(lambda tweet: nlp(tweet))
    generator = nlp.pipe(df['text'])
    spacy_dict = {'number of tokens':[], 'number of sentences':[], 'number of words':[],
                  'number of nouns':[], 'number of verbs':[], 'number of adjectives':[],
                  'number of adverbs':[], 'number of pronouns':[], 'number of punctuation':[]}
    
    for doc in generator:
        # Number of tokens (not sure if it's that useful)
        spacy_dict['number of tokens'].append(len(doc))
        
        # Number of sentences
        num_sentences = len([sentence for sentence in doc.sents])
        spacy_dict['number of sentences'].append(num_sentences)
        
        # Number of words (including stop words)
        num_words = len([token.text for token in doc if token.is_punct != True])
        spacy_dict['number of words'].append(num_words)

        # Number of nouns (use first line if don't want stop words or punctuations)
        #num_nouns = len([token.text for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ == "NOUN"])
        num_nouns = len([token.text for token in doc if token.pos_ == "NOUN"])
        spacy_dict['number of nouns'].append(num_nouns)
        
        # Number of verbs
        num_verbs = len([token.text for token in doc if token.pos_ == "VERB"])
        spacy_dict['number of verbs'].append(num_verbs)
        
        # Number of adjectives
        num_adj = len([token.text for token in doc if token.pos_ == "ADJ"])
        spacy_dict['number of adjectives'].append(num_adj)
        
        # Number of adverbs
        num_adverbs = len([token.text for token in doc if token.pos_ == "ADV"])
        spacy_dict['number of adverbs'].append(num_adverbs)
        
        # Number of pronouns
        num_pronouns = len([token.text for token in doc if token.pos_ == "PRON"])
        spacy_dict['number of pronouns'].append(num_pronouns)
        
        # Number of punctuation symbols used 
        # (Note: not sure if hastags are included - first tweet hashtag is considered a noun)
        # Could maybe use is_punct instead??
        num_punct = len([token.text for token in doc if token.pos_ == "PUNCT"])
        spacy_dict['number of punctuation'].append(num_punct)
        
    
    spacy_df = pd.DataFrame(data=spacy_dict)
    df = pd.concat([df, spacy_df], axis=1)
    
    
    df['tweet length'] = df['text'].apply(lambda tweet: len(tweet))
    df['word length (including whitespace)'] = df['tweet length'] / df['number of words']
    df['word per sentence'] = df['number of words'] / df['number of sentences']
    df['total hastags'] = df['text'].apply(lambda tweet: len(re.findall('#', tweet)))
    
    return df

In [167]:
# %%time
# df_test = train_df.copy()
# df_test = add_spacy_and_other_numeric_columns(df_test)
# display(df_test.head(3))
# display(df_test.corr()['target'])

In [168]:
# test_tweet = train_df['text'].iloc[0]
# print(test_tweet)
# test_doc = nlp(test_tweet)
# for token in test_doc:
#     print(token.text, token.pos_, token.tag_)
# print([token.text for token in test_doc if token.is_space == True])

## Data Preparation and Machine Learning

In [169]:
cols_to_drop = ['id', 'keyword', 'location', 'text']
substring_list = ['#', '.', ',', ':', "'", 'http']
spacy_and_num_cols = True
sentiment_analysis_method = 'nltk_vader'
encode_keyword = False
vectorize = False # very slow
spacy_cols = True

ml_model = 'ridge'
nearest_neighbors = 1

In [173]:
def prepare_dataframe_for_machine_learning(dataframe=train_df):
    """Prepares a dataframe for machine learning.
    Based on constants specified in the cell above, this function may add columns to the dataframe based on:
    keyword search, numeric calculations, sentiment analysis, one hot encoding the keywords,
    and/or vectorizing the words.
    It wil then drop columns not needed for machine learning.
    """
    
    # Prepare a copy, so that experimenting is easy
    df = dataframe.copy()
    
    # Add substring columns
    for substring in substring_list:
        add_substring_in_tweet_column(substring, df)
        
    # Add spacy and numeric columns, if true
    if spacy_cols:
        df = add_spacy_and_other_numeric_columns(df)
    
    # Add sentiment analysis
    add_sentiment_analysis_column(df, method=sentiment_analysis_method)
    
    # Encode keyword, if true
    if encode_keyword:
        encoded_keys = pd.get_dummies(df['keyword'])
        df = pd.concat([df, encoded_keys], axis=1)
        
    # Vectorize tweets, if true
    if vectorize:
        count_vectorizer = feature_extraction.text.CountVectorizer()
        vectors = count_vectorizer.fit_transform(df["text"])
        vectors = pd.DataFrame(vectors.todense())
        df = pd.concat([df, vectors], axis=1)
    
    
    # Drop columns not wanted for machine learning
    df.drop(cols_to_drop, axis=1, inplace=True)
    
    # Return prepared dataframe
    return df

prepared_df = prepare_dataframe_for_machine_learning(train_df)
display(prepared_df.head())
display(prepared_df.corr()['target'])

Unnamed: 0,target,'#' in tweet?,'.' in tweet?,"',' in tweet?",':' in tweet?,''' in tweet?,'http' in tweet?,number of tokens,number of sentences,number of words,number of nouns,number of verbs,number of adjectives,number of adverbs,number of pronouns,number of punctuation,tweet length,word length (including whitespace),word per sentence,total hastags,neg,neu,pos,compound
0,1,True,False,False,False,False,False,14,2,13,4,3,0,0,1,0,69,5.307692,6.5,1,0.0,0.851,0.149,0.2732
1,1,False,True,False,False,False,False,8,2,7,2,0,0,0,0,1,38,5.428571,3.5,0,0.286,0.714,0.0,-0.34
2,1,False,True,False,False,True,False,25,2,22,7,4,1,0,0,3,133,6.045455,11.0,0,0.095,0.905,0.0,-0.296
3,1,True,False,True,False,False,False,9,1,8,5,1,0,0,0,0,65,8.125,8.0,1,0.0,1.0,0.0,0.0
4,1,True,False,False,False,False,False,18,1,16,4,3,0,1,0,0,88,5.5,16.0,2,0.0,1.0,0.0,0.0


target                                1.000000
'#' in tweet?                         0.074486
'.' in tweet?                         0.200136
',' in tweet?                         0.018677
':' in tweet?                         0.254143
''' in tweet?                        -0.107381
'http' in tweet?                      0.247440
number of tokens                      0.023198
number of sentences                  -0.032496
number of words                       0.031810
number of nouns                       0.163698
number of verbs                      -0.067857
number of adjectives                  0.032572
number of adverbs                    -0.119653
number of pronouns                   -0.238146
number of punctuation                -0.039848
tweet length                          0.181817
word length (including whitespace)    0.182174
word per sentence                     0.067913
total hastags                         0.058115
neg                                   0.124244
neu          

In [None]:
def evaluate_model_and_prepare_file(prepared_dataframe=prepared_df, ml_model=ml_model, submission=False, 
                                    train_df=train_df, test_df=test_df, nearest_neighbors=None):
    """Takes in a prepared dataframe and a machine learning model, and returns the F1 and accuracy 
    cross-validation scores for when that model is fit to the prepared dataframe. 
    (Uses a prepared dataframe to save time)
    
    If submission is set to true, it will fit the model and prepare the file to submit in the format 
    required by the contest (scores will not be returned). This will use the original, not prepared
    dataframes train_df and test_df.
    
    If no correct model is specified, no scores will be returned.
    
    Possible ml_models: ridge (Ridge Classifier), knn (K Nearest Neighbors), dt (Decision Tree), xgb (XGBoost)
    nb (Naive Bayes - all values must be non-negative)
    """
    
    # Set up machine learning model based on ml_model
    
    if ml_model == 'ridge':
        clf = linear_model.RidgeClassifier()
    
    if ml_model == 'knn':
        clf = KNeighborsClassifier(n_neighbors=nearest_neighbors)
        
    if ml_model == 'dt':
        clf = DecisionTreeClassifier()
        
    if ml_model == 'nb':
        clf = MultinomialNB()
    
    if ml_model == 'xgb':
#         params = {'objective': 'binary:logistic', 'max_depth': 2, 'learning_rate': 1.0, 
#                   'silent': True, 'n_estimators': 5}
#         clf = XGBClassifier(**params)
        clf = XGBClassifier()
    
    
    try:    
        
        if submission:
            # Prepare file for submission
            
            # Make a copy of the dataframes
            train = train_df.copy()
            test = test_df.copy()

            # Prepare the dataframes for machine learning
            prepared_train = prepare_dataframe_for_machine_learning(train)
            prepared_test = prepare_dataframe_for_machine_learning(test)

            # Perform machine learning and save the submission file
            clf.fit(prepared_train.drop("target", axis=1), prepared_train["target"])
            sample_submission = pd.read_csv("../data/sample_submission.csv")
            sample_submission["target"] = clf.predict(prepared_test)
            sample_submission.to_csv("../submission.csv", index=False)
        
        else: 
            # Return f1 scores
            df = prepared_dataframe.copy()
            f1_scores = model_selection.cross_val_score(clf, df.drop("target", axis=1),
                                                        df["target"], cv=3, scoring="f1")
#             accuracy_scores = model_selection.cross_val_score(clf, df.drop("target", axis=1),
#                                                               df["target"], cv=3, scoring="accuracy")
            print('F1 Scores:')
            print(f1_scores)
#             print()
#             print('Accuracy Scores:')
#             print(accuracy_scores)  
    
    except:
        print('No valid model entered!')
        

        
for model in ['ridge', 'knn', 'dt', 'xgb']:
    print('Model:', model)
    evaluate_model_and_prepare_file(prepared_dataframe=prepared_df, ml_model=model, nearest_neighbors=nearest_neighbors, submission=False)
    print()

Model: ridge
F1 Scores:
[0.61297963 0.59414991 0.60194175]

Model: knn
F1 Scores:
[0.4997669  0.50480989 0.53697183]

Model: dt
F1 Scores:
[0.52597403 0.5106383  0.53345471]

Model: xgb


In [172]:
# evaluate_model_and_prepare_file(train_df=train_df, test_df=test_df, ml_model=ml_model, submission=True)