# Real or Not? NLP with Disaster Tweets

Kaggle competition. Link: https://www.kaggle.com/c/nlp-getting-started/overview

## Table of Contents

1. [Set-up](#Set-up)
2. [Data Analysis](#Data-Analysis)
2. [Feature Engineering Functions](#Feature-Engineering-Functions)
4. [Data Preparation and Machine Learning](#Data-Preparation-and-Machine-Learning)

## Set-up

In [1]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [47]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Data Analysis

In [4]:
df_test = train_df.copy()

In [5]:
df_test.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
print(df_test['target'].value_counts())
print(df_test['target'].value_counts(1))

0    4342
1    3271
Name: target, dtype: int64
0    0.57034
1    0.42966
Name: target, dtype: float64


Let's look at hastags

In [7]:
df_test['hashtag'] = df_test['text'].apply(lambda tweet: '#' in tweet)
df_test.head()

Unnamed: 0,id,keyword,location,text,target,hashtag
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,True
1,4,,,Forest fire near La Ronge Sask. Canada,1,False
2,5,,,All residents asked to 'shelter in place' are ...,1,False
3,6,,,"13,000 people receive #wildfires evacuation or...",1,True
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,True


In [8]:
print(df_test[df_test['hashtag']]['target'].value_counts(1))
print(df_test[~df_test['hashtag']]['target'].value_counts(1))

0    0.503123
1    0.496877
Name: target, dtype: float64
0    0.590567
1    0.409433
Name: target, dtype: float64


Let's look at if there's any punctuation (right now just . , ' " ; :)

In [9]:
df_test['punctuation'] = df_test['text'].apply(lambda tweet: '.' in tweet or ',' in tweet or "," in tweet or '"' in tweet or ';' in tweet or ':' in tweet)
df_test.head()

Unnamed: 0,id,keyword,location,text,target,hashtag,punctuation
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,True,False
1,4,,,Forest fire near La Ronge Sask. Canada,1,False,True
2,5,,,All residents asked to 'shelter in place' are ...,1,False,True
3,6,,,"13,000 people receive #wildfires evacuation or...",1,True,True
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,True,False


In [10]:
df_test['text'].iloc[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [11]:
print(df_test[df_test['punctuation']]['target'].value_counts(1))
print(df_test[~df_test['punctuation']]['target'].value_counts(1))

0    0.519102
1    0.480898
Name: target, dtype: float64
0    0.756231
1    0.243769
Name: target, dtype: float64


Let's look at some numerical stuff

In [12]:
df_test['tweet length'] = df_test['text'].apply(lambda tweet: len(tweet))
df_test.head()

Unnamed: 0,id,keyword,location,text,target,hashtag,punctuation,tweet length
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,True,False,69
1,4,,,Forest fire near La Ronge Sask. Canada,1,False,True,38
2,5,,,All residents asked to 'shelter in place' are ...,1,False,True,133
3,6,,,"13,000 people receive #wildfires evacuation or...",1,True,True,65
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,True,False,88


In [13]:
df_test.corr()

Unnamed: 0,id,target,hashtag,punctuation,tweet length
id,1.0,0.060781,-0.008026,0.012285,0.017393
target,0.060781,1.0,0.074486,0.19715,0.181817
hashtag,-0.008026,0.074486,1.0,0.12603,0.208775
punctuation,0.012285,0.19715,0.12603,1.0,0.40342
tweet length,0.017393,0.181817,0.208775,0.40342,1.0


## Feature Engineering Functions

In [14]:
def add_substring_in_tweet_column(substring, df):
    """Function will add new column to df.
    New column will be of boolean type: True if substring is in that row's tweet, False otherwise.
    Note: if substring is all lowercase, will assume case does not matter and so will search if substring is in
    lowercase version of tweet. If case matters, then substring must contain at least one capital. This is an assumption."""
    
    new_column_name = "'" + substring + "' in tweet?" # can tell if case matters if substring has capitals
    match_case = not substring == substring.lower()
    
    if match_case:
        df[new_column_name] = df['text'].apply(lambda tweet: substring in tweet)
    
    else:
        df[new_column_name] = df['text'].apply(lambda tweet: substring in tweet.lower())
        
# add_substring_in_tweet_column('.', df_test)
# add_substring_in_tweet_column('our', df_test)
# add_substring_in_tweet_column('fire', df_test)
# display(df_test.head())

In [15]:
def sentiment_analyze_tweet(df, method):
    pass

## Data Preparation and Machine Learning

In [64]:
cols_to_drop = ['id', 'keyword', 'location', 'text']
substring_list = ['#', '.', ',', ';', ':', "'"]
numeric_columns = [] # we'll see, could just be boolean
sentiment_analysis = True
encode_keyword = False

In [65]:
def prepare_dataframe_for_machine_learning(dataframe=train_df):
    
    # Prepare a copy, so that experimenting is easy
    df = dataframe.copy()
    
    # Add substring columns
    for substring in substring_list:
        add_substring_in_tweet_column(substring, df)
        
    # Add numeric columns
    for col in numeric_columns:
        pass
    df['tweet length'] = df['text'].apply(lambda tweet: len(tweet))
    
    # Add sentiment analysis, if true
    if sentiment_analysis:
        sentiment_analyze_tweet(df, method=None)
        
    # Encode keyword, if true
    if encode_keyword:
        encoded_keys = pd.get_dummies(df['keyword'])
        df = pd.concat([df, encoded_keys], axis=1)
    
    # Drop columns not wanted for machine learning
    df.drop(cols_to_drop, axis=1, inplace=True)
    
    # Return prepared dataframe
    return df

prepared_df = prepare_dataframe_for_machine_learning(train_df)
display(prepared_df.head())

Unnamed: 0,target,'#' in tweet?,'.' in tweet?,"',' in tweet?",';' in tweet?,':' in tweet?,''' in tweet?,tweet length
0,1,True,False,False,False,False,False,69
1,1,False,True,False,False,False,False,38
2,1,False,True,False,False,False,True,133
3,1,True,False,True,False,False,False,65
4,1,True,False,False,False,False,False,88


In [66]:
prepared_df.corr()['target']

target           1.000000
'#' in tweet?    0.074486
'.' in tweet?    0.200136
',' in tweet?    0.018677
';' in tweet?    0.000262
':' in tweet?    0.254143
''' in tweet?   -0.107381
tweet length     0.181817
Name: target, dtype: float64

In [57]:
# From the example, will play around with later
clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(clf, prepared_df.drop('target', axis=1),
                                         prepared_df["target"], cv=3, scoring="f1")
print(scores)

[0.56651584 0.57496683 0.5531281 ]


In [58]:
# For scoring, just write in word
scores = model_selection.cross_val_score(clf, prepared_df.drop('target', axis=1),
                                         prepared_df["target"], cv=3, scoring="accuracy")
print(scores)

[0.6226861  0.62120615 0.6452503 ]
