# Final function

### import packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import os
import itertools
from collections import Counter
import re
import string

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
import contractions
from wordcloud import WordCloud

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import f1_score
from sklearn import metrics

from joblib import dump, load

In [2]:
# Shared text-processing resources used by the cleaning helpers below.

# all ASCII punctuation characters, as a single string
punc = string.punctuation

# NLTK English stopwords plus a few extra tokens to drop
# (presumably artefacts of tokenizing/lemmatizing tweets, e.g. 'wa', 'ha', 'amp' -- confirm)
stpwrd = [*stopwords.words('english'), '...', 'wa', 'ha', 'via', "'s", 'amp']

# token normalisers: one lemmatizer and two stemmers
lemmatizer = WordNetLemmatizer()
p_stemmer = PorterStemmer()
l_stemmer = LancasterStemmer()

In [3]:
# read the train and test tweet tables from CSV files in the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train_dt.csv', 'test_dt.csv'))

## data cleaning and feature engineering function

In [4]:
# Restore the fitted classifier, the two vectorizers and the scaler from disk.
# NOTE: joblib files are pickle-based -- only load artefacts from a trusted source.
clf, hashtags_vec, text_vec, scaler = map(
    load,
    ('clf.joblib', 'hashtags_vec.joblib', 'text_vec.joblib', 'scaler.joblib'),
)

In [5]:
#a function to find all hashtags in text
def find_hashtag(text):
    """Return the hashtags found in ``text`` as one space-separated string.

    The text is lower-cased first. A hashtag is a ``#`` immediately followed
    by one or more word characters; the ``#`` itself is not kept. When no
    hashtag is present the result is an empty string.
    """
    lowered = text.lower()
    return " ".join(match.group(1) for match in re.finditer(r'#(\w+)', lowered))

#a function to clean the text
def clean_text_(text,stemmer):
    """Normalise a raw tweet into a space-separated string of cleaned tokens.

    Steps, in order: lower-case the text; delete URLs, @usernames, digits,
    t.co links, hashtags and non-ASCII runs; expand contractions word by
    word; tokenize; apply ``stemmer`` to every token; drop stopwords and
    punctuation.

    Parameters
    ----------
    text : str
        Raw tweet text.
    stemmer : callable
        Called with a single token and must return the normalised token,
        e.g. ``lemmatizer.lemmatize``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    # Patterns are applied in this order. All of them are raw strings so the
    # regex escapes (\S, \s, \d, \w) are never read as string escapes.
    regexs = [r"(http|www)(\S+)", #remove URL
              r'(@.+?)\s', #remove twitter usernames (only when followed by whitespace)
              r'\d+', #remove numbers
              r'(//t.co/.+?)\s', #remove twitter shortened URLs
              r'#(\w+)', #remove hashtags
              r'[^\x00-\x7F]+' #remove non-ASCII characters
             ]
    # lower-casing once is enough: deleting matches cannot add upper case
    text=text.lower()
    for regex in regexs:
        text=re.sub(regex,"",text)
    #expand the contractions
    words=[contractions.fix(word) for word in text.split()]
    text=" ".join(words)
    #tokenize text (lower-cased again because the case of contractions.fix
    #output is not guaranteed from what is visible here)
    words=word_tokenize(text.lower())
    #stem or lemmatize the tokens
    words=[stemmer(word) for word in words]
    #remove stopwords and punctuation
    #NOTE: `word not in punc` is a substring test against the punctuation string
    words=[word for word in words if word not in stpwrd and word not in punc]
    return " ".join(words)

#a function to apply word countvectorizer
def apply_vectorizer(df,col,vectorizer):
    """Vectorize one text column and return the result as a dense DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of the column passed to ``vectorizer.transform``.
    vectorizer : object
        Fitted vectorizer exposing ``transform`` (whose result has
        ``toarray``) and ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        One column per feature name, indexed like ``df``.
    """
    dense_matrix = vectorizer.transform(df[col]).toarray()
    return pd.DataFrame(
        dense_matrix,
        index=df.index,
        columns=vectorizer.get_feature_names_out(),
    )

#helper: ratio of two count Series that is safe for zero denominators
def _safe_ratio(numerator,denominator):
    """Return numerator/denominator element-wise, with 0.0 where the denominator is 0."""
    # zero denominators are masked to NaN first so the division never yields inf
    return numerator.div(denominator.where(denominator!=0)).fillna(0.0)


#a function combined with data cleaning and feature engineering
def clean_eng(train):
    """Turn raw tweets into the feature frame passed to the fitted classifier.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a ``text`` column of strings; other columns are ignored.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed like the input. Columns, in order: ``text_length``,
        ``nos_words``, ``perc_uppercase``, ``perc_punc``, ``perc_hashtags``,
        then the ``text_vec`` features of the cleaned text, then the
        ``hashtags_vec`` features of the hashtags. The columns named in
        ``scaler.feature_names_in_`` are replaced by their scaled values.

    Notes
    -----
    Relies on the module-level ``punc``, ``lemmatizer``, ``text_vec``,
    ``hashtags_vec`` and ``scaler`` objects. Empty or punctuation-only
    tweets get 0.0 for the percentage features instead of NaN.
    """
    train=train[['text']].copy()

    #find length of text
    train['text_length']=train['text'].map(len)

    #raw counts are only needed to derive the percentage features,
    #so they are kept as local Series instead of frame columns
    nos_uppercase=train['text'].apply(lambda text:sum(char.isupper() for char in text))
    nos_punc=train['text'].apply(lambda text:sum(char in punc for char in text))

    #find number of words not include punctuation
    train['nos_words']=train['text'].apply(
        lambda text:len([word for word in word_tokenize(text) if word not in punc]))

    #find percentage of upper case characters and punctuation in text
    train['perc_uppercase']=_safe_ratio(nos_uppercase,train['text_length'])
    train['perc_punc']=_safe_ratio(nos_punc,train['text_length'])

    #find all hashtags in text
    train['hashtags']=train['text'].apply(find_hashtag)

    #find percentage of hashtags in text (hashtags per word)
    nos_hashtags=train['hashtags'].apply(lambda text:len(text.split()))
    train['perc_hashtags']=_safe_ratio(nos_hashtags,train['nos_words'])

    #apply cleaning text function
    train['clean_text']=train['text'].apply(clean_text_,stemmer=lemmatizer.lemmatize)

    #explicit column lists (same order the features were created in) rather than
    #sniffing dtypes, which breaks when pandas infers a dedicated string dtype
    numeric_cols=['text_length','nos_words','perc_uppercase','perc_punc','perc_hashtags']
    text_cols=['hashtags','clean_text']
    train=train[numeric_cols+text_cols]

    #apply the word countvectorizer
    train_text=apply_vectorizer(train,'clean_text',text_vec)
    train_hashtags=apply_vectorizer(train,'hashtags',hashtags_vec)
    #join the data of numeric columns with the data of cleaned text and data of hashtags after apply vectorizer
    #column names present on both sides of the second join get 't' (left) / 'h' (right) appended
    train_join=train[numeric_cols]\
        .join(train_text)\
        .join(train_hashtags,lsuffix='t',rsuffix='h')

    #apply scaling to numeric columns that is not percentage
    scaler_cols=[col for col in numeric_cols if not col.startswith('perc')]

    #apply the fitted scaler and overwrite the unscaled columns
    scaled_features=scaler.transform(train[scaler_cols])
    scaled_features=pd.DataFrame(data=scaled_features,columns=scaler.feature_names_in_,index=train.index)
    for col in scaled_features:
        train_join[col]=scaled_features[col]

    return train_join

In [11]:
# my_tweet="Tornado is coming. We need help. Please help us. #SOS #disaster #Tornado #emergency"
# df_my_tweet=pd.DataFrame(data=[my_tweet],columns=['text'])
# df_my_tweet=clean_eng(df_my_tweet)
# clf.predict_proba(df_my_tweet)[:,1][0]

0.6457296231375262

## Final prediction function

Final function for prediction <br>
Input: tweet (str); cut off (float, defaults to 0.5) <br>
Output: probability that the tweet is about a real disaster (float); True if probability > cut off, False otherwise (bool) <br>

In [6]:
def predict_disaster_tweet(tweet,cut_off=0.5):
    """Score a single tweet with the fitted classifier.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    cut_off : float, default 0.5
        Threshold the probability must exceed to be flagged.

    Returns
    -------
    tuple of (float, bool)
        The value in column 1 of ``clf.predict_proba`` for the tweet (taken
        to be the "real disaster" class, per the notebook's description) and
        whether it is strictly greater than ``cut_off``. Both are builtin
        Python types rather than numpy scalars.
    """
    df_tweet=clean_eng(pd.DataFrame(data=[tweet],columns=['text']))
    #single row in, so row 0 / column 1; float() strips the numpy scalar type
    proba=float(clf.predict_proba(df_tweet)[0,1])
    return proba,proba>cut_off

The following cells demo the function on tweets that I wrote myself.

disaster tweets

In [7]:
tweet="Emergency situation. There is a wildfire. PLEASE SEND HELP. If no help, people will die. #SOS #WILDFIRE"
predict_disaster_tweet(tweet)

(0.7782116747187081, True)

In [8]:
tweet="SOS! Please send help. Emergency situation. People is suffering. #SOS #Emergency"
predict_disaster_tweet(tweet)

(0.5482150664041652, True)

non disaster tweets

In [9]:
tweet='Haha! Send help ar! Someone needs help ar!'
predict_disaster_tweet(tweet)

(0.13317355414206983, False)

In [10]:
tweet="Cry! He is dying! Call 911 la!"
predict_disaster_tweet(tweet)

(0.17018757993706637, False)

In [11]:
# a mix of hand-written tweets to score in one go
tweets = [
    "SOS",
    "Help us. Hurricane is happening. Many people are hurt. #emergency #911 #hurricane",
    "God save the queen",
    "Wildfire is happening. Send help. The fire is growing strongly. If no help, people die.",
]
for tweet in tweets:
    print(tweet)
    # prediction line followed by one blank separator line
    print(predict_disaster_tweet(tweet), end="\n\n")

SOS
(0.08814375797465712, False)

Help us. Hurricane is happening. Many people are hurt. #emergency #911 #hurricane
(0.5717238491318939, True)

God save the queen
(0.0750019085660341, False)

Wildfire is happening. Send help. The fire is growing strongly. If no help, people die.
(0.7194949112207033, True)



# Demo

The demo from my capstone presentation

In [12]:
# tweets used in the presentation demo
tweets = [
    "Omg, I was on the street and I saw a giant dog barking, I was so scared #doglife",
    "No milk in the shopps? deffo sign of the apocolipse! #endisneigh",
    "my shoes are full of salt! Please send help! this is worse than chernobyl! #FootDisaster #Sore",
    "OMG I just felt the worst earthquake ever, what is happening? #wtf",
]
for tweet in tweets:
    print(tweet)
    # prediction line followed by one blank separator line
    print(predict_disaster_tweet(tweet), end="\n\n")

Omg, I was on the street and I saw a giant dog barking, I was so scared #doglife
(0.3231025403944372, False)

No milk in the shopps? deffo sign of the apocolipse! #endisneigh
(0.18842359005460785, False)

my shoes are full of salt! Please send help! this is worse than chernobyl! #FootDisaster #Sore
(0.15275135156274358, False)

OMG I just felt the worst earthquake ever, what is happening? #wtf
(0.46850133798744703, False)



In [23]:
# more tongue-in-cheek tweets to probe the model
tweets = [
    "Danger, Danger. High voltage. When we touch and when we kiss",
    "my goldfish is on fire",
    "Im at Kaggle and everyone is panicing! some random trainee data scientist just came in and smashed our competition in less then a week #catastrophy",
    "Theres a Random Forest Fire! #TheBetterModel",
    "I just heard the ISS was smashed by some random bar and pieces are crashing back to earth #doomed ",
    "Oh no, pineapples are falling from the sky! #Apocalyse ",
    "Incredibly sad to hear about the ongoing conflict on Jupiter resulting in the death of 16 McNuggets. My heart goes out to the delicious victims #StopWar",
    "Ran out of coffee #endoftheworld ",
]
for tweet in tweets:
    print(tweet)
    # prediction line followed by one blank separator line
    print(predict_disaster_tweet(tweet), end="\n\n")

Danger, Danger. High voltage. When we touch and when we kiss
(0.13220105698386633, False)

my goldfish is on fire
(0.6385670502702395, True)

Im at Kaggle and everyone is panicing! some random trainee data scientist just came in and smashed our competition in less then a week #catastrophy
(0.3182729277507906, False)

Theres a Random Forest Fire! #TheBetterModel
(0.8384814278167484, True)

I just heard the ISS was smashed by some random bar and pieces are crashing back to earth #doomed 
(0.19630275230755156, False)

Oh no, pineapples are falling from the sky! #Apocalyse 
(0.2564231449932489, False)

Incredibly sad to hear about the ongoing conflict on Jupiter resulting in the death of 16 McNuggets. My heart goes out to the delicious victims #StopWar
(0.6432899159438875, True)

Ran out of coffee #endoftheworld 
(0.18701731973500244, False)

Danger, Danger. High voltage. When we touch and when we kiss
(0.13220105698386633, False)

my goldfish is on fire
(0.6385670502702395, True)

Im at K

## Apply model for competition

I apply my model to the test data (test.csv) from the Kaggle competition and create a submission file for the competition.

In [29]:
# build the feature frame for the competition tweets with the same
# clean_eng pipeline that predict_disaster_tweet uses for a single tweet
test_cleaned=clean_eng(test)

In [30]:
test_cleaned.head()

Unnamed: 0,text_length,nos_words,perc_uppercase,perc_punc,perc_hashtags,aa,aba,abandoned,abbswinston,abc,...,wx,yazidis,yesh,yonews,youtubeh,yugvani,yyc,yycstorm,zionism,zionist
0,0.18,0.16129,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.38,0.258065,0.015625,0.046875,0.111111,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.593333,0.612903,0.010417,0.020833,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.22,0.096774,0.05,0.075,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.253333,0.225806,0.088889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# class labels (not probabilities) for every row of the test feature frame
y_test_pred=clf.predict(test_cleaned)

In [32]:
# two-column submission table: the tweet id and the predicted label
df_submission = pd.DataFrame({'id': test['id'], 'target': y_test_pred})

In [33]:
df_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [34]:
# index=False: write only the id and target columns, without the row index
df_submission.to_csv('submission.csv',index=False)