In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from collections import Counter
import itertools

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk import word_tokenize
import spacy
from nltk.corpus import stopwords
import string

from textblob import TextBlob

In [2]:
DATA_PATH = 'data/'

TRAIN_FEATURES = os.path.join(DATA_PATH, 'train_text_cleaned.csv')

In [3]:
train_df = pd.read_csv(TRAIN_FEATURES, index_col=0, encoding='utf-8')

### Lemmatization and stop words elimination

In [4]:
# Loading model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatization with stopwords removal

def text_lem_stopwords(df=train_df, col='text_cleaned'):
    df['preprocessed_text'] = (
        df[col].apply(lambda x:
                      ' '.join([w.lemma_ for w in list(nlp(x)) if (w.is_stop == False)]))
    )
    return df

In [5]:
train_df = text_lem_stopwords(df=train_df, col='text_cleaned')
train_df

Unnamed: 0,text,target,text_cleaned,preprocessed_text
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,our deeds are the reason of this earthquake may allah forgive us all,deed reason earthquake allah forgive
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected,resident ask shelter place notify officer evacuation shelter place order expect
3,"13,000 people receive #wildfires evacuation orders in California",1,people receive wildfires evacuation orders in california,people receive wildfire evacuation order california
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,just got sent this photo from ruby alaska as smoke from wildfires pours into a school,get send photo ruby alaska smoke wildfire pour school
...,...,...,...,...
7515,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1,two giant cranes holding a bridge collapse into nearby homes,giant crane hold bridge collapse nearby home
7516,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1,the out of control wild fires in california even in the northern part of the state very troubling,control wild fire california northern state troubling
7517,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1,m utc km s of volcano hawaii,m utc km s volcano hawaii
7518,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1,police investigating after an e bike collided with a car in little portugal e bike rider suffered serious non life threatening injuries,police investigate e bike collide car little portugal e bike rider suffer non life threaten injury


### Feature extraction

 - sentence_count - total number of sentences
 - word_count - number of words in text
 - unique_word_count - number of unique words in text
 - unique_word_share - ratio between number of unique words and number of total words
 - stop_word_count - number of stop words in text
 - stop_word_share - ratio between number of stopwords and number of total words
 - url_count - number of urls in text
 - mean_word_length - average word length count in text
 - char_count - number of characters in text
 - punctuation_count - number of punctuations in text
 - hashtag_count - number of hashtags (#) in text
 - mention_count - number of mentions (@) in text

 - polarity_raw, polarity_preprocessed - polarity in raw and preprocessed text respectively using textblob
 - subjectivity_raw, subjectivity_preprocessed - subjectivity in raw and preprocessed text respectively using textblob
 - tags_count - “tagging” parts of speech

In [6]:
def sentence_count(tweet_df, col='text'):
    tweet_df['sentence_count'] = tweet_df[col].apply(lambda x: x.count('\n') + 1)
    return tweet_df

In [7]:
def word_count(tweet_df, col='text'):
    tweet_df['word_count'] = tweet_df[col].apply(lambda x: len(str(x).split()))
    return tweet_df

In [8]:
def unique_word_count(tweet_df, col='text'):
    tweet_df['unique_word_count'] = (
            tweet_df[col].apply(lambda x: len(set(str(x).split())))
    )
    return tweet_df

In [9]:
def unique_word_share(tweet_df, col='text'):
    tweet_df['unique_word_share'] = (
            tweet_df[col].apply(lambda x: len(set(x.split())) / len(x.split()) 
                                if len(x.split())!=0 else 0)
    )
    
    return tweet_df

In [10]:
stop_words = set(stopwords.words('english'))

def stop_word_count(tweet_df, col='text'):
    tweet_df['stop_word_count'] = (
        tweet_df[col]
            .apply(lambda x:
                   len([w for w in str(x).lower().split()
                        if w in stop_words]))
    )
    return tweet_df

In [11]:
def stop_word_share(tweet_df, col='text'):
    tweet_df['stop_word_share'] = (
            tweet_df[col].apply(lambda x:
                                len([w for w in str(x).lower().split()
                                     if w in stop_words]) / len(x.split())
                               if len(x.split())!=0 else 0)
    )
    return tweet_df

In [12]:
def url_count(tweet_df, col='text'):
    tweet_df['url_count'] = (
                tweet_df[col]
                    .apply(lambda x: 
                           len([w for w in str(x).lower().split() 
                            if 'http' in w or 'https' in w]))
    )
    return tweet_df

In [13]:
def mean_word_length(tweet_df, col='text'):
    tweet_df['mean_word_length'] = (
            tweet_df[col].apply(
                lambda x: np.mean([len(w) for w in str(x).split()]))
    )
    return tweet_df

In [14]:
def char_count(tweet_df, col='text'):
    tweet_df['char_count'] = (
            tweet_df[col].apply(lambda x: len(str(x)))
    )
    return tweet_df

In [15]:
def punctuation_count(tweet_df, col='text'):
    tweet_df['punctuation_count'] = (
            tweet_df[col].apply(
                lambda x: len([c for c in str(x) if c in string.punctuation]))
    )
    return tweet_df

In [16]:
def hashtag_count(tweet_df, col='text'):
    tweet_df['hashtag_count'] = (
            tweet_df[col].apply(
                lambda x: len([c for c in str(x) if c == '#']))
    )
    return tweet_df

In [17]:
def mention_count(tweet_df, col='text'):
    tweet_df['mention_count'] = (
            tweet_df[col].apply(
                lambda x: len([c for c in str(x) if c == '@']))
    )
    return tweet_df

In [18]:
def polarity_raw(tweet_df, col='text'):
    tweet_df['polarity_raw'] = (
            tweet_df[col].apply(
                lambda x: TextBlob(x).sentiment[0])
    )
    return tweet_df

In [19]:
def polarity(tweet_df, col='text'):
    tweet_df['polarity'] = (
            tweet_df[col].apply(
                lambda x: TextBlob(x).sentiment[0])
    )
    return tweet_df

In [20]:
def subjectivity_raw(tweet_df, col='text'):
    tweet_df['subjectivity_raw'] = (
            tweet_df[col].apply(
                lambda x: TextBlob(x).sentiment[1])
    )
    return tweet_df

In [21]:
def subjectivity(tweet_df, col='text'):
    tweet_df['subjectivity'] = (
            tweet_df[col].apply(
                lambda x: TextBlob(x).sentiment[1])
    )
    return tweet_df

In [22]:
def tweet_feature_engineering_cleaned(tweet_df, tweet_col='text'):
    tweet_new_features = tweet_df.copy()
    tweet_new_features = sentence_count(tweet_new_features, col=tweet_col)
    tweet_new_features = word_count(tweet_new_features, col=tweet_col)
    tweet_new_features = stop_word_count(tweet_new_features, col=tweet_col)
    tweet_new_features = stop_word_share(tweet_new_features, col=tweet_col)
    tweet_new_features = url_count(tweet_new_features, col=tweet_col)
    tweet_new_features = mean_word_length(tweet_new_features, col=tweet_col)
    tweet_new_features = char_count(tweet_new_features, col=tweet_col)
    tweet_new_features = punctuation_count(tweet_new_features, col=tweet_col)
    tweet_new_features = hashtag_count(tweet_new_features, col=tweet_col)
    tweet_new_features = mention_count(tweet_new_features, col=tweet_col)
    tweet_new_features = polarity_raw(tweet_new_features, col=tweet_col)
    tweet_new_features = subjectivity_raw(tweet_new_features, col=tweet_col)
    
    return tweet_new_features

In [23]:
def tweet_feature_engineering_preprocessed(tweet_df, tweet_col='preprocessed_text'):
    tweet_new_features = tweet_df.copy()
    tweet_new_features = unique_word_count(tweet_new_features, col=tweet_col)
    tweet_new_features = unique_word_share(tweet_new_features, col=tweet_col)
    tweet_new_features = polarity(tweet_new_features, col=tweet_col)
    tweet_new_features = subjectivity(tweet_new_features, col=tweet_col)
    
    return tweet_new_features

In [24]:
def tags_count(tweet_df, tweet_col='text_cleaned'):
    tweet_new_features = tweet_df.copy()
    text_tag = tweet_new_features[tweet_col].apply(lambda x: TextBlob(x).tags)
    tagList = [Counter(pair[1] for pair in row).most_common() for row in text_tag]
    for index, row_tag in enumerate(tagList):
        for key, value in row_tag:
            tweet_new_features.loc[index,key] = value

    tweet_new_features.fillna(0, inplace = True)
    
    return tweet_new_features

In [25]:
train_df_new = tweet_feature_engineering_cleaned(train_df, tweet_col='text')
train_df_new = tweet_feature_engineering_preprocessed(train_df_new, tweet_col='preprocessed_text')
train_df_new = tags_count(train_df_new, tweet_col='text_cleaned')

In [26]:
train_df_new.head(5)

Unnamed: 0,text,target,text_cleaned,preprocessed_text,sentence_count,word_count,stop_word_count,stop_word_share,url_count,mean_word_length,char_count,punctuation_count,hashtag_count,mention_count,polarity_raw,subjectivity_raw,unique_word_count,unique_word_share,polarity,subjectivity,DT,NN,PRP$,NNS,VBP,IN,MD,VB,JJ,PRP,JJS,VBN,VBD,TO,VBG,CC,RB,EX,VBZ,CD,WP,RP,JJR,WRB,FW,WDT,NNP,RBR,RBS,PDT,SYM,UH,WP$
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,our deeds are the reason of this earthquake may allah forgive us all,deed reason earthquake allah forgive,1,13,6,0.461538,0,4.384615,69,1,1,0,0.0,0.0,5,1.0,0.0,0.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,forest fire near la ronge sask canada,1,7,0,0.0,0,4.571429,38,1,0,0,0.1,0.4,7,1.0,0.1,0.4,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected,resident ask shelter place notify officer evacuation shelter place order expect,1,22,11,0.5,0,5.090909,133,3,0,0,-0.01875,0.3875,9,0.818182,0.0,0.0,2.0,4.0,0.0,3.0,2.0,3.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"13,000 people receive #wildfires evacuation orders in California",1,people receive wildfires evacuation orders in california,people receive wildfire evacuation order california,1,8,1,0.125,0,7.125,65,2,1,0,0.0,0.0,6,1.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,just got sent this photo from ruby alaska as smoke from wildfires pours into a school,get send photo ruby alaska smoke wildfire pour school,1,16,7,0.4375,0,4.5,88,2,2,0,0.0,0.0,9,1.0,0.0,0.0,2.0,4.0,0.0,1.0,1.0,4.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
train_df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7520 entries, 0 to 7519
Data columns (total 53 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   text               7520 non-null   object 
 1   target             7520 non-null   int64  
 2   text_cleaned       7520 non-null   object 
 3   preprocessed_text  7520 non-null   object 
 4   sentence_count     7520 non-null   int64  
 5   word_count         7520 non-null   int64  
 6   stop_word_count    7520 non-null   int64  
 7   stop_word_share    7520 non-null   float64
 8   url_count          7520 non-null   int64  
 9   mean_word_length   7520 non-null   float64
 10  char_count         7520 non-null   int64  
 11  punctuation_count  7520 non-null   int64  
 12  hashtag_count      7520 non-null   int64  
 13  mention_count      7520 non-null   int64  
 14  polarity_raw       7520 non-null   float64
 15  subjectivity_raw   7520 non-null   float64
 16  unique_word_count  7520 

In [28]:
feature_analysis = list(train_df_new)[4:]
feature_analysis

['sentence_count',
 'word_count',
 'stop_word_count',
 'stop_word_share',
 'url_count',
 'mean_word_length',
 'char_count',
 'punctuation_count',
 'hashtag_count',
 'mention_count',
 'polarity_raw',
 'subjectivity_raw',
 'unique_word_count',
 'unique_word_share',
 'polarity',
 'subjectivity',
 'DT',
 'NN',
 'PRP$',
 'NNS',
 'VBP',
 'IN',
 'MD',
 'VB',
 'JJ',
 'PRP',
 'JJS',
 'VBN',
 'VBD',
 'TO',
 'VBG',
 'CC',
 'RB',
 'EX',
 'VBZ',
 'CD',
 'WP',
 'RP',
 'JJR',
 'WRB',
 'FW',
 'WDT',
 'NNP',
 'RBR',
 'RBS',
 'PDT',
 'SYM',
 'UH',
 'WP$']

### Write csv file with new features

In [29]:
TRAIN_NEW = os.path.join(DATA_PATH, 'train_new_features.csv')
df_train = train_df_new
df_train.to_csv(TRAIN_NEW, header=True)