In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk, os, re, string, collections
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from  imageio import imread
from nltk.util import ngrams
from nltk.corpus import stopwords
import string
import multidict as multidict #pip install multidict
import itertools
import csvtomd
import re
#%matplotlib inline

# 1. Limpieza y conversión de datos

In [3]:
# Load the test split; `id` is read as int32 and `keyword` as a categorical column.
dtype_test = {"id": np.int32, "keyword": "category"}
test = pd.read_csv("original_data/test.csv", dtype = dtype_test)

In [4]:
# Load the training split; `id` is read as int32, `keyword` as categorical and `target` as bool.
dtype_train = {"id": np.int32, "keyword": "category", "target" : bool}
train = pd.read_csv("original_data/train.csv", dtype = dtype_train, encoding='UTF_8')

In [5]:
# Keep a reference to the label column of the training split.
target_train = train.target

## 1.1 Keywords

In [6]:
# Module-level NLTK helpers read by lemmatize_text: a whitespace tokenizer and a WordNet lemmatizer.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer  = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Empty strings and null values (as detected by ``pd.isnull``) are returned
    unchanged. Otherwise the text is tokenized with the module-level
    ``w_tokenizer``, each token is passed through the module-level
    ``lemmatizer`` and the results are re-joined with single spaces.
    """
    is_missing = text == '' or pd.isnull(text)
    if is_missing:
        return text
    tokens = w_tokenizer.tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

def clean_keyword_field(df):
    """Normalize and back-fill the ``keyword`` column of ``df`` in place.

    Steps, in order:
      1. Replace the URL-encoded space ``%20`` with a real space.
      2. For every distinct non-null keyword, assign it to rows whose keyword
         is still null and whose ``text`` contains it (case-insensitive).
         Keywords are visited in sorted order, so the first match wins.
      3. Lemmatize each keyword with ``lemmatize_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``keyword`` and ``text`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # '%20' is a literal token, not a pattern.
    df["keyword"] = df["keyword"].str.replace('%20', ' ', regex=False)
    known_keywords = np.unique(df["keyword"].dropna())
    for keyword in known_keywords:
        # Recomputed on every pass: rows filled by an earlier keyword must not be overwritten.
        still_missing = df['keyword'].isnull()
        # regex=False: keywords are plain phrases and must never be interpreted as patterns.
        # na=False: a null text simply does not match instead of propagating NaN into the mask.
        mentions_keyword = df['text'].str.contains(keyword, case=False, regex=False, na=False)
        df.loc[still_missing & mentions_keyword, 'keyword'] = keyword
    df['keyword'] = df['keyword'].apply(lemmatize_text)
    return df

## 1.2 Text

### Strict (for tags)

In [7]:
def getStopwordsList(fileNamesList=None):
    """Merge several one-entry-per-line stop-word files into one de-duplicated list.

    Parameters
    ----------
    fileNamesList : iterable of str, optional
        Paths of the files to read. When omitted, the bundled ``texts/*.txt``
        collection is used.

    Returns
    -------
    list of str
        Each distinct line (stripped of surrounding whitespace) in order of
        first appearance across the files. A blank line contributes a single
        empty-string entry, as each distinct stripped line is kept once.

    Raises
    ------
    OSError
        If any of the files cannot be opened.
    """
    if fileNamesList is None:
        # Files listed more than once are read only once; a repeat cannot add new entries.
        fileNamesList = [
            "texts/99webtools.txt", "texts/atire_ncbi.txt", "texts/atire_puurula.txt", "texts/azure.txt",
            "texts/bbalet.txt",
            "texts/bow_short.txt", "texts/choi_2000naacl.txt", "texts/cook1988_function_words.txt",
            "texts/corenlp_acronym.txt",
            "texts/corenlp_hardcoded.txt", "texts/corenlp_stopwords.txt", "texts/datasciencedojo.txt",
            "texts/deeplearning4j.txt",
            "texts/dkpro.txt", "texts/mongodb.txt", "texts/galago_inquery.txt", "texts/gate_keyphrase.txt",
            "texts/gensim.txt",
            "texts/glasgow_stop_words.txt", "texts/indri.txt", "texts/kevinbouge.txt", "texts/lexisnexis.txt",
            "texts/lingpipe.txt",
            "texts/mallet.txt", "texts/mysql_innodb.txt", "texts/mysql_myisam.txt", "texts/galago_rmstop.txt",
            "texts/nltk.txt", "texts/okapiframework.txt", "texts/okapi_cacm_expanded.txt",
            "texts/onix.txt",
            "texts/ovid.txt", "texts/postgresql.txt", "texts/pubmed.txt", "texts/quanteda.txt", "texts/r_tm.txt",
            "texts/ranksnl_large.txt",
            "texts/reuters_wos.txt", "texts/rouge_155.txt", "texts/scikitlearn.txt", "texts/smart.txt",
            "texts/snowball_expanded.txt",
            "texts/spacy.txt", "texts/spark_mllib.txt", "texts/sphinx_mirasvit.txt", "texts/t101_minimal.txt",
            "texts/taporware.txt",
            "texts/terrier.txt", "texts/tonybsk_1.txt", "texts/tonybsk_6.txt", "texts/voyant_taporware.txt",
            "texts/weka.txt",
            "texts/xapian.txt", "texts/xpo6.txt", "texts/zettair.txt",
        ]
    stopwordsList = []
    # A set gives O(1) membership checks; the list preserves first-seen order.
    seen = set()
    for fileName in fileNamesList:
        # `with` closes the handle even if reading raises.
        with open(fileName, "r") as file:
            for line in file:
                entry = line.strip()
                if entry not in seen:
                    seen.add(entry)
                    stopwordsList.append(entry)
    return stopwordsList

# Build the merged stop-word list once at module level and print how many entries it holds.
stopwordsList = getStopwordsList()

print(len(stopwordsList))

2253


In [8]:
def clean_text_strict(text):
    """Aggressively normalize a tweet: lowercase it and strip URLs, @mentions, digits and punctuation.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The lowercased text with URLs and @mentions replaced by a space,
        digit runs removed, and every entry of ``invalid_chars`` replaced by a
        space. Runs of spaces are not collapsed.
    """
    text = text.lower()
    # URLs and @mentions must be removed first: the character replacements below destroy
    # the '@', 'http', ':' and '/' markers this pattern relies on.
    text = re.sub(r"(?:\@|https?\://)\S+", " ", text)
    # str.replace is literal and cannot match digits; a regex substitution is required.
    text = re.sub(r"\d+", "", text)
    invalid_chars = ['#','|','@','!','?','-','_','[',']','%','&',':','.',',',"''",'/','https','(','//t',')','http',
                 ';','\'']
    for char in invalid_chars:
        text = text.replace(char, ' ')
    return text

### Non strict (for special characters, hashtags, etc)

In [9]:
def clean_text_non_strict(text):
    """Lightly clean a tweet, keeping hashtags, mentions and punctuation.

    Every character outside ``string.printable`` is dropped and newlines are
    turned into spaces. The text is split on single spaces and re-joined with
    single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text, wrapped in one leading and one trailing space.
    """
    # Built once per call rather than once per character.
    printable = frozenset(string.printable)
    cleaned_words = []
    for word in text.split(' '):
        # Non-printable characters (e.g. 'û', 'Û') are removed here, so no separate replace is needed.
        word = ''.join(ch for ch in word if ch in printable)
        cleaned_words.append(word.replace("\n", " "))
    return " " + " ".join(cleaned_words) + " "

## 1.3 Location

# 2. Features

## 2.1 Keywords

In [10]:
# clean_keyword_field assigns into the frame it receives and returns it, so `df_train` and `train` are the same object.
df_train = clean_keyword_field(train)

In [11]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,earthquake,,Our Deeds are the Reason of this #earthquake M...,True
1,4,fire,,Forest fire near La Ronge Sask. Canada,True
2,5,evacuation,,All residents asked to 'shelter in place' are ...,True
3,6,evacuation,,"13,000 people receive #wildfires evacuation or...",True
4,7,fire,,Just got sent this photo from Ruby #Alaska as ...,True


In [12]:
def toText(listOfWords):
    """Join an iterable of strings into one string separated by single spaces."""
    return ' '.join(listOfWords)

def getFrequencyDictForText(sentence):
    """Count case-insensitive word frequencies in ``sentence``, skipping a few function words.

    Parameters
    ----------
    sentence : str
        Text whose words are separated by single spaces.

    Returns
    -------
    multidict.MultiDict
        Maps each lowercased word to its number of occurrences, in order of
        first appearance. Empty tokens (from consecutive spaces) and the words
        in ``ignored_words`` are excluded.
    """
    # Whole-word comparison: a prefix match would also discard words such as "army" or "bomb".
    ignored_words = frozenset((
        "a", "the", "an", "to", "in", "for", "of", "or", "by", "with", "is", "on", "that", "be",
    ))
    counts = {}
    for token in sentence.split(" "):
        # Read and write with the same lowercased key so "Fire" and "fire" accumulate together.
        word = token.lower()
        if not word or word in ignored_words:
            continue
        counts[word] = counts.get(word, 0) + 1

    fullTermsDict = multidict.MultiDict()
    for word, occurrences in counts.items():
        fullTermsDict.add(word, occurrences)
    return fullTermsDict

def textContainsKeyword(row):
    """Tell whether ``row.text`` contains ``row.keyword`` as a case-insensitive substring.

    Returns False when the keyword is an empty string or null.
    """
    keyword = row.keyword
    if keyword == '' or pd.isnull(keyword):
        return False
    return keyword.lower() in row.text.lower()
    
def getKeywords(df):
    """Return the sorted distinct non-null values of ``df['keyword']`` as a NumPy array."""
    return np.unique(df['keyword'].dropna())

def textContainsKeywordAsHashtag(row):
    """Tell whether ``row.keyword`` appears in ``row.text`` as a hashtag.

    The lowercased text is split on single spaces; every token longer than one
    character that starts with ``#`` counts as a hashtag (without the ``#``).
    The lowercased keyword must equal one of those hashtags exactly. Returns
    False when the keyword is an empty string or null.
    """
    tweet = row.text.lower()
    keyword = row.keyword
    if keyword == '' or pd.isnull(keyword):
        return False
    hashtags = [token[1:] for token in tweet.split(' ') if len(token) > 1 and token[0] == '#']
    return keyword.lower() in hashtags

In [13]:
# One-hot encode `keyword` into indicator columns prefixed with "keyword"; `id` is carried along unchanged.
all_keywords = pd.get_dummies(train[['id','keyword']],'keyword')
all_keywords

Unnamed: 0,id,keyword_ablaze,keyword_accident,keyword_aftershock,keyword_airplane accident,keyword_ambulance,keyword_annihilated,keyword_annihilation,keyword_apocalypse,keyword_armageddon,...,keyword_weapon,keyword_whirlwind,keyword_wild fire,keyword_wildfire,keyword_windstorm,keyword_wound,keyword_wounded,keyword_wreck,keyword_wreckage,keyword_wrecked
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7609,10870,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7610,10871,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7611,10872,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Work on a copy so the keyword feature columns added below do not end up in df_train.
df_keywords = df_train.copy()

In [15]:
#df_sin_keywords_nulos = df_keywords.dropna(subset=['keyword'])
#%time df_keywords_not_in_text = df_sin_keywords_nulos[~df_sin_keywords_nulos['text'].str.contains("|".join(df_sin_keywords_nulos['keyword']),case=False)]
#%time df_sin_keywords_nulos[df_sin_keywords_nulos.apply(lambda x: x.keyword in x.text, axis=1)]

In [16]:
# Row-wise flag: does the tweet text contain the row's own keyword (case-insensitive substring)?
df_keywords['text_contains_keyword'] = df_train.apply(textContainsKeyword,axis=1)
df_keywords.head()

Unnamed: 0,id,keyword,location,text,target,text_contains_keyword
0,1,earthquake,,Our Deeds are the Reason of this #earthquake M...,True,True
1,4,fire,,Forest fire near La Ronge Sask. Canada,True,True
2,5,evacuation,,All residents asked to 'shelter in place' are ...,True,True
3,6,evacuation,,"13,000 people receive #wildfires evacuation or...",True,True
4,7,fire,,Just got sent this photo from Ruby #Alaska as ...,True,True


In [17]:
# True where the row has a non-null keyword.
df_keywords['has_keyword'] = ~df_keywords.keyword.isnull()

In [18]:
# Distinct non-null keywords; this module-level array is read by countKeywords.
keywords = getKeywords(df_keywords)
keywords

array(['ablaze', 'accident', 'aftershock', 'airplane accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown up', 'body bag', 'body bagging', 'bomb', 'bombed',
       'bombing', 'bridge collapse', 'building burning',
       'building on fire', 'burned', 'burning', 'burning building',
       'bush fire', 'casualty', 'catastrophe', 'catastrophic',
       'chemical emergency', 'cliff fall', 'collapse', 'collapsed',
       'collide', 'collided', 'collision', 'crash', 'crashed', 'crush',
       'crushed', 'curfew', 'cyclone', 'damage', 'danger', 'dead',
       'death', 'debris', 'deluge', 'deluged', 'demolish', 'demolished',
       'demolition', 'derail', 'derailed', 'derailment', 'desolate',
       'desolation', 'destroy', 'destroyed', 'd

In [19]:
def countKeywords(tweet):
    """Count the tokens of ``tweet`` that match an entry of the module-level ``keywords``.

    The tweet is split on single spaces; each token has every non-letter
    character removed and is lowercased before the membership check, so only
    single-token keywords can match.
    """
    return sum(
        1
        for token in tweet.split(' ')
        if re.sub('[^a-zA-Z]+', '', token).lower() in keywords
    )

# Number of keyword-matching tokens in each tweet, as computed by countKeywords.
df_keywords['keywords_quantity'] = df_keywords.text.apply(countKeywords)
df_keywords

Unnamed: 0,id,keyword,location,text,target,text_contains_keyword,has_keyword,keywords_quantity
0,1,earthquake,,Our Deeds are the Reason of this #earthquake M...,True,True,True,1
1,4,fire,,Forest fire near La Ronge Sask. Canada,True,True,True,1
2,5,evacuation,,All residents asked to 'shelter in place' are ...,True,True,True,1
3,6,evacuation,,"13,000 people receive #wildfires evacuation or...",True,True,True,1
4,7,fire,,Just got sent this photo from Ruby #Alaska as ...,True,True,True,1
...,...,...,...,...,...,...,...,...
7608,10869,bridge collapse,,Two giant cranes holding a bridge collapse int...,True,True,True,1
7609,10870,fire,,@aria_ahrary @TheTawniest The out of control w...,True,True,True,0
7610,10871,volcano,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,True,True,True,1
7611,10872,collide,,Police investigating after an e-bike collided ...,True,True,True,2


In [20]:
# Keyword hits per tweet divided by the number of distinct keywords.
# Reuses the counts already stored in `keywords_quantity` instead of re-scanning every tweet.
df_keywords['keywords_mean'] = df_keywords['keywords_quantity'] / keywords.size
df_keywords

Unnamed: 0,id,keyword,location,text,target,text_contains_keyword,has_keyword,keywords_quantity,keywords_mean
0,1,earthquake,,Our Deeds are the Reason of this #earthquake M...,True,True,True,1,0.004739
1,4,fire,,Forest fire near La Ronge Sask. Canada,True,True,True,1,0.004739
2,5,evacuation,,All residents asked to 'shelter in place' are ...,True,True,True,1,0.004739
3,6,evacuation,,"13,000 people receive #wildfires evacuation or...",True,True,True,1,0.004739
4,7,fire,,Just got sent this photo from Ruby #Alaska as ...,True,True,True,1,0.004739
...,...,...,...,...,...,...,...,...,...
7608,10869,bridge collapse,,Two giant cranes holding a bridge collapse int...,True,True,True,1,0.004739
7609,10870,fire,,@aria_ahrary @TheTawniest The out of control w...,True,True,True,0,0.000000
7610,10871,volcano,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,True,True,True,1,0.004739
7611,10872,collide,,Police investigating after an e-bike collided ...,True,True,True,2,0.009479


In [21]:
# Row-wise flag: does the row's keyword appear in the tweet as a hashtag (see textContainsKeywordAsHashtag)?
df_keywords['keyword_is_hashtag'] = df_train.apply(textContainsKeywordAsHashtag,axis=1)
df_keywords

Unnamed: 0,id,keyword,location,text,target,text_contains_keyword,has_keyword,keywords_quantity,keywords_mean,keyword_is_hashtag
0,1,earthquake,,Our Deeds are the Reason of this #earthquake M...,True,True,True,1,0.004739,True
1,4,fire,,Forest fire near La Ronge Sask. Canada,True,True,True,1,0.004739,False
2,5,evacuation,,All residents asked to 'shelter in place' are ...,True,True,True,1,0.004739,False
3,6,evacuation,,"13,000 people receive #wildfires evacuation or...",True,True,True,1,0.004739,False
4,7,fire,,Just got sent this photo from Ruby #Alaska as ...,True,True,True,1,0.004739,False
...,...,...,...,...,...,...,...,...,...,...
7608,10869,bridge collapse,,Two giant cranes holding a bridge collapse int...,True,True,True,1,0.004739,False
7609,10870,fire,,@aria_ahrary @TheTawniest The out of control w...,True,True,True,0,0.000000,False
7610,10871,volcano,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,True,True,True,1,0.004739,False
7611,10872,collide,,Police investigating after an e-bike collided ...,True,True,True,2,0.009479,False


In [22]:
# Compute how often each keyword occurs, to use as a 'weight':
# the number of rows sharing the row's keyword, divided by the number of distinct keywords.
df_keywords['keyword_frequency'] = df_keywords.groupby('keyword')['id'].transform('count') / keywords.size
df_keywords

Unnamed: 0,id,keyword,location,text,target,text_contains_keyword,has_keyword,keywords_quantity,keywords_mean,keyword_is_hashtag,keyword_frequency
0,1,earthquake,,Our Deeds are the Reason of this #earthquake M...,True,True,True,1,0.004739,True,0.199052
1,4,fire,,Forest fire near La Ronge Sask. Canada,True,True,True,1,0.004739,False,0.208531
2,5,evacuation,,All residents asked to 'shelter in place' are ...,True,True,True,1,0.004739,False,0.184834
3,6,evacuation,,"13,000 people receive #wildfires evacuation or...",True,True,True,1,0.004739,False,0.184834
4,7,fire,,Just got sent this photo from Ruby #Alaska as ...,True,True,True,1,0.004739,False,0.208531
...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,bridge collapse,,Two giant cranes holding a bridge collapse int...,True,True,True,1,0.004739,False,0.170616
7609,10870,fire,,@aria_ahrary @TheTawniest The out of control w...,True,True,True,0,0.000000,False,0.208531
7610,10871,volcano,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,True,True,True,1,0.004739,False,0.132701
7611,10872,collide,,Police investigating after an e-bike collided ...,True,True,True,2,0.009479,False,0.170616


In [23]:
#corrmap = df_keywords.corr()
#fig = plt.figure(figsize=(20,10))

#sns.heatmap(corrmap, vmax=.8, square=True)
#plt.show()

## 2.2 Text

In [24]:
# Helper functions
# NLTK English stop-word list; read as a module-level global by stop().
stopword = stopwords.words('english')

def stop(text):
    """Return the whitespace-separated tokens of ``text`` that appear in the module-level ``stopword`` list.

    Order is preserved and repeated tokens are kept; the comparison is case-sensitive.
    """
    return list(filter(lambda token: token in stopword, text.split()))

def length(text):
    """Return the mean length of the whitespace-separated tokens of ``text`` (via ``np.mean``)."""
    token_lengths = list(map(len, text.split()))
    return np.mean(token_lengths)

def punct(text):
    """Return, in order, every character of ``text`` that belongs to ``string.punctuation``."""
    marks = []
    for ch in text:
        if ch in string.punctuation:
            marks.append(ch)
    return marks

def title(text):
    """Return the whitespace-separated tokens of ``text`` for which ``str.istitle`` is true."""
    return list(filter(str.istitle, text.split()))

def upper_list(text):
    """Return the whitespace-separated tokens of ``text`` for which ``str.isupper`` is true."""
    return list(filter(str.isupper, text.split()))

def lower_list(text):
    """Return the whitespace-separated tokens of ``text`` for which ``str.islower`` is true."""
    return list(filter(str.islower, text.split()))

def syllables(text):
    """Estimate the total number of syllables in ``text`` with a vowel-group heuristic.

    For each word: count one syllable per vowel (``aeiouy``) that starts the
    word or follows a non-vowel, subtract one for a trailing ``e``, add one
    back for a trailing ``le``, and count at least one syllable per word.

    Parameters
    ----------
    text : str or iterable of str
        A string is split on whitespace into words; any other iterable is
        treated as a sequence of words.

    Returns
    -------
    int
        Sum of the per-word estimates; 0 when there are no words.
    """
    vowels = 'aeiouy'
    # Iterating a str directly yields single characters, so it has to be split into words first.
    words = text.split() if isinstance(text, str) else text
    total = 0
    for word in words:
        word = word.lower()
        if not word:
            continue
        count = 1 if word[0] in vowels else 0
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith('e'):
            count -= 1
        if word.endswith('le'):
            count += 1
        # The one-syllable floor applies to each word, not to the running total.
        total += max(count, 1)
    return total

def quitar_link_twitter(tweet):
    """Remove links from a tweet.

    The tweet is split on single spaces; every token containing ``http`` or
    ``https`` is dropped and the remaining tokens are re-joined with single spaces.
    """
    kept = [token for token in tweet.split(" ") if not ("http" in token or "https" in token)]
    return " ".join(kept)

def quitar_mencion_twitter(tweet):
    """Remove mentions from a tweet.

    The tweet is split on single spaces; every token containing ``@`` is
    dropped and the remaining tokens are re-joined with single spaces.
    """
    kept = [token for token in tweet.split(" ") if "@" not in token]
    return " ".join(kept)

def agregar_features_cantidad_palabras(df_og):
    """Strip links and mentions from ``df_og['text']`` and add word-level count features.

    The frame is modified in place: ``text`` is overwritten with the cleaned
    tweet, then one column per feature below is added, each computed from the
    cleaned text. The same frame is returned.
    """
    # The caller's `text` column is replaced: links first, then mentions.
    df_og['text'] = df_og['text'].apply(quitar_link_twitter).apply(quitar_mencion_twitter)

    feature_builders = [
        # Number of words used
        ('#palabras', lambda tweet: len(tweet.split())),
        # Number of unique words
        ('#palabras_unicas', lambda tweet: len(set(tweet.split()))),
        # Number of characters
        ('#caracteres', len),
        # Number of stopwords
        ('#stopwords', lambda tweet: len(stop(tweet))),
        # Number of punctuation characters
        ('#putuacion', lambda tweet: len(punct(tweet))),
        # Number of Capitalized words
        ('#capitalize', lambda tweet: len(title(tweet))),
        # Number of UPPERCASE words
        ('#mayusculas', lambda tweet: len(upper_list(tweet))),
        # Number of syllables
        ('#silabas', syllables),
        # Average word length in the tweet
        ('promedio_len_word', length),
    ]
    for column, builder in feature_builders:
        df_og[column] = df_og['text'].apply(builder)
    return df_og