In [1]:
# General library imports (data handling, HTTP requests and HTML parsing)
import pandas as pd
import numpy as np
from urllib.request import urlopen, HTTPError, URLError
from urllib.parse import urlparse
import re
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from bs4 import BeautifulSoup
import random

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs

import warnings
warnings.filterwarnings('ignore')

In [2]:
def is_not_nan(value):
    """Return the result of comparing ``value`` with itself.

    A float NaN is not equal to itself, so it yields False; ordinary values
    yield True.
    """
    equals_itself = value == value
    return equals_itself

In [3]:
twt_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/train.csv')

In [4]:
twt_data.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [5]:
twt_data.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.93,0.43
std,3137.12,0.5
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [6]:
twt_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 208.2+ KB


In [7]:
def obtain_link(twt):
    """Return the first http(s) URL found in a tweet.

    Parameters
    ----------
    twt : str
        Text of the tweet. Any non-string value (e.g. a NaN coming from a
        missing cell) is treated as "no link".

    Returns
    -------
    str or float
        The first substring that starts with ``http://`` or ``https://`` and
        runs up to the next whitespace, or ``np.nan`` when there is none.
    """
    if not isinstance(twt, str):
        return np.nan
    # Raw string: "\s" is not a valid escape sequence in a normal string literal.
    url = re.search(r"(?P<url>https?://[^\s]+)", twt)
    # np.nan (lowercase) is the spelling that exists in every NumPy version.
    return url.group("url") if url else np.nan

In [8]:
twt_data['link'] = twt_data['text'].map(obtain_link)
twt_data

Unnamed: 0,id,keyword,location,text,target,link
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,
1,4,,,Forest fire near La Ronge Sask. Canada,1,
2,5,,,All residents asked to 'shelter in place' are ...,1,
3,6,,,"13,000 people receive #wildfires evacuation or...",1,
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,http://t.co/STfMbbZFB5
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,http://t.co/zDtoyd8EbJ
7611,10872,,,Police investigating after an e-bike collided ...,1,


In [9]:
twt_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
 5   link      3971 non-null   object
dtypes: int64(2), object(4)
memory usage: 238.0+ KB


In [10]:
def isShorted(link):
    """Tell whether ``link`` points at the ``t.co`` URL shortener.

    Parameters
    ----------
    link : str or float
        The link to inspect, or NaN when the tweet has no link.

    Returns
    -------
    bool or float
        ``np.nan`` when ``link`` is NaN; otherwise True only when the host of
        the link is exactly ``t.co``.
    """
    if link != link:  # NaN is not equal to itself
        return np.nan
    text = str(link)
    if '://' not in text:
        # Without a scheme urlparse would read the host as part of the path.
        text = '//' + text
    try:
        host = urlparse(text).hostname
    except ValueError:
        # urlparse rejects malformed bracketed hosts; such a link is not t.co.
        return False
    # Compare the host itself: a substring test would also accept hosts such
    # as "reddit.com" or "about.com", which merely contain "t.co".
    return host == 't.co'

In [11]:
twt_data['link_shorted'] = twt_data['link'].map(isShorted)


## A continuación trabajaremos exclusivamente con los tweets que poseen links

In [12]:
# Keep only the tweets that carry a link and, among those, the ones flagged
# as shortened links.
twt_data_with_links = twt_data.dropna(subset=['link'])
# .copy() makes the filtered result an independent frame, so columns assigned
# to it afterwards are not written to a slice of twt_data.
twt_data_with_links = twt_data_with_links[twt_data_with_links['link_shorted']].copy()
twt_data_with_links

Unnamed: 0,id,keyword,location,text,target,link,link_shorted
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,http://t.co/lHYXEOHY6C,True
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,http://t.co/YAo1e0xngw,True
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,http://t.co/2nndBGwyEi,True
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,http://t.co/qqsmshaJ3N,True
37,55,ablaze,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...,1,http://t.co/3ImaomknnA,True
...,...,...,...,...,...,...,...
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1,http://t.co/nF4IculOje,True
7607,10867,,,#stormchase Violent Record Breaking EF-5 El Re...,1,http://t.co/3SICroAaNz,True
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,http://t.co/STfMbbZFB5,True
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,http://t.co/zDtoyd8EbJ,True


In [13]:
twt_data_with_links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3968 entries, 31 to 7612
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            3968 non-null   int64 
 1   keyword       3948 non-null   object
 2   location      2714 non-null   object
 3   text          3968 non-null   object
 4   target        3968 non-null   int64 
 5   link          3968 non-null   object
 6   link_shorted  3968 non-null   object
dtypes: int64(2), object(5)
memory usage: 170.5+ KB


In [14]:
# Add the three result columns; each one starts out holding the literal
# string 'np.nan' in every row.
for placeholder_column in ('real_link', 'error', 'page'):
    twt_data_with_links[placeholder_column] = 'np.nan'
# From here on the tweet id is the index of the frame.
twt_data_with_links = twt_data_with_links.set_index('id')
twt_data_with_links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3968 entries, 48 to 10873
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   keyword       3948 non-null   object
 1   location      2714 non-null   object
 2   text          3968 non-null   object
 3   target        3968 non-null   int64 
 4   link          3968 non-null   object
 5   link_shorted  3968 non-null   object
 6   real_link     3968 non-null   object
 7   error         3968 non-null   object
 8   page          3968 non-null   object
dtypes: int64(1), object(8)
memory usage: 186.0+ KB


In [15]:
def obtain_real_link(index, url):
    """Open ``url``, follow its redirects and download the page it leads to.

    Parameters
    ----------
    index : object
        Identifier of the row the URL belongs to; handed back untouched so the
        caller can match the result to its row.
    url : str
        The URL to open (20 second timeout).

    Returns
    -------
    tuple
        ``(index, actual_url, error, page)`` where

        * ``actual_url`` is the URL reported by the response, the URL carried
          by the ``HTTPError``, or the input ``url`` for any other failure;
        * ``error`` is ``0`` on success, the ``HTTPError`` / ``URLError``
          instance, or the string ``'OTHER_ERROR'``;
        * ``page`` is the raw body on success and ``0`` otherwise.
    """
    try:
        with urlopen(url, timeout = 20) as response:
            actual_url = response.geturl()
            error = 0
            page = response.read()
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        actual_url = e.url
        error = e
        page = 0
    except URLError as e:
        actual_url = url
        error = e
        page = 0
    except Exception:
        # Any other failure (malformed URL, socket timeout, ...). Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        actual_url = url
        error = 'OTHER_ERROR'
        page = 0
    return (index, actual_url, error, page)

In [16]:
def doWork(row):
    """Resolve the link of one ``(label, row)`` pair.

    ``row[0]`` is forwarded as the identifier and ``row[1]['link']`` as the
    URL; the tuple produced by ``obtain_real_link`` is returned as is.
    """
    row_label = row[0]
    link = row[1]['link']
    return obtain_real_link(row_label, link)

In [17]:
# This piece of code fetches the real links and the pages in parallel
def requestPage(dataFrame):
    """Resolve the link of every row of ``dataFrame`` using a thread pool.

    Each row is handed to ``doWork``; the real link, error and page it returns
    are written into the module-level ``twt_data_with_links`` frame at the
    row's index label. When waiting for a result times out, the rows that are
    still pending are submitted again; on the third consecutive timeout the
    row being waited for is recorded as 'TIME OUT' and skipped.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Rows to process. ``doWork`` reads their 'link' column, and their index
        labels are used to address rows of ``twt_data_with_links``.

    Returns
    -------
    ThreadPoolExecutor
        The executor that was used. Its ``with`` block has already exited by
        the time the caller receives it.
    """
    init = 0       # position in dataFrame from which rows are (re)submitted
    aux_iter = 0   # number of rows already resolved or given up on
    retry = 0      # consecutive timeouts while waiting for the current row
    with ThreadPoolExecutor(max_workers=50) as pool:
        while True:
            try:
                for result in pool.map(doWork, dataFrame.iloc[init:].iterrows(), timeout=110):
                    index = result[0]
                    # .at writes straight into the frame; a chained
                    # frame[col][index] = value may only modify a temporary.
                    twt_data_with_links.at[index, 'real_link'] = result[1]
                    twt_data_with_links.at[index, 'error'] = result[2]
                    twt_data_with_links.at[index, 'page'] = result[3]
                    retry = 0
                    aux_iter = aux_iter + 1
                break
            except TimeoutError:
                if (retry == 2):
                    # Third timeout in a row: give up on the row being waited for.
                    index = dataFrame.index[aux_iter]
                    print('TIME OUT:', aux_iter, index)
                    twt_data_with_links.at[index, 'real_link'] = 0
                    twt_data_with_links.at[index, 'error'] = 'TIME OUT'
                    twt_data_with_links.at[index, 'page'] = 0
                    # Reset the counter so the next row gets its own retries
                    # (this used to be the no-op comparison `retry == 0`).
                    retry = 0
                    aux_iter = aux_iter + 1
                    init = aux_iter
                else:
                    retry = retry + 1
                    init = aux_iter
            except KeyboardInterrupt:
                print('Terminado por el usuario')
                # Stop instead of going around the while loop again.
                break
        return pool

In [None]:
# Run a bounded number of download passes. Each pass only requests the rows
# whose 'error' is not 0, i.e. the rows not yet downloaded successfully.
# The previous `while iteraciones < 5` never incremented its counter and so
# never finished.
MAX_ITERACIONES = 5
for iteraciones in range(MAX_ITERACIONES):
    pendientes = twt_data_with_links.loc[twt_data_with_links['error'] != 0]
    if pendientes.empty:
        # Nothing left to request.
        break
    requestPage(pendientes).shutdown(wait=True)

TIME OUT: 788 2314


In [None]:
twt_data_with_links.loc[twt_data_with_links['error'] != 0]

In [None]:
# Manual request of a single shortened link (20 second timeout).
# Note: this assigns the module-level names actual_url, error and page.
with urlopen('http://t.co/FWqfCKNCQW', timeout = 20) as response:
    actual_url = response.geturl()  # URL reported by the response
    error = 0
    page = response.read()  # raw body of the response

In [None]:
# HTTP status code of the rows whose error is an HTTPError; NaN for the rest.
# HTTPError.code is the documented attribute for the status, and np.nan is the
# spelling available in every NumPy version.
twt_data_with_links['error_code'] = twt_data_with_links['error'].map(
    lambda x: x.code if isinstance(x, HTTPError) else np.nan
)

In [None]:
twt_data_with_links.info()

In [None]:
# Host of the resolved link. requestPage stores 0 in 'real_link' for rows that
# timed out; urlparse handles a non-str argument as bytes-like input and would
# not produce a str netloc for it, so those rows get an empty site instead.
twt_data_with_links['site'] = twt_data_with_links['real_link'].map(
    lambda x: urlparse(x).netloc if isinstance(x, str) else ''
)
twt_data_with_links

In [None]:
# 'for_graphics' is a constant 1, so its sum per site is the number of rows.
twt_data_with_links['for_graphics'] = 1
# numeric_only=True: the frame also holds object columns (text, links, error
# objects, page bytes) that cannot be summed.
grouped_by_sites = twt_data_with_links.groupby('site').sum(numeric_only=True)
grouped_by_sites

### Dejaremos afuera el sitio t.co y las líneas que tienen como sitio twitter.com y cuya respuesta fue un error

In [None]:
def filter_twitter_error(row):
    """Decide whether a row is kept.

    Rows whose ``site`` is 't.co' are always dropped. Rows whose ``site`` is
    'twitter.com' are kept only when ``error`` is NaN (not equal to itself).
    Every other row is kept.
    """
    if row.site == 't.co':
        return False
    if row.site != 'twitter.com':
        return True
    return row.error != row.error

In [None]:
# One boolean per row, in row order, telling whether the row passes
# filter_twitter_error; the list is then used as a row mask.
filter_list = [
    filter_twitter_error(row)
    for _, row in twt_data_with_links.iterrows()
]
twt_data_with_links.loc[filter_list]

## A continuación nos quedaremos exclusivamente con las líneas sin errores para analizar las páginas

In [None]:
# Keep the rows whose 'error' value is not equal to itself (i.e. NaN).
error_column = twt_data_with_links['error']
twt_data_without_error = twt_data_with_links.loc[error_column != error_column]
twt_data_without_error

In [None]:
def get_the_header(page):
    """Return the text of the first ``<h1>`` of an HTML page.

    Parameters
    ----------
    page : str or bytes
        HTML document, parsed with the "lxml" parser.

    Returns
    -------
    str
        The string of the first ``<h1>``; ``''`` when the page has no ``<h1>``
        or when that ``<h1>`` has no single string (``.string`` is None).
        Previously the latter case produced the literal text 'None'.
    """
    soup = BeautifulSoup(page, "lxml")
    try:
        h1 = soup.h1
        text = h1.string if h1 is not None else None
        # Convert to a plain str before the tree is destroyed.
        header = str(text) if text is not None else ''
    finally:
        # Release the parse tree even if something above fails.
        soup.decompose()
    return header

In [None]:
# Load the saved frame that already has a 'header' column and drop the two
# 'Unnamed' columns (presumably index columns written by earlier saves).
unnamed_columns = ['Unnamed: 0', 'Unnamed: 0.1']
twt_data_without_error = pd.read_csv('../../../DataSets/twt_with_header.csv')
twt_data_without_error = twt_data_without_error.drop(columns=unnamed_columns)
twt_data_without_error

In [None]:
twt_data_without_error.info()

## A continuación nos quedaremos exclusivamente con las páginas que nos devolvieron un header, para analizarlo

In [None]:
# Keep the rows whose header is neither the text 'None', nor empty, nor NaN
# (NaN is the value that is not equal to itself).
header_column = twt_data_without_error["header"]
has_header = (
    (header_column != 'None')
    & (header_column != '')
    & (header_column == header_column)
)
twt_data_without_error = twt_data_without_error.loc[has_header]
twt_data_without_error

In [None]:
# word repetition: normalise the headers before counting words
def _clean_header(text):
    """Apply the clean-up substitutions to one lower-cased header, in order."""
    substitutions = (
        (r'[^\w]', ' '),       # every non-alphanumeric character
        (r'_', ' '),           # underscores, which \w lets through
        (r'[0-9]', ' '),       # digits
        (r'\b\w{1}\b', ''),    # stand-alone letters
        (' +', ' '),           # runs of spaces
        (r'\b\w{1,2}\b', ''),  # short words
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Lower-case and trim first, then run the substitutions on every header.
twt_data_without_error["header"] = (
    twt_data_without_error["header"].str.lower().str.strip().apply(_clean_header)
)
twt_data_without_error

In [None]:
twt_data_without_error['header_splited'] = twt_data_without_error['header'].str.split(' ')
twt_data_without_error

In [None]:
list_stopwords = ["the","and","you","for","that","with","this","from","are","have","was","like","but","just","not","your",\
                 "all","after","will","can","has","when","they","get","new","now","what","amp","","http","https","out",\
                 "more","about","how","there","don","over","into","than","would","been","her","who","were","some","still",\
                 "his","back","why","them","got","know","had","see","going","our","htt","via","nws","wait",\
                 "hwy","top","both","hah","haha","getting","far","near", "video", "unavailable", "cuenta", "suspendida"]
def count_words(word, count):
    if(word in list_stopwords):
        return
    if (word in count):
        count[word] = count[word] + 1
        return
    count[word] = 1

In [None]:
# Count every non-stop word over the headers of all rows.
total_word_count = {}
for header_words in twt_data_without_error['header_splited']:
    for word in header_words:
        count_words(word, total_word_count)
total_word_count

In [None]:
# Count every non-stop word over the headers of the rows with target == 1.
disaster_word_count = {}
disaster_headers = twt_data_without_error.loc[twt_data_without_error['target'] == 1, 'header_splited']
for header_words in disaster_headers:
    if header_words != None:
        for word in header_words:
            count_words(word, disaster_word_count)
disaster_word_count

In [None]:
# Count every non-stop word over the headers of the rows with target == 0.
normal_word_count = {}
normal_headers = twt_data_without_error.loc[twt_data_without_error['target'] == 0, 'header_splited']
for header_words in normal_headers:
    if header_words != None:
        for word in header_words:
            count_words(word, normal_word_count)
normal_word_count

In [None]:
def _word_count_frame(word_count):
    """Turn a {word: count} dict into a frame with 'word' and 'count' columns."""
    pairs = list(word_count.items())
    return pd.DataFrame(pairs, index=range(len(pairs)), columns=['word', 'count'])

# One frame per dictionary: all headers, disaster headers, normal headers.
total_header_words = _word_count_frame(total_word_count)
disaster_header_words = _word_count_frame(disaster_word_count)
normal_header_words = _word_count_frame(normal_word_count)

In [None]:
def disaster_color_fun(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return an HSL colour string with hue 22 and lightness 48%.

    The saturation is ``100 * 10 ** (-4 / font_size)`` truncated to an
    integer, so it grows with ``font_size``. Only ``font_size`` is used; the
    remaining arguments are accepted and ignored.
    """
    saturation = 100 * 10 ** (-4 / font_size)
    return "hsl(22, %d%%, 48%%)" % saturation

In [None]:
def normal_color_fun(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return an HSL colour string with hue 166 and lightness 35%.

    The saturation is ``100 * 10 ** (-4 / font_size)`` truncated to an
    integer, so it grows with ``font_size``. Only ``font_size`` is used; the
    remaining arguments are accepted and ignored.
    """
    saturation = 100 * 10 ** (-4 / font_size)
    return "hsl(166, %d%%, 35%%)" % saturation

In [None]:
normal_color

In [None]:
twt_data_with_links

In [None]:
twt_data_with_links_b.columns = [x+'_b' for x in twt_data_with_links_b.columns]

In [None]:
aux = twt_data_with_links_b.join(twt_data_with_links)[:][['id','error', 'id_b', 'error_b']]
aux.head()

In [None]:
aux['error'] = aux['error'].fillna(0)
aux['error_b'] = aux['error_b'].fillna(0)
aux.head()

In [None]:
twt_data_with_links_b.info()

In [None]:
twt_data_with_links.info()

In [None]:
aux[(aux['error'] == 0) & (aux['error_b'] != 0)]

In [None]:
twt_data_with_links.loc[23]['page']