In [107]:
# importacion general de librerias y de visualizacion (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen, HTTPError, URLError
from urllib.parse import urlparse
import re
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from bs4 import BeautifulSoup




%matplotlib inline

plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # seteando tipo de grid en seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs

import warnings
warnings.filterwarnings('ignore')

In [2]:
twt_data = pd.read_csv('../DataSets/twt_train.csv')

In [3]:
twt_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
twt_data.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.93,0.43
std,3137.12,0.5
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [5]:
twt_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 208.2+ KB


In [6]:
def obtain_link(twt):
    url = re.search("(?P<url>https?://[^\s]+)", twt)
    return url.group("url") if url else np.NaN  

In [7]:
twt_data['link'] = twt_data['text'].map(obtain_link)
twt_data

Unnamed: 0,id,keyword,location,text,target,link
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,
1,4,,,Forest fire near La Ronge Sask. Canada,1,
2,5,,,All residents asked to 'shelter in place' are ...,1,
3,6,,,"13,000 people receive #wildfires evacuation or...",1,
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,http://t.co/STfMbbZFB5
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,http://t.co/zDtoyd8EbJ
7611,10872,,,Police investigating after an e-bike collided ...,1,


In [8]:
twt_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
 5   link      3971 non-null   object
dtypes: int64(2), object(4)
memory usage: 238.0+ KB


In [9]:
def isShorted(link):
    if link != link: return np.NaN 
    return 't.co' in str(link)

In [10]:
twt_data['link_shorted'] = twt_data['link'].map(isShorted)


In [11]:
twt_data

Unnamed: 0,id,keyword,location,text,target,link,link_shorted
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,,
1,4,,,Forest fire near La Ronge Sask. Canada,1,,
2,5,,,All residents asked to 'shelter in place' are ...,1,,
3,6,,,"13,000 people receive #wildfires evacuation or...",1,,
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,,
...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,http://t.co/STfMbbZFB5,True
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,,
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,http://t.co/zDtoyd8EbJ,True
7611,10872,,,Police investigating after an e-bike collided ...,1,,


In [12]:
twt_data_with_links = twt_data.dropna(subset=['link'])
twt_data_with_links

Unnamed: 0,id,keyword,location,text,target,link,link_shorted
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,http://t.co/lHYXEOHY6C,True
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,http://t.co/YAo1e0xngw,True
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,http://t.co/2nndBGwyEi,True
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,http://t.co/qqsmshaJ3N,True
37,55,ablaze,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...,1,http://t.co/3ImaomknnA,True
...,...,...,...,...,...,...,...
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1,http://t.co/nF4IculOje,True
7607,10867,,,#stormchase Violent Record Breaking EF-5 El Re...,1,http://t.co/3SICroAaNz,True
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,http://t.co/STfMbbZFB5,True
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,http://t.co/zDtoyd8EbJ,True


In [13]:
twt_data_with_links = twt_data_with_links[twt_data_with_links['link_shorted']]
twt_data_with_links

Unnamed: 0,id,keyword,location,text,target,link,link_shorted
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,http://t.co/lHYXEOHY6C,True
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,http://t.co/YAo1e0xngw,True
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,http://t.co/2nndBGwyEi,True
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,http://t.co/qqsmshaJ3N,True
37,55,ablaze,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...,1,http://t.co/3ImaomknnA,True
...,...,...,...,...,...,...,...
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1,http://t.co/nF4IculOje,True
7607,10867,,,#stormchase Violent Record Breaking EF-5 El Re...,1,http://t.co/3SICroAaNz,True
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,http://t.co/STfMbbZFB5,True
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,http://t.co/zDtoyd8EbJ,True


In [14]:
twt_data_with_links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3968 entries, 31 to 7612
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            3968 non-null   int64 
 1   keyword       3948 non-null   object
 2   location      2714 non-null   object
 3   text          3968 non-null   object
 4   target        3968 non-null   int64 
 5   link          3968 non-null   object
 6   link_shorted  3968 non-null   object
dtypes: int64(2), object(5)
memory usage: 170.5+ KB


In [15]:
twt_data_with_links['real_link'] = 'np.NaN'
twt_data_with_links['error'] = 'np.NaN'
twt_data_with_links['page'] = 'np.NaN'
twt_data_with_links = twt_data_with_links.reset_index()
twt_data_with_links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3968 entries, 31 to 7612
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            3968 non-null   int64 
 1   keyword       3948 non-null   object
 2   location      2714 non-null   object
 3   text          3968 non-null   object
 4   target        3968 non-null   int64 
 5   link          3968 non-null   object
 6   link_shorted  3968 non-null   object
 7   real_link     3968 non-null   object
 8   error         3968 non-null   object
 9   page          3968 non-null   object
dtypes: int64(2), object(8)
memory usage: 217.0+ KB


In [33]:
def obtain_real_link(index, url):
    try:
        with urlopen(url, timeout = 20) as response:
            actual_url = response.geturl()
            error = np.NaN
            page = response.read()
    except HTTPError as e:
        actual_url = e.url
        error = e
        page = np.NaN
    except URLError as e:
        actual_url = url
        error = e
        page = np.NaN
    except:
        actual_url = url
        error = 'OTHER_ERROR'
        page = np.NaN
    return (index, actual_url, error, page)

In [17]:
def doWork(row):
    return obtain_real_link(row[0], row[1]['link'])

In [61]:
twt_data_with_links['error_code'] = twt_data_with_links['error'].map(lambda x: x.status if isinstance(x, HTTPError) else np.NaN)

In [63]:
twt_data_with_links.to_csv('../DataSets/twt_with_links.csv')

In [78]:
twt_data_with_links.head()

Unnamed: 0,id,keyword,location,text,target,link,link_shorted,real_link,error,page,error_code
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,http://t.co/lHYXEOHY6C,True,https://twitter.com/dannypurewal/status/629284...,HTTP Error 404: Not Found,,404.0
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,http://t.co/YAo1e0xngw,True,https://twitter.com/Valis_Ablaze/status/629274...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,http://t.co/2nndBGwyEi,True,https://africanbaze.com/breaking-newsnigeria-f...,HTTP Error 404: Not Found,,404.0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,http://t.co/qqsmshaJ3N,True,https://twitter.com/AnyOtherAnnaK/status/62919...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",
37,55,ablaze,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...,1,http://t.co/3ImaomknnA,True,http://yabaleftonline.com/2015/08/inec-office-...,HTTP Error 403: Forbidden,,403.0


In [88]:
twt_data_with_links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3968 entries, 31 to 7612
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            3968 non-null   int64  
 1   keyword       3948 non-null   object 
 2   location      2714 non-null   object 
 3   text          3968 non-null   object 
 4   target        3968 non-null   int64  
 5   link          3968 non-null   object 
 6   link_shorted  3968 non-null   object 
 7   real_link     3968 non-null   object 
 8   error         1758 non-null   object 
 9   page          2214 non-null   object 
 10  error_code    1386 non-null   float64
dtypes: float64(1), int64(2), object(8)
memory usage: 376.0+ KB


In [82]:
url = twt_data_with_links.loc[31, 'real_link']

In [95]:
twt_data_with_links['site'] = twt_data_with_links['real_link'].map(lambda x: urlparse(x).netloc)
twt_data_with_links

Unnamed: 0,id,keyword,location,text,target,link,link_shorted,real_link,error,page,error_code,site
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,http://t.co/lHYXEOHY6C,True,https://twitter.com/dannypurewal/status/629284...,HTTP Error 404: Not Found,,404.00,twitter.com
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,http://t.co/YAo1e0xngw,True,https://twitter.com/Valis_Ablaze/status/629274...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,http://t.co/2nndBGwyEi,True,https://africanbaze.com/breaking-newsnigeria-f...,HTTP Error 404: Not Found,,404.00,africanbaze.com
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,http://t.co/qqsmshaJ3N,True,https://twitter.com/AnyOtherAnnaK/status/62919...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com
37,55,ablaze,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...,1,http://t.co/3ImaomknnA,True,http://yabaleftonline.com/2015/08/inec-office-...,HTTP Error 403: Forbidden,,403.00,yabaleftonline.com
...,...,...,...,...,...,...,...,...,...,...,...,...
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1,http://t.co/nF4IculOje,True,https://www.reuters.com/article/us-saudi-secur...,,b'<!--[if !IE]> This has been served from cach...,,www.reuters.com
7607,10867,,,#stormchase Violent Record Breaking EF-5 El Re...,1,http://t.co/3SICroAaNz,True,http://t.co/3SICroAaNz,<urlopen error [Errno -2] Name or service not ...,,,t.co
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,http://t.co/STfMbbZFB5,True,https://gizmodo.com/two-giant-cranes-holding-a...,,"b'<!DOCTYPE html><html lang=""en-us"" data-react...",,gizmodo.com
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,http://t.co/zDtoyd8EbJ,True,https://earthquake.usgs.gov/earthquakes/eventp...,,"b'<!doctype html>\n<html lang=""en"">\n\n<head>\...",,earthquake.usgs.gov


In [96]:
twt_data_with_links['site'].value_counts()

twitter.com                    514
t.co                           398
www.youtube.com                164
www.ebay.com                    89
www.facebook.com                78
                              ... 
www.ky3.com                      1
dfw.cbslocal.com                 1
www.afr.com                      1
www.eleconomistaamerica.com      1
www.minews26.com                 1
Name: site, Length: 1205, dtype: int64

In [100]:
twt_data_with_links.loc[twt_data_with_links['error'] != twt_data_with_links['error'], 'site'].value_counts()

twitter.com                                 399
www.youtube.com                             164
www.facebook.com                             63
www.bbc.co.uk                                62
www.latimes.com                              40
                                           ... 
napavalleyregister.com                        1
trove.nla.gov.au                              1
project-middle-grade-mayhem.blogspot.com      1
agora.ex.nii.ac.jp                            1
droughtmonitor.unl.edu                        1
Name: site, Length: 700, dtype: int64

In [101]:
twt_data_with_links['target'].value_counts()

1    2171
0    1797
Name: target, dtype: int64

In [102]:
twt_data_with_links.loc[twt_data_with_links['error'] != twt_data_with_links['error'], 'target'].value_counts()

1    1226
0     984
Name: target, dtype: int64

In [103]:
twt_data_with_links_without_error = twt_data_with_links.loc[twt_data_with_links['error'] != twt_data_with_links['error']]
twt_data_with_links_without_error

Unnamed: 0,id,keyword,location,text,target,link,link_shorted,real_link,error,page,error_code,site
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,http://t.co/YAo1e0xngw,True,https://twitter.com/Valis_Ablaze/status/629274...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,http://t.co/qqsmshaJ3N,True,https://twitter.com/AnyOtherAnnaK/status/62919...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com
38,56,ablaze,,Barbados #Bridgetown JAMAICA ÛÒ Two cars set ...,1,http://t.co/wDUEaj8Q4J,True,https://www.youtube.com/watch?v=yG08hLEWzyM,,"b'<!DOCTYPE html><html lang=""en"" data-cast-api...",,www.youtube.com
40,59,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0,http://t.co/rOI2NSmEJJ,True,https://www.freewebcamsex.me/2015-02-02-taking...,,b'\r\n<!DOCTYPE html><html><head><title>Hot Ch...,,www.freewebcamsex.me
44,64,ablaze,,I wanted to set Chicago ablaze with my preachi...,0,http://t.co/o9qknbfOFX,True,https://twitter.com/RevDominic/status/62927230...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com
...,...,...,...,...,...,...,...,...,...,...,...,...
7601,10859,,,#breaking #LA Refugio oil spill may have been ...,1,http://t.co/5ueCmcv2Pk,True,https://www.latimes.com/local/lanow/la-me-ln-r...,,"b'<!DOCTYPE html>\n<html class=""ArticlePage"" l...",,www.latimes.com
7604,10863,,,#WorldNews Fallen powerlines on G:link tram: U...,1,http://t.co/EYSVvzA7Qm,True,https://www.youtube.com/watch?v=yG08hLEWzyM&3,,"b'<!DOCTYPE html><html lang=""en"" data-cast-api...",,www.youtube.com
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1,http://t.co/nF4IculOje,True,https://www.reuters.com/article/us-saudi-secur...,,b'<!--[if !IE]> This has been served from cach...,,www.reuters.com
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,http://t.co/STfMbbZFB5,True,https://gizmodo.com/two-giant-cranes-holding-a...,,"b'<!DOCTYPE html><html lang=""en-us"" data-react...",,gizmodo.com


In [105]:
twt_data_with_links_without_error.loc[twt_data_with_links_without_error['target'] == 1, 'site'].value_counts()

twitter.com                        188
www.bbc.co.uk                       59
www.youtube.com                     54
www.latimes.com                     38
cue.li                              30
                                  ... 
hope-international.blogspot.com      1
ow.ly                                1
www.breakingnews.ie                  1
www.thestar.com.my                   1
www.forharriet.com                   1
Name: site, Length: 427, dtype: int64

In [106]:
twt_data_with_links_without_error.loc[twt_data_with_links_without_error['target'] == 0, 'site'].value_counts()

twitter.com             211
www.youtube.com         110
www.facebook.com         35
www.instagram.com        28
youthsnews.com.au        25
                       ... 
www.loudersound.com       1
www.devex.com             1
beatsradio.ca             1
www.gq.com                1
9jacruz.blogspot.com      1
Name: site, Length: 356, dtype: int64

In [115]:
twt_data_with_links_without_error.loc[7601, 'real_link']

'https://www.latimes.com/local/lanow/la-me-ln-refugio-oil-spill-projected-company-says-20150805-story.html?utm_source=twitterfeed&utm_medium=twitter'

In [112]:
soup = BeautifulSoup(twt_data_with_links_without_error.loc[7601, 'page'], "lxml")

In [113]:
soup.h1.string


'\n        Refugio oil spill may have been costlier, bigger than projected\n    '

In [118]:
def get_the_header(page):
    soup = BeautifulSoup(page, "lxml")
    return soup.h1.string if soup.h1 else ''

In [119]:
twt_data_with_links_without_error['header'] = twt_data_with_links_without_error['page'].map(get_the_header)
twt_data_with_links_without_error

Unnamed: 0,id,keyword,location,text,target,link,link_shorted,real_link,error,page,error_code,site,header
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,http://t.co/YAo1e0xngw,True,https://twitter.com/Valis_Ablaze/status/629274...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com,
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,http://t.co/qqsmshaJ3N,True,https://twitter.com/AnyOtherAnnaK/status/62919...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com,
38,56,ablaze,,Barbados #Bridgetown JAMAICA ÛÒ Two cars set ...,1,http://t.co/wDUEaj8Q4J,True,https://www.youtube.com/watch?v=yG08hLEWzyM,,"b'<!DOCTYPE html><html lang=""en"" data-cast-api...",,www.youtube.com,\n This video is unavailable.\n\n...
40,59,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0,http://t.co/rOI2NSmEJJ,True,https://www.freewebcamsex.me/2015-02-02-taking...,,b'\r\n<!DOCTYPE html><html><head><title>Hot Ch...,,www.freewebcamsex.me,Real Webcam Babes Performing in Hot Live Cam S...
44,64,ablaze,,I wanted to set Chicago ablaze with my preachi...,0,http://t.co/o9qknbfOFX,True,https://twitter.com/RevDominic/status/62927230...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7601,10859,,,#breaking #LA Refugio oil spill may have been ...,1,http://t.co/5ueCmcv2Pk,True,https://www.latimes.com/local/lanow/la-me-ln-r...,,"b'<!DOCTYPE html>\n<html class=""ArticlePage"" l...",,www.latimes.com,\n Refugio oil spill may have been cost...
7604,10863,,,#WorldNews Fallen powerlines on G:link tram: U...,1,http://t.co/EYSVvzA7Qm,True,https://www.youtube.com/watch?v=yG08hLEWzyM&3,,"b'<!DOCTYPE html><html lang=""en"" data-cast-api...",,www.youtube.com,\n This video is unavailable.\n\n...
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1,http://t.co/nF4IculOje,True,https://www.reuters.com/article/us-saudi-secur...,,b'<!--[if !IE]> This has been served from cach...,,www.reuters.com,Suicide bomber kills 15 in Saudi security site...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,http://t.co/STfMbbZFB5,True,https://gizmodo.com/two-giant-cranes-holding-a...,,"b'<!DOCTYPE html><html lang=""en-us"" data-react...",,gizmodo.com,Two giant cranes holding a bridge collapse int...


In [121]:
twt_data_with_links_without_error['header'] = twt_data_with_links_without_error['header'].str.replace('\n', '')
twt_data_with_links_without_error['header'] = twt_data_with_links_without_error['header'].str.replace(',', '')
twt_data_with_links_without_error['header'] = twt_data_with_links_without_error['header'].str.replace('.', '')
twt_data_with_links_without_error['header'] = twt_data_with_links_without_error['header'].str.replace(':', '')
twt_data_with_links_without_error

Unnamed: 0,id,keyword,location,text,target,link,link_shorted,real_link,error,page,error_code,site,header
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,http://t.co/YAo1e0xngw,True,https://twitter.com/Valis_Ablaze/status/629274...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com,
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,http://t.co/qqsmshaJ3N,True,https://twitter.com/AnyOtherAnnaK/status/62919...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com,
38,56,ablaze,,Barbados #Bridgetown JAMAICA ÛÒ Two cars set ...,1,http://t.co/wDUEaj8Q4J,True,https://www.youtube.com/watch?v=yG08hLEWzyM,,"b'<!DOCTYPE html><html lang=""en"" data-cast-api...",,www.youtube.com,This video is unavailable
40,59,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0,http://t.co/rOI2NSmEJJ,True,https://www.freewebcamsex.me/2015-02-02-taking...,,b'\r\n<!DOCTYPE html><html><head><title>Hot Ch...,,www.freewebcamsex.me,Real Webcam Babes Performing in Hot Live Cam S...
44,64,ablaze,,I wanted to set Chicago ablaze with my preachi...,0,http://t.co/o9qknbfOFX,True,https://twitter.com/RevDominic/status/62927230...,,"b'<!DOCTYPE html>\n<html lang=""es"" data-scribe...",,twitter.com,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7601,10859,,,#breaking #LA Refugio oil spill may have been ...,1,http://t.co/5ueCmcv2Pk,True,https://www.latimes.com/local/lanow/la-me-ln-r...,,"b'<!DOCTYPE html>\n<html class=""ArticlePage"" l...",,www.latimes.com,Refugio oil spill may have been costli...
7604,10863,,,#WorldNews Fallen powerlines on G:link tram: U...,1,http://t.co/EYSVvzA7Qm,True,https://www.youtube.com/watch?v=yG08hLEWzyM&3,,"b'<!DOCTYPE html><html lang=""en"" data-cast-api...",,www.youtube.com,This video is unavailable
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1,http://t.co/nF4IculOje,True,https://www.reuters.com/article/us-saudi-secur...,,b'<!--[if !IE]> This has been served from cach...,,www.reuters.com,Suicide bomber kills 15 in Saudi security site...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,http://t.co/STfMbbZFB5,True,https://gizmodo.com/two-giant-cranes-holding-a...,,"b'<!DOCTYPE html><html lang=""en-us"" data-react...",,gizmodo.com,Two giant cranes holding a bridge collapse int...


In [124]:
twt_data_with_links_without_error['header_splited'] = twt_data_with_links_without_error['header'].str.split(' ')

In [132]:
def count_words(word, count):
    if(word == None or word == ''):
        return
    if (word in count):
        count[word] = count[word] + 1
        return
    count[word] = 1
        

In [142]:
total_word_count = {}
twt_data_with_links_without_error['header_splited'].map(
    lambda x: [count_words(word, total_word_count) for word in x] if (x != None) else None
)
total_word_count

{'This': 152,
 'video': 163,
 'is': 173,
 'unavailable': 140,
 'Real': 5,
 'Webcam': 2,
 'Babes': 2,
 'Performing': 2,
 'in': 204,
 'Hot': 4,
 'Live': 3,
 'Cam': 2,
 'Shows': 2,
 'How': 7,
 'the': 97,
 'West': 7,
 'was': 5,
 'burned': 3,
 'Thousands': 2,
 'of': 139,
 'wildfires': 4,
 'ablaze': 2,
 'California': 35,
 'alone': 2,
 '\r': 12,
 'Elyria': 1,
 'and': 51,
 'Lorain': 1,
 'County': 7,
 'Newspaper': 1,
 '|': 6,
 '%Title%\r': 1,
 'Facebook': 63,
 'San': 1,
 'Francisco': 1,
 'Traffic': 3,
 'Report': 2,
 'Getting': 2,
 'a': 53,
 'personal': 1,
 'injury': 2,
 'lawyer': 1,
 'The': 38,
 'Sleep': 1,
 'Blog': 8,
 'Big': 1,
 'Rig': 1,
 'Overturns': 1,
 'On': 6,
 'Fort': 1,
 'Worth': 1,
 'Interstate': 1,
 'Columbus': 2,
 'Aashiqui': 1,
 'Actress': 1,
 'Anu': 1,
 'Aggarwal': 1,
 'Her': 1,
 'Near-Fatal': 1,
 'Accident': 1,
 '320': 7,
 '[IR]': 7,
 'ICEMOON': 7,
 '[AFTERSHOCK]': 7,
 'These': 3,
 'five-minute': 1,
 'daily': 1,
 'habits': 1,
 'will': 4,
 'seriously': 1,
 'improve': 1,
 'your': 7

In [136]:
disaster_word_count = {}
twt_data_with_links_without_error.loc[twt_data_with_links_without_error['target'] == 1,'header_splited'].map(
    lambda x: [count_words(word, disaster_word_count) for word in x] if (x != None) else None
)
disaster_word_count

{'This': 54,
 'video': 67,
 'is': 71,
 'unavailable': 46,
 'How': 4,
 'the': 74,
 'West': 6,
 'was': 5,
 'burned': 3,
 'Thousands': 2,
 'of': 96,
 'wildfires': 4,
 'ablaze': 2,
 'in': 162,
 'California': 33,
 'alone': 2,
 '\r': 3,
 'Elyria': 1,
 'and': 29,
 'Lorain': 1,
 'County': 6,
 'Newspaper': 1,
 '|': 6,
 '%Title%\r': 1,
 'San': 1,
 'Francisco': 1,
 'Traffic': 3,
 'Report': 2,
 'Facebook': 26,
 'Big': 1,
 'Rig': 1,
 'Overturns': 1,
 'On': 4,
 'Fort': 1,
 'Worth': 1,
 'Interstate': 1,
 'Columbus': 1,
 'Aashiqui': 1,
 'Actress': 1,
 'Anu': 1,
 'Aggarwal': 1,
 'Her': 1,
 'Near-Fatal': 1,
 'Accident': 1,
 'Reunion': 11,
 'Island': 7,
 'wing': 11,
 'debris': 12,
 'from': 43,
 'Flight': 10,
 '370': 9,
 'Malaysia': 9,
 'prime': 6,
 'minister': 6,
 'says': 22,
 'no': 16,
 'longer': 15,
 'available': 5,
 'because': 5,
 'YouTube': 5,
 'account': 5,
 'associated': 5,
 'with': 12,
 'this': 6,
 'has': 6,
 'been': 28,
 'terminated': 5,
 'Shropshire': 1,
 'Star': 2,
 'Twelve': 1,
 'feared': 5,
 

In [137]:
normal_word_count = {}
twt_data_with_links_without_error.loc[twt_data_with_links_without_error['target'] == 0,'header_splited'].map(
    lambda x: [count_words(word, normal_word_count) for word in x] if (x != None) else None
)
normal_word_count

{'Real': 3,
 'Webcam': 2,
 'Babes': 2,
 'Performing': 2,
 'in': 42,
 'Hot': 4,
 'Live': 2,
 'Cam': 2,
 'Shows': 2,
 'Facebook': 37,
 'This': 98,
 'video': 96,
 'is': 102,
 'unavailable': 94,
 'Getting': 2,
 'a': 18,
 'personal': 1,
 'injury': 2,
 'lawyer': 1,
 'The': 17,
 'Sleep': 1,
 'Blog': 6,
 '320': 7,
 '[IR]': 7,
 'ICEMOON': 7,
 '[AFTERSHOCK]': 7,
 'These': 1,
 'five-minute': 1,
 'daily': 1,
 'habits': 1,
 'will': 4,
 'seriously': 1,
 'improve': 1,
 'your': 3,
 'life': 1,
 'Aftershock': 1,
 'Protect': 1,
 'Yourself': 1,
 'and': 22,
 'Profit': 1,
 'the': 23,
 'Next': 2,
 'Global': 1,
 'Financial': 3,
 'Meltdown': 2,
 'by': 8,
 'Cindy': 1,
 'S': 1,
 'Spitzer': 1,
 'David': 2,
 'Wiedemer': 2,
 'Robert': 1,
 'A': 8,
 '(2011': 1,
 'Hardcover)': 1,
 'NY': 1,
 'EMTs': 1,
 'petition': 6,
 'for': 25,
 '$17': 1,
 'per': 1,
 'hour': 1,
 '‘minimum': 1,
 'wage’': 1,
 'Content': 14,
 'Armageddon': 2,
 'Payamps': 1,
 't1vc': 1,
 'Secret': 1,
 'Penis': 1,
 'Time': 2,
 'INFANTRY': 1,
 'Mens': 1,
 

In [140]:
{k: v for k, v in sorted(normal_word_count.items(), key=lambda item: item[1], reverse = True)}


{'is': 102,
 'This': 98,
 'video': 96,
 'unavailable': 94,
 'to': 63,
 'of': 43,
 'in': 42,
 'Facebook': 37,
 'Cuenta': 31,
 'suspendida': 31,
 '–': 31,
 'city': 26,
 'for': 25,
 'celebrity': 25,
 'news': 25,
 'guide': 25,
 'the': 23,
 'and': 22,
 'a': 18,
 'on': 18,
 'The': 17,
 'Content': 14,
 'no': 12,
 'Emergency': 11,
 'are': 10,
 'at': 10,
 'longer': 10,
 '\r': 9,
 'protection': 9,
 'by': 8,
 'A': 8,
 'FedEx': 8,
 'transport': 8,
 'bioterror': 8,
 'germs': 8,
 'wake': 8,
 'anthrax': 8,
 'lab': 8,
 'mishaps': 8,
 'with': 8,
 'under': 8,
 'Reddit': 8,
 'Now': 8,
 '320': 7,
 '[IR]': 7,
 'ICEMOON': 7,
 '[AFTERSHOCK]': 7,
 'from': 7,
 'Bolsos': 7,
 'y': 7,
 'Carteras': 7,
 'para': 7,
 'Mujer': 7,
 'In': 7,
 'Will': 7,
 'Quarantine': 7,
 'Offensive': 7,
 'Blog': 6,
 'petition': 6,
 'be': 6,
 '-': 6,
 'New': 6,
 '2015': 6,
 'new': 6,
 'about': 6,
 'X': 6,
 'Your': 6,
 'Bloomberg': 6,
 'an': 5,
 'US': 5,
 'Of': 5,
 'Summer': 5,
 'To': 5,
 'You': 5,
 'web': 5,
 'surfing': 5,
 'People': 5,

In [141]:
{k: v for k, v in sorted(disaster_word_count.items(), key=lambda item: item[1], reverse = True)}


{'in': 162,
 'of': 96,
 'to': 91,
 'the': 74,
 'is': 71,
 'video': 67,
 'This': 54,
 'unavailable': 46,
 'from': 43,
 'for': 35,
 'a': 35,
 'California': 33,
 'over': 32,
 'at': 31,
 'Hiroshima': 30,
 "Legionnaires'": 30,
 'cueli': 30,
 'and': 29,
 'been': 28,
 'outbreak': 28,
 'on': 27,
 'disease': 27,
 'Facebook': 26,
 'have': 26,
 'may': 24,
 'oil': 24,
 'Families': 24,
 'sue': 24,
 'Edinburgh': 24,
 'spill': 23,
 'than': 23,
 'says': 22,
 'projected': 22,
 'The': 21,
 'up': 21,
 'Refugio': 21,
 'costlier': 21,
 'bigger': 21,
 'Mediterranean': 20,
 '–': 19,
 'as': 18,
 'Atomic': 17,
 'no': 16,
 'by': 16,
 'news': 16,
 'city': 16,
 'longer': 15,
 'bomb': 15,
 'after': 15,
 'After': 15,
 'celebrity': 15,
 'guide': 15,
 '70': 14,
 'Japan': 14,
 'during': 14,
 'MSF': 14,
 "'We're": 14,
 'picking': 14,
 'bodies': 14,
 "water'": 14,
 'crisis': 13,
 'MH370': 13,
 'Years': 13,
 'With': 13,
 'memories': 13,
 'debris': 12,
 'with': 12,
 'To': 12,
 'Bombs': 12,
 'Reunion': 11,
 'wing': 11,
 'a

In [143]:
{k: v for k, v in sorted(total_word_count.items(), key=lambda item: item[1], reverse = True)}


{'in': 204,
 'is': 173,
 'video': 163,
 'to': 154,
 'This': 152,
 'unavailable': 140,
 'of': 139,
 'the': 97,
 'Facebook': 63,
 'for': 60,
 'a': 53,
 'and': 51,
 'from': 50,
 '–': 50,
 'on': 45,
 'Cuenta': 42,
 'suspendida': 42,
 'city': 42,
 'at': 41,
 'news': 41,
 'celebrity': 40,
 'guide': 40,
 'The': 38,
 'California': 35,
 'over': 35,
 'been': 30,
 'Hiroshima': 30,
 "Legionnaires'": 30,
 'cueli': 30,
 'no': 28,
 'have': 28,
 'outbreak': 28,
 'disease': 27,
 'may': 26,
 'longer': 25,
 'than': 25,
 'by': 24,
 'oil': 24,
 'Families': 24,
 'sue': 24,
 'Edinburgh': 24,
 'spill': 23,
 'says': 22,
 'as': 22,
 'projected': 22,
 'up': 21,
 'Refugio': 21,
 'costlier': 21,
 'bigger': 21,
 'with': 20,
 'Mediterranean': 20,
 'FedEx': 18,
 'transport': 18,
 'bioterror': 18,
 'germs': 18,
 'wake': 18,
 'anthrax': 18,
 'lab': 18,
 'mishaps': 18,
 'To': 17,
 'After': 17,
 'Atomic': 17,
 'after': 17,
 'Content': 16,
 'US': 16,
 'during': 16,
 'A': 15,
 'are': 15,
 'In': 15,
 'bomb': 15,
 'Emergency