In [15]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import os
pd.set_option('max_colwidth', 400)

# Préprocessing du texte

accents, contractions, lowercase, newlines, lemmatization, stemming

In [16]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [17]:
def text_preprocessing(serie):
    
    # lowercase
    serie = serie.map(lambda x: x.lower())   

    # remove extra newlines
    serie = serie.map(lambda x: re.sub(r'[\r|\n|\r\n]+', ' ', x))

    # remove @tag
    serie = serie.map(lambda x: re.sub(r'@[A-Za-z0-9]+','', x))

    # remove URL
    serie = serie.map(lambda x: re.sub('https?://[A-Za-z0-9./]+','', x))

    # remove hashtag and numbers
    serie = serie.map(lambda x: re.sub("[^a-zA-Z]", " ", x))

    # tokenization
    serie = serie.map(word_tokenize)

    # remove stop words
    stop_words = set(stopwords.words('english'))
    serie = serie.map(lambda x: [word for word in x if word not in stop_words])

    # lemmatization    
    serie = serie.map(nltk.tag.pos_tag)
    serie = serie.map(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

    wordnet_lemmatizer = WordNetLemmatizer()
    serie = serie.map(lambda x: [wordnet_lemmatizer.lemmatize(word, tag) for (word, tag) in x])

    serie = serie.map(lambda x: ' '.join(word for word in x))

    return serie

# Sentiment140
## Téléchargement de la base Sentiment140

In [2]:
import requests
import tempfile
import zipfile

url, destname = 'http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip', 'sentiment140'
temporary_location = "temp"

def download_unzip(url, dirname = tempfile.gettempdir(), destname = "file"):
    myfile = requests.get(url)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    open(os.path.join(dirname, destname + '.zip'), 'wb').write(myfile.content)
    with zipfile.ZipFile(os.path.join(dirname, destname + '.zip'), 'r') as zip_ref:
        zip_ref.extractall(os.path.join(dirname, destname))
        
download_unzip(url, dirname=temporary_location, destname=destname)

In [9]:
trainfile = os.path.join(temporary_location, destname, "training.1600000.processed.noemoticon.csv")
# testfile = os.path.join(temporary_location, destname, "testdata.manual.2009.06.14.csv")
columns = ['sentiment','id','date','query_string','user','text']

In [10]:
# Import train data set
df = pd.read_csv(trainfile,
                 header=None, 
                 names=columns, 
                 encoding='latin-1')
df = df.sample(100000)
df

Unnamed: 0,sentiment,id,date,query_string,user,text
1325893,4,2015186894,Wed Jun 03 03:57:47 PDT 2009,NO_QUERY,Neta_Lifshitz,"Funny Video, worth a listen if you got a minut..."
346430,0,2016155708,Wed Jun 03 06:16:20 PDT 2009,NO_QUERY,beatty_2002,"I'm beginning to slow down now, tired"
863698,4,1677152268,Sat May 02 00:41:15 PDT 2009,NO_QUERY,Henwii,thought I had lost this tshirt (which I love) ...
734981,0,2264677911,Sun Jun 21 04:37:03 PDT 2009,NO_QUERY,DiscoverNicole,"Still buzzing from last night, but now im have..."
1265463,4,1999454361,Mon Jun 01 20:14:46 PDT 2009,NO_QUERY,crisbeee,@BlueLightTech welcome back to the twittervers...
...,...,...,...,...,...,...
758314,0,2295685410,Tue Jun 23 08:14:45 PDT 2009,NO_QUERY,polarna10,@shakirayshakira I hate you color me green wi...
936279,4,1792961448,Thu May 14 00:51:23 PDT 2009,NO_QUERY,Crichton_Kicks,"@_Flik_ Ah, yeah I suppose, as long as you don..."
1173397,4,1980758885,Sun May 31 07:36:01 PDT 2009,NO_QUERY,serenawet,@DreamNetJade Had a blast last night with you ...
883116,4,1686075988,Sun May 03 03:37:46 PDT 2009,NO_QUERY,MizSadittyFancy,@Jawannsample ummmm sum egg whites and turkey ...


## Préprocessing Sentiment140

In [20]:
df[['text']].head(10)

Unnamed: 0,text
1325893,"Funny Video, worth a listen if you got a minute to spare http://bit.ly/19Jf7Z"
346430,"I'm beginning to slow down now, tired"
863698,thought I had lost this tshirt (which I love) ~2 yrs ago. Tony just found &amp; returned it @ West Covina http://loopt.us/M0_NCg.t
734981,"Still buzzing from last night, but now im have post gig depression"
1265463,@BlueLightTech welcome back to the twitterverse. How r u doing darling? Hope u have a wonderful day! Take Care!
1060723,@fleckman youre outtahere pal!
1246418,"@gyratory, @korruptor: congratulations on the Crackdown 2 announcement! Here's hoping Realtime Worlds aren't gonna come slash yr tires."
412698,Man I am really trying to get into Harper's Island but I just can't. That NHL standley cup game on NBC is a blow out right now...
443846,"@Julesxv you might be able to download the drivers from the brand website, they may not be fully supportive of Linux though"
469672,@cocoward ARE YOU SHITTING ME??? who who who how how did you find out??? AND NO im not lying slutttt you look beautiful


In [21]:
%%time
df.text = text_preprocessing(df.text)

Wall time: 1min 32s


In [22]:
df[['text']].head(10)

Unnamed: 0,text
1325893,funny video worth listen get minute spare
346430,begin slow tire
863698,thought lose tshirt love yr ago tony find amp return west covina ncg
734981,still buzz last night im post gig depression
1265463,welcome back twitterverse r u darling hope u wonderful day take care
1060723,youre outtahere pal
1246418,congratulation crackdown announcement hop realtime world gon na come slash yr tire
412698,man really try get harper island nhl standley cup game nbc blow right
443846,might able download driver brand website may fully supportive linux though
469672,shit find im lie slutttt look beautiful


In [23]:
%%time
# Sauvegarde
df.to_pickle(os.path.join("data", "sentiment140", "train.bz2"))

Wall time: 884 ms


# Base Twitter webscrappée
## Chargement de la base Sentiment140

In [8]:
df = pd.read_pickle(os.path.join("data", "web", "web_parse.bz2"))
df

Unnamed: 0,search,author,time,url,text,lang,reply
0,biden,Frank Braswell,2012-12-05 23:41:12,/FBRASWELL/status/276471312489254912,Gotta love Joe! RT @rorycooper: VP Biden says we don't need new Navy ships because new ones r more powerful. Did oceans also get smaller?,en,False
1,biden,GLENN,2012-12-05 23:40:46,/GCGATOR24/status/276471203290554369,"BIDEN'S HAIRPLUGS EXEMPT: IRS finalizes new tax for medical devices in ObamaCare law... http://drudge.tw/VnAA9L"" @DrMartyFox @tnlawgirl",en,False
2,biden,cydney,2012-12-05 23:39:11,/cydney/status/276470805855076353,"Omg Keisha... ""@Angelic_kiss: Listening to ""Put It Down"" by Joe Budden and @Juicy_Keish calls him Joe Biden 😒😂😂😂""",en,False
3,biden,GLENN,2012-12-05 23:38:10,/GCGATOR24/status/276470550338080768,"BIDEN'S HAIRPLUGS EXEMPT: @DRUDGE_REPORT: IRS finalizes new tax for medical devices in ObamaCare law... http://drudge.tw/VnAA9L""",en,False
4,biden,Emily Greenhouse,2012-12-05 23:37:28,/emserre/status/276470372902264832,"As Obama jotted while Biden blathered: ""Shoot. Me. Now.""—Jane Mayer's 10 rules for surviving Washington Holiday Parties http://nyr.kr/Vnyuqi",en,False
...,...,...,...,...,...,...,...
4040,trump,Paolo,2012-12-05 02:36:59,/PaoloTorress/status/276153163374030849,Take over the world when im on my donald trump shit look at all this money... Aint that some shit!,en,False
4041,trump,♡ N. Doll ♡,2012-12-05 02:36:50,/North_Doll/status/276153125289750528,@BubbaWallace @chaseelliott @RossKenseth lmao does it trump the first one.#gottagetit,en,False
4042,trump,Arista Ellis,2012-12-05 02:36:43,/Arista817/status/276153098207117312,Jerry Trump admits in April 2012 to lying at 2005 trial of Ryan Ferguson http://tinyurl.com/bzryrfa,en,False
4043,trump,David Thomas McKenzie,2012-12-05 02:36:39,/davidtmckenzie/status/276153079391481856,@realDonaldTrump \n\nLove ritz Carleton. Great place. Never stayed at Trump. I have been in the tower in NyC,en,False


In [9]:
def filtre(text):
    lower_text = text.lower()
    if "trumps" in lower_text:
        if "trump" in lower_text.replace("trumps", ""):
            return True
        else:
            return False
    else:
        return True

La recherche du mot `trump` dans Twitter fournit aussi des tweets comprenants le mot `trumps` (issu du nom ou verbe `trump`), on supprime donc ces tweets.

In [12]:
df = df[(df.search == 'biden') | (df.search == "trump") & (df.text.apply(filtre))]
df.shape

(3356, 7)

In [13]:
df[['text']].head(10)

Unnamed: 0,text
0,Gotta love Joe! RT @rorycooper: VP Biden says we don't need new Navy ships because new ones r more powerful. Did oceans also get smaller?
1,"BIDEN'S HAIRPLUGS EXEMPT: IRS finalizes new tax for medical devices in ObamaCare law... http://drudge.tw/VnAA9L"" @DrMartyFox @tnlawgirl"
2,"Omg Keisha... ""@Angelic_kiss: Listening to ""Put It Down"" by Joe Budden and @Juicy_Keish calls him Joe Biden 😒😂😂😂"""
3,"BIDEN'S HAIRPLUGS EXEMPT: @DRUDGE_REPORT: IRS finalizes new tax for medical devices in ObamaCare law... http://drudge.tw/VnAA9L"""
4,"As Obama jotted while Biden blathered: ""Shoot. Me. Now.""—Jane Mayer's 10 rules for surviving Washington Holiday Parties http://nyr.kr/Vnyuqi"
5,My Gene Splicer is making a golem using genes from Billy Mays and Joe Biden. It shouts while projectile vomiting someone else's slime.
6,Hillary; Biden; Shumer; etc. etc. With so many incredible egos I can hardly wait for the 2016 primaries. Maybe even Moochelle.\nLOL
7,"“@PolarCoug: Hey, if Iran wants one of our drones so badly, let’s give them Joe Biden.” #tcot"
8,@megynkelly civil rights in Biden country http://troubleindover.blogspot.com
9,@sam_white90: Just gave a woman at krogers the right away to a parking spot and then saw her Obama-Biden bumper sticker 😒\n#gimmeback


In [18]:
%%time
df.text = text_preprocessing(df.text)

Wall time: 4.87 s


In [19]:
df[['text']].head(10)

Unnamed: 0,text
0,get ta love joe rt vp biden say need new navy ship new one r powerful ocean also get small
1,biden hairplugs exempt irs finalize new tax medical device obamacare law
2,omg keisha kiss listening put joe budden keish call joe biden
3,biden hairplugs exempt report irs finalize new tax medical device obamacare law
4,obama jot biden blather shoot jane mayer rule survive washington holiday party
5,gene splicer make golem use gene billy may joe biden shout projectile vomit someone else slime
6,hillary biden shumer etc etc many incredible ego hardly wait primary maybe even moochelle lol
7,hey iran want one drone badly let give joe biden tcot
8,civil right biden country
9,white give woman krogers right away park spot saw obama biden bumper sticker gimmeback


In [20]:
%%time
# Sauvegarde

Wall time: 97.5 ms
