In [21]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import os
pd.set_option('max_colwidth', 400)

# Préprocessing du texte

accents, contractions, lowercase, newlines, lemmatization, stemming

In [22]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [23]:
def text_preprocessing(serie):
    
    # lowercase
    serie = serie.map(lambda x: x.lower())   

    # remove extra newlines
    serie = serie.map(lambda x: re.sub(r'[\r|\n|\r\n]+', ' ', x))

    # remove @tag
    serie = serie.map(lambda x: re.sub(r'@[A-Za-z0-9]+','', x))

    # remove URL
    serie = serie.map(lambda x: re.sub('https?://[A-Za-z0-9./]+','', x))

    # remove hashtag and numbers
    serie = serie.map(lambda x: re.sub("[^a-zA-Z]", " ", x))

    # tokenization
    serie = serie.map(word_tokenize)

    # remove stop words
    stop_words = set(stopwords.words('english'))
    serie = serie.map(lambda x: [word for word in x if word not in stop_words])

    # lemmatization    
    serie = serie.map(nltk.tag.pos_tag)
    serie = serie.map(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

    wordnet_lemmatizer = WordNetLemmatizer()
    serie = serie.map(lambda x: [wordnet_lemmatizer.lemmatize(word, tag) for (word, tag) in x])

    serie = serie.map(lambda x: ' '.join(word for word in x))

    return serie

# Sentiment140
## Téléchargement de la base Sentiment140

In [24]:
import requests
import tempfile
import zipfile

url, destname = 'http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip', 'sentiment140'
temporary_location = "temp"

def download_unzip(url, dirname = tempfile.gettempdir(), destname = "file"):
    myfile = requests.get(url)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    open(os.path.join(dirname, destname + '.zip'), 'wb').write(myfile.content)
    with zipfile.ZipFile(os.path.join(dirname, destname + '.zip'), 'r') as zip_ref:
        zip_ref.extractall(os.path.join(dirname, destname))
        
download_unzip(url, dirname=temporary_location, destname=destname)

In [25]:
trainfile = os.path.join(temporary_location, destname, "training.1600000.processed.noemoticon.csv")
# testfile = os.path.join(temporary_location, destname, "testdata.manual.2009.06.14.csv")
columns = ['sentiment','id','date','query_string','user','text']

In [26]:
# Import train data set
df = pd.read_csv(trainfile,
                 header=None, 
                 names=columns, 
                 encoding='latin-1')
# df = df.sample(10000)
df

Unnamed: 0,sentiment,id,date,query_string,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best feeling ever
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interviews! â« http://blip.fm/~8bmta
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me for details
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur


## Préprocessing Sentiment140

In [27]:
df[['text']].head(10)

Unnamed: 0,text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
5,@Kwesidei not the whole crew
6,Need a hug
7,"@LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ?"
8,@Tatiana_K nope they didn't have it
9,@twittera que me muera ?


In [28]:
%%time
df.text = text_preprocessing(df.text)

Wall time: 27min 14s


In [29]:
df[['text']].head(10)

Unnamed: 0,text
0,awww bummer shoulda get david carr third day
1,upset update facebook texting might cry result school today also blah
2,dive many time ball manage save rest go bound
3,whole body feel itchy like fire
4,behave mad see
5,whole crew
6,need hug
7,hey long time see yes rain bite bit lol fine thanks
8,k nope
9,que muera


In [30]:
%%time
# Sauvegarde
df.to_pickle(os.path.join("data", "sentiment140", "train.bz2"))

Wall time: 13.1 s


# Base Twitter webscrappée
## Chargement de la base

In [4]:
df = pd.read_pickle(os.path.join("data", "web", "web_parse.bz2"))
df

Unnamed: 0,search,author,time,url,text,lang,reply
0,biden,Christopher Suprun #WeArePerseus,2019-01-01 23:59:59,/TheChrisSuprun/status/1080252328886915072,Off top of my head...Forgive spelling\n\nHarris (CA)\nORourke (TX)\nCastro (TX)\nMoulton (MA)\nWarren (MA)\nBiden (DE)\nSanders (VT)\nCuomo (NY)\nGillibrand (NY)\nKlobuchar (MN)\nBrown (OH)\nGabbard (HA)\nMcAuliffe (VA)\nLandrieu (LA)\nBooker (NJ) \nBloomberg (NY)\nSchultz (WA)\nGarcetti (CA),en,False
1,biden,robbiewithoutanynumbers,2019-01-01 23:59:54,/robbienotrobin/status/1080252304509599744,"I kinda like Kamala/Beto, with Biden as Chif Of Staff. Just sayin...",en,False
2,biden,Linda Hirshman 🏊🏻‍♂️,2019-01-01 23:59:50,/LindaHirshman1/status/1080252291184189441,also Biden to Grassley. Did you know he wrote an entire memoir story of his whole life without once mentioning Anita Hill. Do we think he forgot her?,en,False
3,biden,Carol Price,2019-01-01 23:59:43,/CarolP1941/status/1080252261547180032,I am anxious to see who can pull us out of this mess Trump has got us into. It will take a strong willed person with a VP that works beside him like Biden did with Obama.,en,False
4,biden,Oliver Anderson,2019-01-01 23:59:34,/bogart7777/status/1080252220204027904,"Black people, please don’t trust Joe Biden, he wrote the crime bill(1994) that sent thousands of Black people to prison. He is not good for the Black community.",en,False
...,...,...,...,...,...,...,...
2901,trump,RobDogDiggity,2019-12-31 23:59:06,/robdogdiggity/status/1212161285648977920,"Oh, if he would’ve pulled out a gun and shot one of them, he’d be there. They’d have marches, microphones in their faces, and blaming President Trump for it.",en,False
2902,trump,the other red [Drapeau des États-Unis][Drapeau de la Macédoine],2019-12-31 23:59:05,/zivvy_1/status/1212161285040869383,"So no reality? THAT is how we get another 4 years. Stick Biden on a stage with Trump & he’ll only leave a carcass with Hunter, Biden’s voting record and mumbling gaffes. \n\nThis process is about weeding that out. Not a participation trophy. \n\nUnite behind the fittest-not just any1",en,False
2903,trump,PeterDavies,2019-12-31 23:59:05,/PeterMDavies80/status/1212161285036630016,"Trump strong! He's so tough! He's gonna show the world who's boss!\n\nwell, except for the psychopath who's building nuclear weapons and threatening the United States with them - i mean what are we supposed to go to war or something come on guys we gotta just do what he says",en,False
2904,trump,"No, Donny, you did not win the election.",2019-12-31 23:59:05,/Vote4USA2020/status/1212161284617179136,That is so true. I think the ‘election’ of someone a bigoted as trump opened Pandora’s [Paquet].,en,False


In [5]:
def filtre(text):
    lower_text = text.lower()
    if "trumps" in lower_text:
        if "trump" in lower_text.replace("trumps", ""):
            return True
        else:
            return False
    else:
        return True

La recherche du mot `trump` dans Twitter fournit aussi des tweets comprenants le mot `trumps` (issu du nom ou verbe `trump`), on supprime donc ces tweets.

In [6]:
df = df[(df.search == 'biden') | (df.search == "trump") & (df.text.apply(filtre))]
df.shape

(2868, 7)

In [7]:
df[['text']].head(10)

Unnamed: 0,text
0,Off top of my head...Forgive spelling\n\nHarris (CA)\nORourke (TX)\nCastro (TX)\nMoulton (MA)\nWarren (MA)\nBiden (DE)\nSanders (VT)\nCuomo (NY)\nGillibrand (NY)\nKlobuchar (MN)\nBrown (OH)\nGabbard (HA)\nMcAuliffe (VA)\nLandrieu (LA)\nBooker (NJ) \nBloomberg (NY)\nSchultz (WA)\nGarcetti (CA)
1,"I kinda like Kamala/Beto, with Biden as Chif Of Staff. Just sayin..."
2,also Biden to Grassley. Did you know he wrote an entire memoir story of his whole life without once mentioning Anita Hill. Do we think he forgot her?
3,I am anxious to see who can pull us out of this mess Trump has got us into. It will take a strong willed person with a VP that works beside him like Biden did with Obama.
4,"Black people, please don’t trust Joe Biden, he wrote the crime bill(1994) that sent thousands of Black people to prison. He is not good for the Black community."
5,Question is whether there are more of these guys or never-Biden youngsters in the Dem primary/convincable electorate. The oldsters the FYI vote in heavier numbers...
6,"What % of @nytimes profits is driven by spending from Russia, China, and Saudi Arabia? Must be significant firvthen to attack lifelong public servants like Joe Biden. Why don't you apply resources to investigating dark money influence instead of hit pieces on good men?"
7,"I REALLY wish Democrats would stop trying to push for Biden as a presidential candidate just because they can't think of anybody more visible.\n\nSeriously, Biden is an AWFUL choice for multiple reasons."
8,Biden/Obama ticket!!! @JoeBiden
9,I will. But then I voted Hillary. My Dad (and lots of other Republicans) didn't vote for either. He would vote Biden (Obama's VP) however.


In [8]:
%%time
df.text = text_preprocessing(df.text)

Wall time: 5.1 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [9]:
df[['text']].head(10)

Unnamed: 0,text
0,top head forgive spelling harris ca orourke tx castro tx moulton warren biden de sander vt cuomo ny gillibrand ny klobuchar mn brown oh gabbard ha mcauliffe va landrieu la booker nj bloomberg ny schultz wa garcetti ca
1,kinda like kamala beto biden chif staff sayin
2,also biden grassley know write entire memoir story whole life without mention anita hill think forgot
3,anxious see pull u mess trump get u take strong will person vp work beside like biden obama
4,black people please trust joe biden write crime bill send thousand black people prison good black community
5,question whether guy never biden youngster dem primary convincable electorate oldster fyi vote heavy number
6,profit drive spending russia china saudi arabia must significant firvthen attack lifelong public servant like joe biden apply resource investigate dark money influence instead hit piece good men
7,really wish democrat would stop try push biden presidential candidate think anybody visible seriously biden awful choice multiple reason
8,biden obama ticket
9,vote hillary dad lot republicans vote either would vote biden obama vp however


In [11]:
%%time
# Sauvegarde
df.to_pickle(os.path.join("data", "web", "web.bz2"))

Wall time: 113 ms
