In [1]:
import nltk

In [2]:
import pandas as pd
# Read the tweets CSV with the Python parsing engine and a regex separator;
# column 0 of the parsed result becomes the index.
# NOTE(review): the separator looks intended to split each line only at its
# first comma, so commas inside the tweet text stay in one column — confirm.
# NOTE(review): `error_bad_lines` is deprecated/removed in newer pandas in
# favour of `on_bad_lines` — confirm against the installed pandas version.
train= pd.read_csv('./nlp/tweets.csv', sep='^([^,]+),', engine='python', error_bad_lines=False, encoding='utf-8', index_col=[0])

In [3]:
# Throw away the index created while reading the file and renumber the rows
# with a fresh default index.
train = train.reset_index(drop=True)

In [4]:
# Turn the polarity codes into labels: "0" -> "negatif", "4" -> "positif".
# str.replace substitutes substrings, so any other value containing a 0 or 4
# is rewritten too; values that do not end up exactly 'positif'/'negatif'
# are dropped by the filtering cells that follow.
train['polarity'] = train['polarity'].str.replace("0","negatif").str.replace("4","positif")

In [5]:
# Index labels of every row whose polarity is neither 'positif' nor 'negatif'.
index_zero = train[~((train['polarity'] == 'positif') | (train['polarity'] == 'negatif'))].index

In [6]:
# Remove, in place, the rows listed in index_zero (polarity that is neither
# 'positif' nor 'negatif').
train.drop(index_zero, inplace=True)

In [7]:
# Work on a random subset of at most 100000 tweets. The fixed random_state
# makes the subset (and every result derived from it) reproducible on re-run,
# and clamping n avoids a ValueError when fewer rows are available.
train = train.sample(n=min(100000, len(train)), random_state=42)

In [8]:
def clean_up(s):
    """Normalize a raw tweet into lowercase words separated by single spaces.

    URLs (``http...`` / ``www....``), ``@mentions``, digits and every
    character that is neither a word character nor whitespace are replaced by
    a space. All whitespace runs, including a lone tab or newline, are then
    collapsed to one space.

    Args:
        s: Tweet text as a ``str``.

    Returns:
        The cleaned string, lower-cased and stripped of surrounding whitespace.
    """
    import re
    # Raw string literals keep regex escapes such as \d, \w and \s from being
    # parsed as (invalid) Python string escapes.
    s = re.sub(r'http\S+|(www\.[^\s]+)|(@\S+)', ' ', s)
    s = re.sub(r'[\d]', ' ', s)
    s = re.sub(r'[^\w\s]', ' ', s)
    # \s+ (rather than \s\s+) also turns a single tab or newline into a space.
    s = re.sub(r'\s+', ' ', s)
    return s.lower().strip()
def tokenize(s):
    """Split the text ``s`` into tokens using NLTK's ``word_tokenize``."""
    from nltk import tokenize as nltk_tokenize
    tokens = nltk_tokenize.word_tokenize(s)
    return tokens
def stem_spacy(s):
    """Lemmatize a collection of texts with spaCy's ``fr_core_news_sm`` model.

    Despite the name, this returns each token's ``lemma_`` rather than a stem.

    Args:
        s: Object exposing ``.apply`` whose elements are texts — presumably a
            pandas Series of strings; confirm against callers.

    Returns:
        A list with one list of lemmas per processed text.
    """
    import spacy
    pipeline = spacy.load('fr_core_news_sm')
    docs = s.apply(pipeline)
    return [[token.lemma_ for token in doc] for doc in docs]
def stem(tweet):
    """Return a list holding the French Snowball stem of each token in ``tweet``."""
    from nltk.stem.snowball import FrenchStemmer
    french_stemmer = FrenchStemmer()
    return [french_stemmer.stem(token) for token in tweet]
def remove_stopwords(s):
    """Drop French stop words and single-character tokens from a token list.

    Args:
        s: Iterable of string tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    from nltk.corpus import stopwords
    # Build the stop-word set once and keep it on the function object: the
    # function is applied to every tweet, and a set gives constant-time
    # lookups where the plain word list needed a linear scan per token.
    stop_words = getattr(remove_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('french'))
        remove_stopwords._stop_words = stop_words
    return [x for x in s if len(x) > 1 and x not in stop_words]

In [9]:
# Make sure every entry of the text column is a str before text processing.
train['statutnull'] = train['statutnull'].astype(str)

In [10]:
# Normalize each tweet's text with clean_up.
train['statutnull'] = train['statutnull'].apply(clean_up)

In [11]:
# Preview the first rows after text clean-up.
train.head()

Unnamed: 0,polarity,statutnull
445938,negatif,j ai besoin de prendre mes drogues à nouveau j...
1358033,positif,se lèvera quand je me réveillerai
1132766,positif,omg j adore cette chanson
686640,negatif,votre blog était très instructif comment peut ...
228905,negatif,j ai pleuré pendant minutes après les nuits à ...


In [12]:
# Replace each cleaned string with its list of word tokens.
train['statutnull'] = train['statutnull'].apply(tokenize)

In [13]:
# Preview the first rows after tokenization.
train.head()

Unnamed: 0,polarity,statutnull
445938,negatif,"[j, ai, besoin, de, prendre, mes, drogues, à, ..."
1358033,positif,"[se, lèvera, quand, je, me, réveillerai]"
1132766,positif,"[omg, j, adore, cette, chanson]"
686640,negatif,"[votre, blog, était, très, instructif, comment..."
228905,negatif,"[j, ai, pleuré, pendant, minutes, après, les, ..."


In [14]:
# Stem every token of every tweet with the French Snowball stemmer.
train['statutnull'] = train['statutnull'].apply(stem)

In [15]:
# Preview the first rows after stemming.
train.head()

Unnamed: 0,polarity,statutnull
445938,negatif,"[j, ai, besoin, de, prendr, me, drogu, à, nouv..."
1358033,positif,"[se, lev, quand, je, me, réveil]"
1132766,positif,"[omg, j, ador, cet, chanson]"
686640,negatif,"[votr, blog, était, tres, instruct, comment, p..."
228905,negatif,"[j, ai, pleur, pend, minut, apres, le, nuit, à..."


In [16]:
# Drop French stop words and one-character tokens from each token list.
# NOTE(review): this runs on already-stemmed tokens, so a stop word whose stem
# differs from its listed form (e.g. "cette" -> "cet") is not removed —
# confirm whether filtering should happen before stemming.
train['statutnull'] = train['statutnull'].apply(remove_stopwords)

In [17]:
# Preview the first rows after stop-word removal.
train.head()

Unnamed: 0,polarity,statutnull
445938,negatif,"[besoin, prendr, drogu, nouveau, souffr, nouveau]"
1358033,positif,"[lev, quand, réveil]"
1132766,positif,"[omg, ador, cet, chanson]"
686640,negatif,"[votr, blog, tres, instruct, comment, peut, to..."
228905,negatif,"[pleur, pend, minut, apres, nuit, rodanth, ple..."


In [18]:
# Rename the processed text column to 'tweet', in place; the feature-building
# cells below read it as train.tweet / row['tweet'].
train.rename(columns={'statutnull':'tweet'},inplace=True)

In [19]:
import nltk

# Size of the vocabulary used as classifier features.
NUM_FEATURES = 5000

# Flatten every tweet's token list into one long list of words, duplicates
# included, so FreqDist can count occurrences. No membership guard is applied:
# testing a whole token list against a list of strings can never match and
# would only add a linear scan per tweet. Iterating the Series directly also
# avoids Series.iteritems(), which newer pandas versions no longer provide.
all_words = [word for tokens in train.tweet for word in tokens]

# The NUM_FEATURES most frequent words, most frequent first.
top_features = [word for word, _count in nltk.FreqDist(all_words).most_common(NUM_FEATURES)]

In [20]:
def build_features(words, feature_words=None):
    """Encode a tokenized tweet as a bag-of-words presence dictionary.

    Args:
        words: Iterable of hashable tokens (the words of one tweet).
        feature_words: Vocabulary to test against. When omitted, the
            module-level ``top_features`` list is used.

    Returns:
        A dict mapping each vocabulary word to ``True`` if it occurs in
        ``words`` and ``False`` otherwise, in vocabulary order.
    """
    if feature_words is None:
        feature_words = top_features
    # Build the lookup set once so each vocabulary word is a constant-time
    # membership test instead of a scan over the tweet's token list.
    present = set(words)
    return {w: (w in present) for w in feature_words}

In [21]:
# One (feature dict, polarity label) pair per tweet, in row order; this list
# is what gets split and passed to NaiveBayesClassifier.train.
featuresets = [
    (build_features(tokens), label)
    for tokens, label in zip(train['tweet'], train['polarity'])
]

In [22]:
from nltk import NaiveBayesClassifier
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled feature sets for evaluation. The fixed
# random_state makes the split, and therefore the reported accuracy,
# reproducible across re-runs.
train_set, test = train_test_split(featuresets, test_size=0.2, random_state=42)
# Fit NLTK's Naive Bayes classifier on the remaining 80%.
classifier = NaiveBayesClassifier.train(train_set)

In [23]:
# Accuracy of the trained classifier on the held-out test pairs.
# NOTE(review): top_features was computed from all sampled tweets before the
# train/test split, so test rows influenced feature selection — confirm
# whether that is acceptable for the reported score.
nltk.classify.accuracy(classifier, test)

0.7372