In [122]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [98]:
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

True

In [99]:
df = pd.read_csv('https://github.com/murpi/wilddata/raw/master/quests/tweets.zip')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [100]:
# Conserve uniquement les tweets positifs et négatifs (donc tu exclus les neutrals). Quel est le pourcentage de tweets positifs/négatifs ?
df = df[df['sentiment'].isin(['negative', 'positive'])]
df.sentiment.value_counts(normalize=True)*100

positive    52.447595
negative    47.552405
Name: sentiment, dtype: float64

In [101]:
# créer la fonction clean
nlp = spacy.load('en_core_web_sm')
stopwordsenglish = stopwords.words('english')

def clean(sentence):
  tokens = word_tokenize(sentence)
  clean_tokens = [w for w in tokens if w.isalnum() and not w in stopwordsenglish]
  clean_sent = ' '.join(clean_tokens)
  lemma = nlp(clean_sent)
  lemma_tokens= [word.lemma_ for word in lemma]
  lemma_sentence = ' '.join(lemma_tokens)
  return lemma_sentence

In [102]:
clean("You are better when I am well.")

'-PRON- better -PRON- well'

In [103]:
df['clean'] = df['text'].apply(clean)
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,clean
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,Sooo SAD -PRON- miss San Diego
2,088c60f138,my boss is bullying me...,bullying me,negative,boss bullying
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son put release already buy
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,2 a.m. feeding baby fun smile coo


In [106]:
X = df['clean']
y= df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 32, train_size = 0.75)

# Countvectorizer

In [107]:
# Countvectorizer
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train_cv = vectorizer.transform(X_train)
X_test_cv = vectorizer.transform(X_test)


<4091x12460 sparse matrix of type '<class 'numpy.int64'>'
	with 27819 stored elements in Compressed Sparse Row format>

In [119]:
# Logistic Regression
modelLR = LogisticRegression(max_iter=120).fit(X_train_cv, y_train)

print('Accuracy score on the train set :', modelLR.score(X_train_cv, y_train))
print('Accuracy score on the test set :', modelLR.score(X_test_cv, y_test))  

Accuracy score on the train set : 0.9496414602346805
Accuracy score on the test set : 0.8587142507944268


In [121]:
# KNN 
modelKNN = KNeighborsClassifier().fit(X_train_cv, y_train)
print('Accuracy score on the train set :', modelKNN.score(X_train_cv, y_train))
print('Accuracy score on the test set :', modelKNN.score(X_test_cv, y_test))  

Accuracy score on the train set : 0.8151075619295959
Accuracy score on the test set : 0.7267171840625763


In [123]:
# Decistion tree
modelCTC = DecisionTreeClassifier().fit(X_train_cv, y_train)
print('Accuracy score on the train set :', modelCTC.score(X_train_cv, y_train))
print('Accuracy score on the test set :', modelCTC.score(X_test_cv, y_test))  

Accuracy score on the train set : 0.9992666232073012
Accuracy score on the test set : 0.80298215595209


# TF-IDF

In [125]:
# TF-IDF
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)
X_train_cv2 = vectorizer.transform(X_train)
X_test_cv2 = vectorizer.transform(X_test)

In [126]:
# Logistic Regression
modelLR = LogisticRegression(max_iter=120).fit(X_train_cv2, y_train)

print('Accuracy score on the train set :', modelLR.score(X_train_cv2, y_train))
print('Accuracy score on the test set :', modelLR.score(X_test_cv2, y_test))  

Accuracy score on the train set : 0.9240547588005215
Accuracy score on the test set : 0.8653141041310193


In [127]:
# KNN 
modelKNN = KNeighborsClassifier().fit(X_train_cv2, y_train)
print('Accuracy score on the train set :', modelKNN.score(X_train_cv2, y_train))
print('Accuracy score on the test set :', modelKNN.score(X_test_cv2, y_test))  

Accuracy score on the train set : 0.6882333767926988
Accuracy score on the test set : 0.5881202639941334


In [128]:
# Decistion tree
modelCTC = DecisionTreeClassifier().fit(X_train_cv2, y_train)
print('Accuracy score on the train set :', modelCTC.score(X_train_cv2, y_train))
print('Accuracy score on the test set :', modelCTC.score(X_test_cv2, y_test))  

Accuracy score on the train set : 0.9991851368970013
Accuracy score on the test set : 0.7980933757027622


Le meilleur score est celui de la logistic regression après TF-IDF.