About this file

This is the sentiment140 dataset.
It contains 1,600,000 tweets extracted using the Twitter API. The tweets have been annotated (0 = negative, 2 = neutral, 4 = positive) and can be used to detect sentiment.
It contains the following 6 fields:

    target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
    ids: the id of the tweet (2087)
    date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
    flag: The query (lyx). If there is no query, then this value is NO_QUERY.
    user: the user that tweeted (robotickilldozr)
    text: the text of the tweet (Lyx is cool)

In [None]:
import pandas as pd

# Read the raw CSV as Latin-1 text. header=None: the first row is data, so
# the columns get default integer labels.
raw_csv_path = "training.1600000.processed.noemoticon.csv"
df_tmp = pd.read_csv(raw_csv_path, header=None, encoding="latin-1")

In [None]:
# Give the columns meaningful names. Relabelling a copy of the parsed frame
# (instead of rebuilding it from `df_tmp.values`) keeps each column's parsed
# dtype; going through `.values` on a mixed-type frame would turn every
# column into `object`.
df = df_tmp.copy()
df.columns = ["target", "ids", "date", "flag", "user", "text"]
df

In [None]:
# Peek at the raw text of the first ten tweets.
df['text'].head(10).values

In [None]:
# Work on a subset only: at most `nb_lines` tweets for each polarity label.
nb_lines = 1000

polarities = (0, 2, 4)
extract = pd.concat(
    [df[df['target'] == polarity].head(nb_lines) for polarity in polarities]
)

extract

# Test with no prior treatment

## Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Default settings: no custom tokenizer, stop-word list or n-gram range is
# passed here.
vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary and TF-IDF weights from the untreated tweets.
X = vectorizer.fit_transform(extract['text'].values)
# `get_feature_names()` was removed in scikit-learn 1.2;
# `get_feature_names_out()` is its replacement. `.tolist()` turns the returned
# array into a plain list so the whole vocabulary is printed.
print(vectorizer.get_feature_names_out().tolist())
print(X.shape)

In [None]:
# Vocabulary size. Uses `get_feature_names_out()` because
# `get_feature_names()` was removed in scikit-learn 1.2.
len(vectorizer.get_feature_names_out())

We observe:
- @username, that we can remove
- Useless numbers
- Abbreviations like 2mmorow
- Repeated characters like aaaaawwww

In my sample of 2000 tweets I got a vocabulary of ~5700 words; our goal is to reduce it to what is actually useful.

## Remove @ username

In [None]:
def process_tweet(x):
    """Lower-case a tweet and drop every token that starts with '@'.

    The text is split on whitespace, so the surviving tokens come back
    separated by single spaces.
    """
    tokens = x.lower().split()
    kept = [token for token in tokens if not token.startswith('@')]
    return ' '.join(kept)
    
# Preview the tweets after lower-casing and removal of '@' tokens.
without_mentions = extract['text'].apply(process_tweet)
without_mentions

## Remove URLs and punctuation, but keep emojis

Note: we keep the same `process_tweet` function and just add more functionality.

In [None]:
import re

In [None]:
# Maps text emoticons to a placeholder word carrying their polarity, so the
# sentiment they express survives punctuation removal.
emoji_dict = {
    ";D": "PositiveSmiley",
    ";-D": "PositiveSmiley",
    ":D": "PositiveSmiley",
    ":-D": "PositiveSmiley",
    "xD": "PositiveSmiley",
    ":)": "PositiveSmiley",
    ":')": "PositiveSmiley",
    ":-)": "PositiveSmiley",
    "D:": "NegativeSmiley",
    ":(": "NegativeSmiley",
    ":-(": "NegativeSmiley",
    # Crying face. The key previously carried a stray trailing quote
    # (":'('"), so the real emoticon was never matched.
    ":'(": "NegativeSmiley",
}

In [None]:
def process_tweet(x):
    """Clean one tweet.

    Emoticons listed in `emoji_dict` are replaced by their placeholder word,
    the text is lower-cased, tokens starting with '@' or containing "http"
    are dropped, and every character that is neither a word character nor
    whitespace is removed.
    """
    # Emoticons are swapped first, while their punctuation is still present.
    for emoticon, placeholder in emoji_dict.items():
        x = x.replace(emoticon, placeholder)

    kept = [
        token
        for token in x.lower().split()
        if not token.startswith('@') and "http" not in token
    ]
    return re.sub(r'[^\w\s]', "", ' '.join(kept))

print(extract['text'].values)
extract['text'].apply(process_tweet).values

# Remove repeating characters

In [None]:
def rm_multiple_chars(word):
    """Shorten every run of a repeated character in `word` to at most two.

    For example "aaawwwww" becomes "aaww", while "aww" is returned unchanged.
    """
    kept = []
    previous = None
    run_length = 0
    for char in word:
        if char == previous:
            run_length += 1
        else:
            previous = char
            run_length = 1
        # Only the first two characters of a run are kept.
        if run_length <= 2:
            kept.append(char)
    return "".join(kept)

# Try the helper on a few stretched-out words.
for sample in ("aaawwwww", "aww", "aaaaaarrrrrrgggggg"):
    print(rm_multiple_chars(sample))

In [None]:
def process_tweet(x):
    """Clean one tweet, additionally shortening repeated characters.

    Emoticons listed in `emoji_dict` are replaced by their placeholder word,
    the text is lower-cased, tokens starting with '@' or containing "http"
    are dropped, each remaining token goes through `rm_multiple_chars`, and
    finally every character that is neither a word character nor whitespace
    is removed.
    """
    for emoticon, placeholder in emoji_dict.items():
        x = x.replace(emoticon, placeholder)

    squeezed = [
        rm_multiple_chars(token)
        for token in x.lower().split()
        if not token.startswith('@') and "http" not in token
    ]
    return re.sub(r'[^\w\s]', "", ' '.join(squeezed))

print(extract['text'].values)
extract['text'].apply(process_tweet).values

# Lemmatize

In [None]:
import nltk
# Download the WordNet corpus; the WordNetLemmatizer used further down looks
# words up in it.
nltk.download('wordnet')

In [None]:
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()
 
# Quick check of the lemmatizer on a plural noun.
lemmatizer.lemmatize("rocks")

In [None]:
# The lemmatizer has to be initialized before calling 'process_tweet'.

lemmatizer = WordNetLemmatizer()

def process_tweet(x):
    """Normalize one tweet into a space-separated string of cleaned tokens.

    Emoticons listed in `emoji_dict` are replaced by their placeholder word
    and the text is lower-cased. Tokens starting with '@' or containing
    "http" are dropped. Each remaining token has punctuation and digits
    removed, repeated characters shortened by `rm_multiple_chars`, and is
    then lemmatized.

    Args:
        x: raw tweet text.

    Returns:
        The cleaned tweet. Tokens left empty by the cleaning are omitted.
    """
    # Emoticons are swapped first, while their punctuation is still present.
    for emoticon, placeholder in emoji_dict.items():
        x = x.replace(emoticon, placeholder)
    x = x.lower()

    new_sentence = []
    for word in x.split():
        # Mentions and URLs are detected on the raw token, before its
        # punctuation is stripped.
        if word.startswith('@') or "http" in word:
            continue
        # Strip punctuation and digits BEFORE lemmatizing: a token such as
        # "rocks!" is not a known word, so lemmatizing first would leave it
        # unchanged and the plural would survive.
        word = re.sub(r'[^\w\s]', "", word)
        word = re.sub(r'[0-9]+', "", word)
        if not word:
            # The token was made of punctuation/digits only.
            continue
        word = rm_multiple_chars(word)
        new_sentence.append(lemmatizer.lemmatize(word))
    return ' '.join(new_sentence)

print(extract['text'][:10].values)
print(extract['text'][:10].apply(process_tweet).values)

## Learn

In [None]:
# Run the cleaning function over every tweet of the subset.
processed_extract = extract['text'].apply(process_tweet)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a fresh vectorizer, this time on the cleaned tweets.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_extract.values)
print(X.shape)
# `get_feature_names()` was removed in scikit-learn 1.2;
# `get_feature_names_out()` is its replacement. `.tolist()` turns the returned
# array into a plain list so the whole vocabulary is printed.
print(vectorizer.get_feature_names_out().tolist())

In [None]:
# Labels as a plain Python list, in the same row order as the tweets.
y = extract['target'].tolist()
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.30,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default parameters, trained on the TF-IDF
# features of the training split.
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict once on the held-out split and reuse the result for both metrics.
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))

print(accuracy_score(y_test, y_pred))