# Preprocessing and understanding the Dataset

In [23]:
pip install mechanize

Collecting mechanize
  Downloading mechanize-0.4.7-py2.py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 1.5 MB/s eta 0:00:01
Installing collected packages: mechanize
Successfully installed mechanize-0.4.7
Note: you may need to restart the kernel to use updated packages.


In [275]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from mechanize import Browser
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [276]:
# Load the labelled tweet dataset; the relative path is resolved against the
# current working directory of the kernel.
dataset = pd.read_csv('Apple-Twitter-Sentiment-DFE.csv')

In [277]:
# Sanity check: (rows, columns) of the raw frame.
dataset.shape

(3886, 12)

In [278]:
# Preview the first rows to see which columns are available.
dataset.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,sentiment,sentiment:confidence,date,id,query,sentiment_gold,text
0,623495513,True,golden,10,,3,0.6264,Mon Dec 01 19:30:03 +0000 2014,5.4e+17,#AAPL OR @Apple,3\nnot_relevant,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,623495514,True,golden,12,,3,0.8129,Mon Dec 01 19:43:51 +0000 2014,5.4e+17,#AAPL OR @Apple,3\n1,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2,623495515,True,golden,10,,3,1.0,Mon Dec 01 19:50:28 +0000 2014,5.4e+17,#AAPL OR @Apple,3,My cat only chews @apple cords. Such an #Apple...
3,623495516,True,golden,17,,3,0.5848,Mon Dec 01 20:26:34 +0000 2014,5.4e+17,#AAPL OR @Apple,3\n1,I agree with @jimcramer that the #IndividualIn...
4,623495517,False,finalized,3,12-12-2014 12:14,3,0.6474,Mon Dec 01 20:29:33 +0000 2014,5.4e+17,#AAPL OR @Apple,,Nobody expects the Spanish Inquisition #AAPL


In [323]:
# Keep only the label and the raw tweet text for the rest of the pipeline.
text_df = dataset.loc[:, ['sentiment', 'text']]
# Must be the last expression of the cell, otherwise the distinct label values
# are computed but never displayed.
dataset['sentiment'].unique()

We remove the tweets whose sentiment label is `not_relevant`.

In [324]:
# Drop tweets labelled as not relevant. The explicit .copy() makes the result
# an independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
text_df = text_df[text_df['sentiment'] != 'not_relevant'].copy()
text_df.shape

(3804, 2)

In [325]:
# Inspect the column dtypes before cleaning.
text_df.dtypes

sentiment    object
text         object
dtype: object

In [332]:
# Peek at the tweet text column (pandas truncates the display automatically).
text_df['text']

0       #AAPL:The 10 best Steve Jobs emails ever...htt...
1       RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2       My cat only chews @apple cords. Such an #Apple...
3       I agree with @jimcramer that the #IndividualIn...
4            Nobody expects the Spanish Inquisition #AAPL
                              ...                        
3881    (Via FC) Apple Is Warming Up To Social Media -...
3882    RT @MMLXIV: there is no avocado emoji may I as...
3883    @marcbulandr I could not agree more. Between @...
3884    My iPhone 5's photos are no longer downloading...
3885    RT @SwiftKey: We're so excited to be named to ...
Name: text, Length: 3804, dtype: object

1. Remove HTML tags, URLs, @mentions, the retweet marker (RT), the query terms AAPL and Apple, punctuation, and tokens containing digits from all the tweets.

In [404]:
def clean_text(text):
    """Normalise a raw tweet into lowercase text made of ASCII letters, spaces and tabs.

    Steps, in order:

    1. drop ``[...]`` spans, URLs, HTML tags and @mentions;
    2. drop the standalone retweet marker ``RT`` (any case);
    3. strip every character that is not an ASCII letter, digit, space or tab;
    4. drop every token that contains a digit;
    5. lowercase and remove the query terms ``aapl`` and ``apple``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Runs of whitespace are NOT collapsed and removed
        characters are not replaced by spaces, so words separated only by
        punctuation are merged (``Mini-Flash`` -> ``miniflash``).
    """
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Twitter handles may contain underscores; include them so that
    # "@jim_cramer" does not leave "cramer" behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Remove the retweet marker only as a whole word, and do it before the
    # punctuation strip so "RT:" is still recognised. A plain substring
    # removal of "rt" would corrupt words such as "smart" or "support".
    text = re.sub(r'\brt\b', '', text, flags=re.IGNORECASE)
    # This single pass removes all punctuation and newlines (anything outside
    # [0-9A-Za-z \t]) plus scheme://... URLs not caught above, so no separate
    # string.punctuation or "\n" pass is required.
    text = re.sub(r'[^0-9A-Za-z \t]|\w+://\S+', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = text.lower()
    # Substring (not whole-word) removal is intentional: the punctuation strip
    # can glue the query term onto a neighbour ("#AAPL:The" -> "aaplthe").
    text = text.replace('aapl', '').replace('apple', '')
    return text

In [405]:
# Clean every tweet; the function is passed directly, no lambda wrapper needed.
text_df['text'] = text_df['text'].apply(clean_text)

2. Tokenization

In [408]:
# Tokens are the runs of word characters matched by \w+.
tokenizer = RegexpTokenizer(r'\w+')

In [409]:
# Replace each cleaned string with its list of tokens.
text_df['text'] = text_df['text'].apply(tokenizer.tokenize)

In [410]:
# Inspect the token lists produced by the tokenizer.
text_df['text']

0                  [the, best, steve, jobs, emails, ever]
1           [why, stock, had, a, miniflash, crash, today]
2           [my, cat, only, chews, cords, such, an, snob]
3       [i, agree, with, that, the, individualinvestor...
4            [nobody, expects, the, spanish, inquisition]
                              ...                        
3881    [via, fc, is, warming, up, to, social, media, ...
3882    [there, is, no, avocado, emoji, may, i, ask, why]
3883    [i, could, not, agree, more, between, and, onl...
3884    [my, iphone, photos, are, no, longer, download...
3885    [were, so, excited, to, be, named, to, s, app,...
Name: text, Length: 3804, dtype: object

3. Removing stop words.

In [411]:
import string
# Cache of stop-word sets keyed by language, filled on first use.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def remove_stop_words(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single tweet.
    stop_words : container of str, optional
        Words to drop. Any object supporting ``in`` works; a set is
        recommended. Defaults to the NLTK English stop-word list.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        # Previously the corpus list was re-read and linearly scanned for
        # every single token; a cached frozenset makes each lookup O(1).
        stop_words = _english_stop_words()
    return [word for word in text if word not in stop_words]

In [412]:
# Filter the stop words out of every token list.
text_df['text'] = text_df['text'].apply(remove_stop_words)

4. Lemmatizing

In [417]:
# One shared lemmatizer instance, reused for every tweet.
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    """Lemmatize each token of ``text`` and return the results as a new list.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single tweet.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [419]:
# Lemmatize every token list.
text_df['text'] = text_df['text'].apply(lemmatize)

In [420]:
# Inspect the token lists after lemmatization.
text_df['text']

0                         [best, steve, job, email, ever]
1                        [stock, miniflash, crash, today]
2                                 [cat, chew, cord, snob]
3       [agree, individualinvestor, trade, extended, t...
4                 [nobody, expects, spanish, inquisition]
                              ...                        
3881    [via, fc, warming, social, medium, hiring, soc...
3882                           [avocado, emoji, may, ask]
3883    [could, agree, great, thing, happen, andibm, i...
3884    [iphone, photo, longer, downloading, automatic...
3885       [excited, named, app, store, best, list, year]
Name: text, Length: 3804, dtype: object