In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [22]:
train = pd.read_csv('../training.1600000.processed.noemoticon.csv', encoding='iso-8859-1', header=None, names=['sentiment','id', 'date', 'query', 'user', 'text'])

In [23]:
train.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [24]:
train.drop(columns=['id','query','user','date'], axis=1, inplace=True)

In [25]:
train.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [26]:
from nltk.corpus import stopwords
stopwords.words('english')[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [27]:
import re
import string
from nltk.stem import PorterStemmer

#create an object of class PorterStemmer
porter = PorterStemmer()

edited_stop_words = [re.sub('[\']+', '',word) for word in stopwords.words('english')]

auxiliaryVerbs = ['do','does','did','has','have','had','should','must','can','could']
splitNegativeWords = re.compile(r'('+r'|'.join(auxiliaryVerbs)+r')n?\'t')

def prepare_tweet(tweet_text):
    # Removing Hashtags
    tweet_aux = re.sub('#[A-Za-z0-9_-]+', '', str.lower(tweet_text))
    # Removing mentions
    tweet_aux = re.sub('@[A-Za-z0-9_-]+', '', tweet_text)
    # Removing URLs
    tweet_aux =  re.sub('https?://[^ ]+', '', tweet_aux)
    tweet_aux =  re.sub('www.[^ ]+', '', tweet_aux)
    # Removing symbols and numbers
    tweet_aux = re.sub('[^A-Za-z \n]+', '', tweet_aux)
    ## Remove stopwords - Commented because resulted in a worse accuracy
    ## tweet_aux =  [word for word in tweet.split() if word not in edited_stop_words]
    tweet_aux = splitNegativeWords.sub("\\1 not", tweet_aux)
    # Stemming
    stem_tokens = []
    for token in tweet_aux.split():
        stem_tokens.append(porter.stem(token))
    return stem_tokens

In [28]:
import datetime
datetime.datetime.now()

datetime.datetime(2019, 2, 8, 11, 27, 55, 564396)

In [29]:
number_rows = 100000
train_100k = pd.concat([train[0:int(number_rows/2)].copy(),train[800000:800000+int(number_rows/2)].copy()])
train_100k['tokens'] = train_100k['text'].apply(prepare_tweet)

In [30]:
import functools
import collections
word_count = collections.Counter(functools.reduce(lambda a,b: a+b, train_100k['tokens']))

In [31]:
word_count_dict = dict(word_count)

In [32]:
datetime.datetime.now()

datetime.datetime(2019, 2, 8, 11, 43, 53, 850801)

In [33]:
[word for (word, count) in word_count.most_common(20)]

['to',
 'the',
 'I',
 'a',
 'it',
 'and',
 'my',
 'you',
 'is',
 'i',
 'in',
 'for',
 'of',
 'that',
 'have',
 'on',
 'me',
 'go',
 'but',
 'be']

In [34]:
# train_10k = train[0:10000].copy()
# train_10k['tokens'] = train_10k['text'].apply(prepare_tweet)

In [35]:
# import functools
# import collections
# word_count = collections.Counter(functools.reduce(lambda a,b: a+b, train_10k['tokens']))

In [36]:
# train_10k.head(20)

In [37]:
# [word for (word, count) in word_count.most_common(200)]

In [38]:
import csv

with open('word_count_100k.csv', 'w') as csvfile: 
    w = csv.DictWriter(csvfile, word_count_dict.keys())
    w.writeheader()
    w.writerow(word_count)

In [121]:
word_list = [word for (word, count) in word_count.most_common(200)]

In [122]:
word_list.sort()

In [123]:
word_list

['actual',
 'alreadi',
 'alway',
 'amp',
 'anoth',
 'anyth',
 'away',
 'babi',
 'back',
 'bad',
 'bed',
 'believ',
 'best',
 'better',
 'big',
 'bit',
 'bore',
 'call',
 'cant',
 'car',
 'clean',
 'cold',
 'come',
 'could',
 'cri',
 'damn',
 'day',
 'die',
 'done',
 'earli',
 'eat',
 'end',
 'even',
 'ever',
 'everyon',
 'exam',
 'feel',
 'final',
 'find',
 'finish',
 'first',
 'follow',
 'friend',
 'fuck',
 'fun',
 'game',
 'get',
 'girl',
 'give',
 'go',
 'gone',
 'gonna',
 'good',
 'got',
 'gotta',
 'great',
 'guess',
 'guy',
 'haha',
 'happen',
 'happi',
 'hard',
 'hate',
 'head',
 'headach',
 'hear',
 'help',
 'home',
 'homework',
 'hope',
 'hour',
 'hous',
 'hurt',
 'ill',
 'im',
 'ive',
 'keep',
 'know',
 'last',
 'late',
 'leav',
 'left',
 'let',
 'life',
 'like',
 'littl',
 'live',
 'lol',
 'long',
 'look',
 'lost',
 'lot',
 'love',
 'made',
 'make',
 'man',
 'mani',
 'may',
 'mayb',
 'mean',
 'miss',
 'mom',
 'monday',
 'morn',
 'mother',
 'movi',
 'much',
 'need',
 'never',


In [130]:
train_100k.to_csv('train_100k.csv', sep='\t', index=False)

In [142]:
train_test = pd.merge(train[0:10].copy(),train[800000:800010].copy())

In [143]:
train_test.head(20)

Unnamed: 0,sentiment,text


In [209]:
word_count

{'awww': 265,
 'that': 1825,
 'bummer': 74,
 'shoulda': 18,
 'got': 3738,
 'david': 73,
 'carr': 10,
 'third': 52,
 'day': 6652,
 'upset': 178,
 'cant': 3835,
 'updat': 563,
 'facebook': 295,
 'text': 256,
 'might': 563,
 'cri': 450,
 'result': 121,
 'school': 1723,
 'today': 4432,
 'also': 650,
 'blah': 96,
 'dive': 22,
 'mani': 584,
 'time': 4132,
 'ball': 101,
 'manag': 191,
 'save': 224,
 'rest': 390,
 'go': 8997,
 'bound': 22,
 'whole': 350,
 'bodi': 166,
 'feel': 3135,
 'itchi': 31,
 'like': 5069,
 'fire': 109,
 'behav': 23,
 'im': 10497,
 'mad': 205,
 'see': 3012,
 'crew': 42,
 'need': 2694,
 'hug': 282,
 'hey': 1109,
 'long': 1043,
 'ye': 1146,
 'rain': 969,
 'bit': 712,
 'lol': 3041,
 'fine': 262,
 'thank': 3528,
 'how': 196,
 'nope': 149,
 'que': 32,
 'muera': 1,
 'spring': 368,
 'break': 550,
 'plain': 29,
 'citi': 250,
 'snow': 299,
 'repierc': 1,
 'ear': 144,
 'bear': 61,
 'watch': 2550,
 'thought': 728,
 'ua': 4,
 'loss': 92,
 'embarrass': 30,
 'count': 141,
 'idk': 98,
 