In [1]:
import time
import math
import os
import re
import string

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tag import pos_tag
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize, regexp_tokenize, TweetTokenizer

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from IPython.display import display, HTML

In [2]:
# Display configuration: let long tweet text render up to 100 characters per
# column and allow 100 characters of total output width.
pd.options.display.max_colwidth = 100
pd.options.display.width = 100

In [3]:
# Load the labelled tweet CSV, decoding it as ISO-8859-1.
df = pd.read_csv(
    'data/judge-1377884607_tweet_product_company.csv',
    encoding='ISO-8859-1',
)

In [4]:
# Drop rows that have no tweet text. Rebinding (instead of inplace=True) keeps
# the cell free of in-place mutation; the resulting frame is the same.
df = df.dropna(subset=['tweet_text'])


In [5]:
# Replace the verbose source column names with short ones used from here on.
df = df.rename(
    {
        'tweet_text': 'tweet',
        'emotion_in_tweet_is_directed_at': 'product',
        'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment',
    },
    axis='columns',
)

In [19]:
# Flag tweets whose text mentions Apple. The match ignores case so that
# spellings such as "apple" or "APPLE" (e.g. in hashtags) are flagged too.
# NOTE: this must run while df['tweet'] still holds raw strings; once the
# column has been replaced by token lists, .str.contains yields NaN for
# every row (which is what the saved output of the next cell shows).
df['apple'] = df['tweet'].str.contains('Apple', case=False)

In [20]:
# Preview the first rows of df.
# NOTE(review): the saved output shows tokenized tweets and an empty `apple`
# column, so this cell appears to have been last executed after the
# preprocessing cell (execution counts 19/20 vs 10) — re-run top to bottom.
df.head()

Unnamed: 0,tweet,product,sentiment,apple
0,"[g, iphone, hr, tweet, rise, austin, dead, need, upgrade, plugin, station]",iPhone,Negative emotion,
1,"[know, awesome, ipad, iphone, app, likely, appreciate, design, also, give, free]",iPad or iPhone App,Positive emotion,
2,"[wait, ipad, also, sale]",iPad,Positive emotion,
3,"[hope, year, festival, crashy, year, iphone, app]",iPad or iPhone App,Negative emotion,
4,"[great, stuff, fri, marissa, mayer, google, tim, reilly, tech, book, conference, matt, mullenweg...",Google,Positive emotion,


In [18]:
# Count how many tweets target each product within every sentiment label.
df.groupby('sentiment')['product'].value_counts()

sentiment                           product                        
I can't tell                        iPad                                 4
                                    Apple                                2
                                    Google                               1
                                    Other Google product or service      1
                                    iPhone                               1
Negative emotion                    iPad                               125
                                    iPhone                             103
                                    Apple                               95
                                    Google                              68
                                    iPad or iPhone App                  63
                                    Other Google product or service     47
                                    Android                              8
                                

In [16]:
# TODO: scratch cell the author flagged for deletion ("delete").
# Keeps only the rows labelled as having no emotion toward a brand or product.
is_neutral = df['sentiment'] == 'No emotion toward brand or product'
df_neutral = df.loc[is_neutral]


Unnamed: 0,tweet,product,sentiment
0,"[g, iphone, hr, tweet, rise, austin, dead, need, upgrade, plugin, station]",iPhone,Negative emotion
1,"[know, awesome, ipad, iphone, app, likely, appreciate, design, also, give, free]",iPad or iPhone App,Positive emotion
2,"[wait, ipad, also, sale]",iPad,Positive emotion
3,"[hope, year, festival, crashy, year, iphone, app]",iPad or iPhone App,Negative emotion
4,"[great, stuff, fri, marissa, mayer, google, tim, reilly, tech, book, conference, matt, mullenweg...",Google,Positive emotion


In [6]:
# Emoticon -> placeholder token table, built once when the cell runs instead
# of on every call.
_EMOTICON_MAPPING = {
    ':D': 'emoji_smile',
    ':)': 'emoji_smile',
    ':-D': 'emoji_smile',
    ':\'': 'emoji_unsure',
    ':p': 'emoji_tongue',
    ':P': 'emoji_tongue',
    ':(': 'emoji_sad',
    # Add more emoticons and their meanings as needed
}


def _emoticon_alternative(emoticon):
    """Return the regex fragment that matches one emoticon literally."""
    fragment = re.escape(emoticon)
    if emoticon[-1].isalpha():
        # An emoticon ending in a letter (":D", ":p", ...) directly followed by
        # another word character is part of ordinary text ("Update:Download",
        # "note:please"), not an emoticon, so refuse to match there.
        fragment += r'(?!\w)'
    return fragment


# Longest emoticons first so a longer one can never be shadowed by a shorter
# alternative if overlapping entries are added to the table later.
_EMOTICON_PATTERN = re.compile(
    '|'.join(
        _emoticon_alternative(emoticon)
        for emoticon in sorted(_EMOTICON_MAPPING, key=len, reverse=True)
    )
)


def replace_emoticons(text):
    """Replace text emoticons with word-like placeholder tokens.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        ``text`` with every emoticon listed in ``_EMOTICON_MAPPING`` replaced
        by its placeholder (for example ``':)'`` -> ``'emoji_smile'``).
        Emoticons that end in a letter are only replaced when they are not
        immediately followed by another word character.
    """
    return _EMOTICON_PATTERN.sub(
        lambda match: _EMOTICON_MAPPING[match.group(0)],
        text,
    )


In [7]:
# Swap emoticons for placeholder words in every tweet (element-wise).
df['tweet'] = df['tweet'].map(replace_emoticons)

In [8]:
# Function to lemmatize a single word
def lemmatize_word(word):
    """Return the WordNet lemma of ``word``.

    The word is POS-tagged on its own (as a one-element list), and the first
    letter of the resulting tag selects the WordNet part of speech:
    J -> adjective, V -> verb, N -> noun, R -> adverb. Any other tag falls
    back to noun.
    """
    # First letter of the tag returned by pos_tag -> WordNet POS constant.
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }

    token, tag = pos_tag([word])[0]
    wordnet_pos = pos_by_initial.get(tag[:1], wordnet.NOUN)

    return WordNetLemmatizer().lemmatize(token, pos=wordnet_pos)

In [9]:
# Function to preprocess tweet

# Stop-word set shared by every preprocess_tweet call; filled on first use so
# the NLTK stop-word list is loaded once rather than once per tweet.
_TWEET_STOP_WORDS = None


def _tweet_stop_words():
    """Return the stop-word set (NLTK English list plus tweet-specific extras)."""
    global _TWEET_STOP_WORDS
    if _TWEET_STOP_WORDS is None:
        additional_stopwords = {'w', 'u', 'amp', 'sxsw', 'rt'}  # amp = &
        _TWEET_STOP_WORDS = frozenset(stopwords.words('english')) | additional_stopwords
    return _TWEET_STOP_WORDS


def preprocess_tweet(tweet):
    """Clean one raw tweet and return its lemmatized, stop-word-free tokens.

    Steps, in order: strip URLs and @mentions, strip ``{link}`` placeholders,
    decode ``&quot;``, tighten whitespace around double quotes, lowercase,
    drop digits, turn punctuation and underscores into spaces, tokenize,
    remove stop words, lemmatize, and remove stop words again.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Lemmatized tokens with stop words removed.
    """
    # remove links and @mentions
    tweet = re.sub(r'http\S+|@\S+', '', tweet)

    # remove {link} from rows
    tweet = re.sub(r'\{link\}', '', tweet)

    # Replace &quot; with "
    tweet = tweet.replace('&quot;', '"')

    # Remove extra space between quotation mark and words
    tweet = re.sub(r'\s+"', '"', tweet)
    tweet = re.sub(r'"\s+', '"', tweet)

    # convert to lowercase
    tweet = tweet.lower()

    # remove numbers
    tweet = re.sub(r'\d+', '', tweet)

    # remove punctuation; underscores count as punctuation here, so a token
    # such as "emoji_smile" is split into "emoji" and "smile"
    tweet = re.sub(r'([^\w\s]|_)+', ' ', tweet)

    # tokenize
    tokens = word_tokenize(tweet)

    stop_words = _tweet_stop_words()

    # Drop surface-form stop words BEFORE lemmatizing: this avoids tagging and
    # lemmatizing words that are discarded anyway, and stops a stop word from
    # surviving because its lemma is no longer on the stop list.
    content_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize tokens
    tokens_lemmed = [lemmatize_word(token) for token in content_tokens]

    # Second pass: a lemma can itself be a stop word.
    tokenized_tweet = [word for word in tokens_lemmed if word not in stop_words]

    return tokenized_tweet

In [10]:
# Tokenize/clean every tweet and report the elapsed wall time.
# perf_counter is a monotonic clock intended for measuring durations, so the
# reading cannot be skewed by system clock adjustments.
# NOTE: this overwrites df['tweet'] (strings -> token lists); re-running the
# cell without reloading df fails because preprocess_tweet expects a string.
start = time.perf_counter()
df['tweet'] = df['tweet'].apply(preprocess_tweet)
end = time.perf_counter()
print(f'{end-start} seconds')

132.05321717262268 seconds


In [21]:
# Load the same CSV again into a separate frame. Unlike df, df2 keeps the
# original column names and the raw tweet text (df['tweet'] has been replaced
# by token lists above); rows with missing tweet text are not dropped here.
df2 = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding='ISO-8859-1')

In [23]:
# Keep only the tweets labelled with a negative emotion.
is_negative = df2['is_there_an_emotion_directed_at_a_brand_or_product'] == 'Negative emotion'
df_neg = df2.loc[is_negative]

In [25]:
# Number of negative tweets per targeted product/brand (missing targets are
# excluded by value_counts' default behaviour).
df_neg['emotion_in_tweet_is_directed_at'].value_counts()

iPad                               125
iPhone                             103
Apple                               95
Google                              68
iPad or iPhone App                  63
Other Google product or service     47
Android                              8
Android App                          8
Other Apple product or service       2
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [26]:
# Target labels that count as Apple products. Despite the name this is a list
# of labels (used with .isin), not a boolean mask.
apple_mask = [
    'iPad',
    'iPhone',
    'Apple',
    'iPad or iPhone App',
    'Other Apple product or service',
]

In [28]:
# Negative tweets whose target is one of the Apple labels in apple_mask.
targets_apple = df_neg['emotion_in_tweet_is_directed_at'].isin(apple_mask)
df_neg_apple = df_neg.loc[targets_apple]

In [29]:
# Preview the last rows of the negative, Apple-targeted tweets.
df_neg_apple.tail()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
8855,"If there was a popup store in Austin that sold nothing but iPhone battery extenders, it would ma...",iPhone,Negative emotion
8930,#iPad #news #apps not popular with the #kids. {link} #the_daily is a terrible concept anyway #sxsw,iPad or iPhone App,Negative emotion
8943,Hmmm...Taxi Magic on iPhone does not appear to be so magic any more in Austin #sxsw,iPad or iPhone App,Negative emotion
8981,I think my effing hubby is in line for an #iPad 2. Can someone point him towards the line-up for...,iPad,Negative emotion
9008,I'm pretty sure the panelist that thinks &quot;Apple is drowning in their success&quot; is fucki...,Apple,Negative emotion


In [30]:
# Render the whole df_neg_apple frame as an HTML table without the index
# column, so the tweet text can be read in full.
# NOTE(review): this dumps every row into the notebook output; consider
# .head()/.sample() if the output becomes too large.
display(HTML(df_neg_apple.to_html(index=False)))

tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion
@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
I just noticed DST is coming this weekend. How many iPhone users will be an hour late at SXSW come Sunday morning? #SXSW #iPhone,iPhone,Negative emotion
attending @mention iPad design headaches #sxsw {link},iPad,Negative emotion
What !?!? @mention #SXSW does not provide iPhone chargers?!? I've changed my mind about going next year!,iPhone,Negative emotion
"Seriously #sxsw? Did you do any testing on the mobile apps? Constant iPad crashes causing lost schedules, and no sync for WP7.",iPad or iPhone App,Negative emotion
ipad2 and #sxsw...a conflagration of doofusness. {link},iPad,Negative emotion
"You spent $1,000+ to come to SXSW. \n\nYou've already used iPad 1. \n\nThe wait is a couple city blocks. \n\nWhy? #ipad2 #SXSW {link}",iPad,Negative emotion
"I'm up to 2 iPad 2s seen in the wild. Both people say it is fast, but the still pics are terrible. #sxsw",iPad,Negative emotion
"If iPhone alarms botch the timechange, how many #SXSW'ers freak? Late to flights, missed panels, behind on bloody marys...",iPhone,Negative emotion


In [13]:
# Wrap each tokenized tweet in a TaggedDocument whose single tag is the
# tweet's position in df['tweet'] (0, 1, 2, ...).
# NOTE(review): rows were dropped from df earlier and no reset_index is
# visible, so these positional tags may differ from df's index labels —
# look tweets up with .iloc rather than .loc.
tweets = []
for position, tokens in enumerate(df['tweet']):
    tweets.append(TaggedDocument(words=tokens, tags=[position]))

In [15]:
# Initialize and train a Doc2Vec model on the tagged tweets: 50-dimensional
# vectors, context window of 2, every token kept (min_count=1), 40 epochs.
# perf_counter is a monotonic clock intended for measuring durations.
# NOTE(review): with workers=4 the learned vectors are presumably not
# bit-for-bit reproducible between runs — confirm against the gensim docs if
# reproducibility matters.
start = time.perf_counter()
model = Doc2Vec(tweets, vector_size=50, window=2, min_count=1, workers=4, epochs=40)
end = time.perf_counter()
print(f'{end-start} seconds')

32.814871311187744 seconds
