# Sentiment analysis on tweets

In [1]:
import Preprocessing
import numpy as np

## example

Get the tweets posted by the `vmware` account that mention DELLEMC.

In [2]:
# Distinct tweet texts from the 'vmware' screen name whose text contains 'DELLEMC'.
query = (
    "SELECT DISTINCT text FROM tweet INNER JOIN user ON user_id = user.id "
    "WHERE screen_name = 'vmware' AND instr(text, 'DELLEMC');"
)
# Each result row is indexed at 0 to pull out the single selected column.
data = [row[0] for row in Preprocessing.execute(query)]
data[:3]

['RT @MichaelDell: #VxRail Hyperconverged Appliance dramatically simplifies path to Hybrid Cloud. Hyper Fast @DellTech @VMware @DellEMC https…',
 'RT @ElReg: Hyperconvergence to land @CommBank in win for @vmware and @dellEMC https://t.co/kIBjhi5IHJ https://t.co/2pZJCKpGNK',
 "Don't miss the packed lineup of general session speakers at #DellEMCWorld, including @PGelsinger on Tuesday at 10AM… https://t.co/JyPx148zOV"]

Remove `@`, `#`, the `RT` retweet marker and URLs.

In [3]:
import re

# Strip Twitter-specific markup so that only the plain wording is scored.
# The retweet marker is removed first and only as a standalone token:
# a plain str.replace("RT", "") would also mangle words such as "SMART".
w_data = [ re.sub(r'\bRT\b', '', t) for t in data ]
w_data = [ t.replace("@", "").replace("#", "") for t in w_data ]
# Raw string: '\(' inside a normal literal is an invalid escape sequence.
url = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
w_data = [ re.sub(url, '', t) for t in w_data ]
w_data[:3]

[' MichaelDell: VxRail Hyperconverged Appliance dramatically simplifies path to Hybrid Cloud. Hyper Fast DellTech VMware DellEMC https…',
 ' ElReg: Hyperconvergence to land CommBank in win for vmware and dellEMC  ',
 "Don't miss the packed lineup of general session speakers at DellEMCWorld, including PGelsinger on Tuesday at 10AM… "]

Use the TextBlob library to evaluate positivity and subjectivity.

In [4]:
from textblob import TextBlob

# Score the first three cleaned tweets; the sentiment value is unpacked into
# its two components, printed as positivity and subjectivity.
report = 'Tweet: {}\n Positivity: {}\n Subjectivity: {}\n'
for tweet in w_data[:3]:
    positivity, subjectivity = TextBlob(tweet).sentiment
    print(report.format(tweet, positivity, subjectivity))

Tweet:  MichaelDell: VxRail Hyperconverged Appliance dramatically simplifies path to Hybrid Cloud. Hyper Fast DellTech VMware DellEMC https…
 Positivity: 0.2
 Subjectivity: 0.6

Tweet:  ElReg: Hyperconvergence to land CommBank in win for vmware and dellEMC  
 Positivity: 0.8
 Subjectivity: 0.4

Tweet: Don't miss the packed lineup of general session speakers at DellEMCWorld, including PGelsinger on Tuesday at 10AM… 
 Positivity: 0.05000000000000002
 Subjectivity: 0.5



## Application on labels 

In [5]:
from langdetect import detect

# Each entry is [company, [partner, ...]]: the company is matched against
# the tweet author's screen name and every partner is searched for in the
# text of that company's tweets.
labels = [
    ["VMware", ["DELLEMC", "Delltech"]],
    ["Generalelectric", ["Shinola"]],
    ["MaerskLine", ["IBMblockchain"]],
    ["Alstom", ["hydrogenics", "thecosmocompany", "Frauschersensor"]],
    ["Intel", ["BMW", "Dell", "NASA"]],
    ["Salesforce", ["IBM", "Google", "Quip"]],
    ["Airliquidegroup", ["Wagaenergy", "ToyotaMotorcorp", "hypeTaxi", "AMA_SAfrance"]],
]

def sentiment_analysis_partner(company, partner):
    """Average sentiment of a company's English tweets that mention a partner.

    Parameters
    ----------
    company : str
        Value compared with ``screen_name`` in the SQL query.
    partner : str
        Substring that must occur in the tweet text (SQL ``instr``).

    Returns
    -------
    list
        ``[company, partner, number_of_tweets, mean_positivity,
        mean_subjectivity]``. Both means are ``nan`` when no tweet is kept.
    """
    # NOTE(review): the query is assembled by string concatenation, so both
    # arguments must be trusted values (they are hard-coded labels in this
    # notebook). Prefer bound parameters if Preprocessing.execute offers them.
    query =  "SELECT DISTINCT text FROM tweet INNER JOIN user ON user_id = user.id "
    query += "WHERE screen_name = '" + company +  "' AND instr(text, '" + partner + "');"
    rows = Preprocessing.execute(query)

    # Keep English tweets only. detect() can raise on text it cannot analyse
    # (e.g. empty or symbol-only tweets); skip those instead of aborting.
    english = []
    for row in rows:
        text = row[0]
        try:
            if detect(text) == "en":
                english.append(text)
        except Exception:
            continue

    # remove @, #, RT and urls
    # "RT" is dropped first and only as a standalone token, so that words
    # containing those two letters (e.g. "SMART") are left intact.
    url = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    data = []
    for text in english:
        text = re.sub(r'\bRT\b', '', text)
        text = text.replace("@", "").replace("#", "")
        data.append(re.sub(url, '', text))

    sent = [ TextBlob(t).sentiment for t in data ]
    if not sent:
        # np.mean([]) would also give nan, but with a RuntimeWarning.
        return [company, partner, 0, np.nan, np.nan]
    return [company, partner, len(data),
           np.mean([ s[0] for s in sent ]),
           np.mean([ s[1] for s in sent ])]

show results

In [6]:
import pandas as pd

# One summary row per (company, partner) pair declared in `labels`.
analysis = [
    sentiment_analysis_partner(company, partner)
    for company, partners in labels
    for partner in partners
]

pd.DataFrame(
    analysis,
    columns=["Company", "Partner", "Nb tweets", "Positivity (-1, 1)", "Subjectivity (0, 1)"],
)

Unnamed: 0,Company,Partner,Nb tweets,"Positivity (-1, 1)","Subjectivity (0, 1)"
0,VMware,DELLEMC,42,0.246074,0.416138
1,VMware,Delltech,20,0.318819,0.442222
2,Generalelectric,Shinola,3,0.1,0.366667
3,MaerskLine,IBMblockchain,6,0.2375,0.326389
4,Alstom,hydrogenics,3,0.548611,0.761111
5,Alstom,thecosmocompany,2,0.159091,0.238636
6,Alstom,Frauschersensor,2,0.0,0.0
7,Intel,BMW,9,0.229428,0.465438
8,Intel,Dell,7,0.259524,0.419048
9,Intel,NASA,9,0.086343,0.423611


## Show tweets debug

In [7]:
def show_tweets(company, partner):
    """Return the uncleaned English tweets of `company` whose text contains `partner`.

    Parameters
    ----------
    company : str
        Value compared with ``screen_name`` in the SQL query.
    partner : str
        Substring that must occur in the tweet text (SQL ``instr``).

    Returns
    -------
    list of str
        Tweet texts as returned by the query, restricted to those detected
        as English. No markup is stripped.
    """
    query =  "SELECT DISTINCT text FROM tweet INNER JOIN user ON user_id = user.id "
    query += "WHERE screen_name = '" + company + "' AND instr(text, '" + partner + "');"
    rows = Preprocessing.execute(query)

    # detect() can raise on text it cannot analyse (e.g. empty or
    # symbol-only tweets); skip such tweets rather than abort the listing.
    data = []
    for row in rows:
        text = row[0]
        try:
            if detect(text) == "en":
                data.append(text)
        except Exception:
            continue
    return data
    
# Display the uncleaned English tweets of Airliquidegroup that contain "AMA_SAfrance".
show_tweets("Airliquidegroup", "AMA_SAfrance")

[".@ubleam, @AMA_SAfrance, @Imag_ing, @ZelrosAI, it's just a snapshot of the #startups coming with us @VivaTech! ????… https://t.co/mEfuLv5oPC",
 ".@ubleam, @AMA_SAfrance, @Imag_ing, @Zelros, it's just a snapshot of the #startups coming with us @VivaTech! ????… https://t.co/yS7uVuYRQB"]

## Application on companies in general

In [8]:
# Distinct search terms with their first character dropped
# (presumably a leading '@' or '#' prefix — TODO confirm against the table).
companies = [
    row[0][1:]
    for row in Preprocessing.execute("SELECT DISTINCT searchterm FROM tweet;")
]

def sentiment_analysis_company(company):
    """Average sentiment of all English tweets posted by a company.

    Parameters
    ----------
    company : str
        Value compared with ``screen_name`` in the SQL query.

    Returns
    -------
    list
        ``[company, number_of_tweets, mean_positivity, mean_subjectivity]``.
        Both means are ``nan`` when no tweet is kept.
    """
    # NOTE(review): the query is assembled by string concatenation, so
    # `company` must be a trusted value. Prefer bound parameters if
    # Preprocessing.execute offers them.
    query =  "SELECT DISTINCT text FROM tweet INNER JOIN user ON user_id = user.id "
    query += "WHERE screen_name = '" + company +  "';"
    rows = Preprocessing.execute(query)

    # Keep English tweets only, running detection once per tweet. detect()
    # can raise on text it cannot analyse; such tweets are skipped. Only
    # Exception is caught so KeyboardInterrupt still stops a long run.
    english = []
    for row in rows:
        text = row[0]
        try:
            if detect(text) == "en":
                english.append(text)
        except Exception:
            continue

    # remove @, #, RT and urls
    # "RT" is dropped first and only as a standalone token, so that words
    # containing those two letters (e.g. "SMART") are left intact.
    url = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    data = []
    for text in english:
        text = re.sub(r'\bRT\b', '', text)
        text = text.replace("@", "").replace("#", "")
        data.append(re.sub(url, '', text))

    sent = [ TextBlob(t).sentiment for t in data ]
    if not sent:
        # np.mean([]) would also give nan, but with a RuntimeWarning.
        return [company, 0, np.nan, np.nan]
    return [company, len(data),
           np.mean([ s[0] for s in sent ]),
           np.mean([ s[1] for s in sent ])]

In [9]:
# One summary row per entry of `companies`.
analysis = list(map(sentiment_analysis_company, companies))

pd.DataFrame(
    analysis,
    columns=["Company", "Nb tweets", "Positivity (-1, 1)", "Subjectivity (0, 1)"],
)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Company,Nb tweets,"Positivity (-1, 1)","Subjectivity (0, 1)"
0,Intel,1142,0.157972,0.374836
1,Salesforce,2784,0.175995,0.347596
2,Adobe,1990,0.164533,0.376345
3,Capgemini,7987,0.138839,0.311408
4,Forrester,1813,0.128014,0.329178
5,Cisco,860,0.139769,0.310056
6,Generalelectric,415,0.111919,0.38748
7,Alstom,275,0.168452,0.319895
8,Oracle,2197,0.161576,0.296237
9,MaerskLine,377,0.168309,0.331255
