###### Se van a leer los archivos descargados (corpus) con tweets anotados en inglés y en español, para generar a partir de ellos un dataset para el entrenamiento de modelos, uno por idioma. Se generará un fichero csv con los tweets anotados en inglés, y otro csv para los tweets anotados en español.

In [1]:
# imports necesarios
import pandas as pd
import csv
import xml.etree.ElementTree as etree

from collections import Counter

from sklearn.preprocessing import LabelEncoder

#### Primero cargamos los datos de tweets con sentimiento en español, que vienen en ficheros xml.

Generamos unas funciones para tratar los archivos en formato xml, y devolver a partir de ellos dataframes.

In [2]:
# declaración de funciones para cargar los xml y generar dataframes con sus datos.
def cargar_xml(ruta_fichero, opcion):
    """Load a sentiment-annotated XML corpus into a two-column DataFrame.

    Parameters
    ----------
    ruta_fichero : str or file object
        Source handed to ``etree.parse``.
    opcion : str
        ``'A'`` reads each polarity from ``sentiments/polarity/value``;
        any other value reads it from ``sentiment/polarity/value``.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` (tweet content with newlines replaced by spaces)
        and ``sentiment`` (the polarity value as found in the XML). A root
        without children yields an empty frame that still has both columns.
    """
    root = etree.parse(ruta_fichero).getroot()

    # The location of the polarity node only depends on the option, so it is
    # resolved once instead of on every tweet.
    if opcion == 'A':
        polarity_path = 'sentiments/polarity/value'
    else:
        polarity_path = 'sentiment/polarity/value'

    data = []
    for tweet in root:
        content = tweet.find('content').text
        polarityValue = tweet.find(polarity_path).text
        data.append([content.replace('\n', ' '), polarityValue])

    # Passing the column names to the constructor (rather than assigning
    # df.columns afterwards) also works when no rows were collected.
    return pd.DataFrame(data, columns=['text', 'sentiment'])


# creamos otra función por ciertas diferencias que tienen algunos de los archivos xml cargados
def cargar_xml_politics(ruta_fichero):
    """Load an XML corpus whose tweets may carry several polarity nodes.

    For every tweet, the first ``sentiments/polarity`` node that has no
    ``entity`` child is used. Tweets without such a node are skipped.

    Parameters
    ----------
    ruta_fichero : str or file object
        Source handed to ``etree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` (tweet content with newlines replaced by spaces)
        and ``sentiment`` (the polarity value as found in the XML). If no
        tweet qualifies, the frame is empty but still has both columns.
    """
    root = etree.parse(ruta_fichero).getroot()
    data = []

    for tweet in root:
        content = tweet.find('content').text
        # Identity checks against None: an Element's truthiness/equality is
        # not a reliable way to test whether find() matched anything.
        aux = next(
            (e for e in tweet.findall('sentiments/polarity') if e.find('entity') is None),
            None,
        )
        if aux is not None:
            polarityValue = aux.find('value').text
            data.append([content.replace('\n', ' '), polarityValue])

    # Naming the columns in the constructor keeps this working when every
    # tweet was skipped and `data` is empty.
    return pd.DataFrame(data, columns=['text', 'sentiment'])

In [3]:
# Build one dataframe per Spanish XML corpus with the loader functions.
df_general_train = cargar_xml(ruta_fichero='./data/datasets/general-train-tagged-3l.xml', opcion='A')
df_general_test = cargar_xml(ruta_fichero='./data/datasets/general-test-tagged-3l.xml', opcion='A')
df_politics = cargar_xml_politics(ruta_fichero='./data/datasets/politics-test-tagged.xml')
df_intertass_dev = cargar_xml(ruta_fichero='./data/datasets/intertass-development-tagged.xml', opcion='B')
df_intertass_train = cargar_xml(ruta_fichero='./data/datasets/intertass-train-tagged.xml', opcion='B')

In [4]:
# Show the number of rows and columns of each generated dataframe.
# DataFrame.shape is already a (rows, columns) tuple, so it feeds both %d slots.
print("General train: num_rows: %d\tColumnas: %d\n" % df_general_train.shape)
print("General test: num_rows: %d\tColumnas: %d\n" % df_general_test.shape)
print("Politics: num_rows: %d\tColumnas: %d\n" % df_politics.shape)
print("Intertass development: num_rows: %d\tColumnas: %d\n" % df_intertass_dev.shape)
print("Intertass train: num_rows: %d\tColumnas: %d\n" % df_intertass_train.shape)

General train: num_rows: 7219	Columnas: 2

General test: num_rows: 60798	Columnas: 2

Politics: num_rows: 2448	Columnas: 2

Intertass development: num_rows: 506	Columnas: 2

Intertass train: num_rows: 1008	Columnas: 2



In [5]:
# Report, for each Spanish dataframe, whether any column holds missing values.
print("General train:\n%s\n" % (df_general_train.isna().any(),))
print("General test:\n%s\n" % (df_general_test.isna().any(),))
print("Politics:\n%s\n" % (df_politics.isna().any(),))
print("Intertass development:\n%s\n" % (df_intertass_dev.isna().any(),))
print("Intertass train:\n%s\n" % (df_intertass_train.isna().any(),))

General train:
text         False
sentiment    False
dtype: bool

General test:
text         False
sentiment    False
dtype: bool

Politics:
text         False
sentiment    False
dtype: bool

Intertass development:
text         False
sentiment    False
dtype: bool

Intertass train:
text         False
sentiment    False
dtype: bool



In [6]:
# Count how many tweets carry each sentiment value in every Spanish dataframe.
langcounter_general_train = Counter(df_general_train["sentiment"])
langcounter_general_test = Counter(df_general_test["sentiment"])
langcounter_politics = Counter(df_politics["sentiment"])
langcounter_intertass_dev = Counter(df_intertass_dev["sentiment"])
langcounter_intertass_train = Counter(df_intertass_train["sentiment"])

# Each line names the dataframe it describes; previously all five were
# labelled "train", which made the output impossible to tell apart.
print("Recuento datos general train: " , langcounter_general_train)
print("Recuento datos general test: " , langcounter_general_test)
print("Recuento datos politics: " , langcounter_politics)
print("Recuento datos intertass development: " , langcounter_intertass_dev)
print("Recuento datos intertass train: " , langcounter_intertass_train)

Recuento datos train:  Counter({'P': 2884, 'N': 2182, 'NONE': 1483, 'NEU': 670})
Recuento datos train:  Counter({'P': 22233, 'NONE': 21416, 'N': 15844, 'NEU': 1305})
Recuento datos train:  Counter({'NEU': 933, 'N': 681, 'P': 613, 'NONE': 221})
Recuento datos train:  Counter({'N': 219, 'P': 156, 'NEU': 69, 'NONE': 62})
Recuento datos train:  Counter({'N': 418, 'P': 318, 'NONE': 139, 'NEU': 133})


In [7]:
# Stack the five Spanish dataframes row-wise into a single one.
dataframes = [df_general_train, df_general_test, df_politics, df_intertass_dev, df_intertass_train]
df_result_spanish = pd.concat(objs=dataframes, axis=0)

print("Dataframe result: num_rows: %d\tColumnas: %d\n" % df_result_spanish.shape)

# Sentiment distribution of the combined dataframe.
langcounter_result = Counter(df_result_spanish["sentiment"])
print("Recuento datos: " , langcounter_result)

Dataframe result: num_rows: 71979	Columnas: 2

Recuento datos:  Counter({'P': 26204, 'NONE': 23321, 'N': 19344, 'NEU': 3110})


Como se puede apreciar, los datos vienen con 4 valores posibles para el sentimiento: negativo, neutral, positivo y sin sentimiento. Nos vamos a quedar con los valores de negativo, neutro y positivo, que serán las 3 clases que tendremos en cuenta para entrenar los modelos y predecir con ellos, así que vamos a eliminar los registros con sentimiento == 'NONE'.

In [8]:
# Keep only the rows whose sentiment is something other than 'NONE'.
df_result_spanish = df_result_spanish.loc[df_result_spanish['sentiment'] != 'NONE']

In [9]:
# Another way to see how the rows are distributed across sentiment values.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
df_result_spanish['sentiment'].value_counts()

P      26204
N      19344
NEU     3110
Name: sentiment, dtype: int64

In [10]:
# Quick look at the data once the 'NONE' rows are gone.
df_result_spanish.head(10)

Unnamed: 0,text,sentiment
1,@PauladeLasHeras No te libraras de ayudar me/n...,NEU
2,@marodriguezb Gracias MAR,P
3,"Off pensando en el regalito Sinde, la que se v...",N
4,Conozco a alguien q es adicto al drama! Ja ja ...,P
6,Toca @crackoviadeTV3 . Grabación dl especial N...,P
8,Buen día todos! Lo primero mandar un abrazo gr...,P
9,Desde el escaño. Todo listo para empezar #endi...,P
10,Bdías. EM no se ira de puente. Si vosotros os ...,P
11,Un sistema económico q recorta dinero para pre...,P
12,#programascambiados caca d ajuste,N


In [11]:
# Turn the sentiment labels into integers, which are easier to feed to the models.
# LabelEncoder numbers the classes in sorted order, so with the labels
# 'N', 'NEU' and 'P' the result is: negative = 0, neutral = 1, positive = 2.
labelencoder = LabelEncoder().fit(df_result_spanish["sentiment"])

# df_result_spanish is a filtered selection of the concatenated frame, so the
# encoded column is attached with assign(), which returns a new frame instead
# of writing into that selection (avoids pandas' SettingWithCopyWarning).
df_result_spanish = df_result_spanish.assign(
    sentiment=labelencoder.transform(df_result_spanish["sentiment"])
)

df_result_spanish.head(10)

Unnamed: 0,text,sentiment
1,@PauladeLasHeras No te libraras de ayudar me/n...,1
2,@marodriguezb Gracias MAR,2
3,"Off pensando en el regalito Sinde, la que se v...",0
4,Conozco a alguien q es adicto al drama! Ja ja ...,2
6,Toca @crackoviadeTV3 . Grabación dl especial N...,2
8,Buen día todos! Lo primero mandar un abrazo gr...,2
9,Desde el escaño. Todo listo para empezar #endi...,2
10,Bdías. EM no se ira de puente. Si vosotros os ...,2
11,Un sistema económico q recorta dinero para pre...,2
12,#programascambiados caca d ajuste,0


In [12]:
# Write the final CSV of sentiment-annotated Spanish tweets used to train the models
# (index=False leaves the dataframe index out of the file).
df_result_spanish.to_csv('./data/df_result_spanish.csv', index=False)

#### Ahora vamos a cargar los datos de tweets con sentimiento en inglés, que vienen en varios ficheros csv.

Cargamos dos datasets con conjuntos de tweets anotados en inglés: uno extraído de https://www.crowdflower.com/wp-content/uploads/2016/07/text_emotion.csv y otro con datos sobre aerolíneas extraído de https://www.kaggle.com/crowdflower/twitter-airline-sentiment

In [13]:
# Load the two datasets (comma is read_csv's default separator).
df_text_emotion = pd.read_csv('./data/datasets/text_emotion.csv')
df_airline = pd.read_csv('./data/datasets/tweets_airlines.csv')

# DataFrame.shape is a (rows, columns) tuple, so it fills both %d slots.
print("Text emotion: num_rows: %d\tColumnas: %d\n" % df_text_emotion.shape)
print("Tweets airline: num_rows: %d\tColumnas: %d\n" % df_airline.shape)

Text emotion: num_rows: 40000	Columnas: 4

Tweets airline: num_rows: 14640	Columnas: 15



In [14]:
# Preview the first rows of the text-emotion dataset.
df_text_emotion.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [15]:
# Preview the first rows of the airline dataset.
df_airline.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [16]:
# List the distinct values of the sentiment column in both datasets.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset text emotion:\n",df_text_emotion['sentiment'].value_counts())
print("\nValores dataset airline:\n",df_airline['airline_sentiment'].value_counts())

Valores dataset text emotion:
 neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

Valores dataset airline:
 negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64


Como vemos, el sentimiento viene identificado por varios valores en el dataset de text emotion, por lo tanto vamos a agrupar el atributo sentimiento en valores positivos, negativos y neutros, de la siguiente forma:
- Positivo = [entusiasmo, amor, diversion, felicidad, alivio]
- Negativo = [vacio, tristeza, preocupacion, odio, sorpresa, aburrimiento, enfado]
- Neutral = [neutral]

In [17]:
# Same numeric coding as the Spanish dataset: positive = 2, neutral = 1, negative = 0.
# One explicit lookup table replaces the if/if/elif chain evaluated row by row.
sentiment_to_number = {
    'enthusiasm': 2, 'love': 2, 'fun': 2, 'happiness': 2, 'relief': 2,
    'neutral': 1,
    'empty': 0, 'sadness': 0, 'worry': 0, 'hate': 0, 'surprise': 0,
    'boredom': 0, 'anger': 0,
}

review = df_text_emotion['sentiment'].map(sentiment_to_number)

# A label missing from the table used to be skipped silently, which only
# surfaced later as a length mismatch; report the offending labels instead.
if review.isna().any():
    raise ValueError("Valores de sentimiento no reconocidos: %s"
                     % df_text_emotion.loc[review.isna(), 'sentiment'].unique().tolist())

df_text_emotion["sentiment_number"] = review.astype('int64')

In [18]:
# Preview the dataset with the new numeric sentiment column.
df_text_emotion.head()

Unnamed: 0,tweet_id,sentiment,author,content,sentiment_number
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,0
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,0
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,0
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,2
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,1


In [19]:
# Use LabelEncoder to give numeric values to the airline sentiment column
# (fit and transform performed in a single call on the same column).
labelencoder = LabelEncoder()
df_airline["airline_sentiment"] = labelencoder.fit_transform(df_airline["airline_sentiment"])

df_airline.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,1,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,2,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,1,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,0,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,0,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [20]:
# Keep only the two columns of interest and give them the common names
# 'text' and 'sentiment'.
df_text_emotion = df_text_emotion[['content', 'sentiment_number']].rename(
    columns={'content': 'text', 'sentiment_number': 'sentiment'}
)
df_text_emotion.head()

Unnamed: 0,text,sentiment
0,@tiffanylue i know i was listenin to bad habi...,0
1,Layin n bed with a headache ughhhh...waitin o...,0
2,Funeral ceremony...gloomy friday...,0
3,wants to hang out with friends SOON!,2
4,@dannycastillo We want to trade with someone w...,1


In [21]:
# Keep only the columns of interest from the airline dataset, renamed to the
# common 'text' / 'sentiment' pair.
df_airline = df_airline[['text', 'airline_sentiment']].rename(
    columns={'airline_sentiment': 'sentiment'}
)
df_airline.head()

Unnamed: 0,text,sentiment
0,@VirginAmerica What @dhepburn said.,1
1,@VirginAmerica plus you've added commercials t...,2
2,@VirginAmerica I didn't today... Must mean I n...,1
3,@VirginAmerica it's really aggressive to blast...,0
4,@VirginAmerica and it's a really big bad thing...,0


#### Cargamos otros 5 datasets con tweets en inglés que se han descargado desde internet.

En cada uno de los 5 datasets vemos primero los datos que contiene, nos quedamos con las columnas del texto y del sentimiento en cada uno que son las que nos interesan.

En cada dataset debemos terminar teniendo únicamente los siguientes valores posibles en la columna 'sentiment': 0, 1 o 2 (negativo, neutral, positivo).

In [22]:
# Load the five additional datasets (comma is read_csv's default separator).
df_apple = pd.read_csv('./data/datasets/full-corpus.csv')
df_reviews = pd.read_csv('./data/datasets/testdata.manual.2009.06.14.csv', header=None)
df_general = pd.read_csv('./data/datasets/train.csv', encoding="ISO-8859-1")
df_reviews_full = pd.read_csv(
    './data/datasets/training.1600000.processed.noemoticon.csv',
    encoding="ISO-8859-1",
    header=None,
)
df_weather = pd.read_csv('./data/datasets/weather_agg_dfe.csv')

# DataFrame.shape is a (rows, columns) tuple, so it fills both %d slots.
print("Opiniones Apple: num_rows: %d\tColumnas: %d\n" % df_apple.shape)
print("Datos reviews: num_rows: %d\tColumnas: %d\n" % df_reviews.shape)
print("Dataframe general: num_rows: %d\tColumnas: %d\n" % df_general.shape)
print("Datos reviews full: num_rows: %d\tColumnas: %d\n" % df_reviews_full.shape)
print("Datos wheater: num_rows: %d\tColumnas: %d\n" % df_weather.shape)

Opiniones Apple: num_rows: 5113	Columnas: 5

Datos reviews: num_rows: 498	Columnas: 6

Dataframe general: num_rows: 99989	Columnas: 3

Datos reviews full: num_rows: 1600000	Columnas: 6

Datos wheater: num_rows: 1000	Columnas: 10



Vamos a ver los primeros registros de cada dataframe cargado anteriormente, y nos quedaremos en cada uno con las columnas que nos interesan: texto y sentimiento.

Además, veremos para cada uno el número de registros de cada clase de sentimiento, y haremos los cambios necesarios para acabar teniendo cada dataframe solo con valores de sentimiento 0, 1 y 2 (negativo, neutro y positivo).

In [23]:
# Preview the first rows of the Apple dataset.
df_apple.head()

Unnamed: 0,Topic,Sentiment,TweetId,TweetDate,TweetText
0,apple,positive,126415614616154112,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,126404574230740992,Tue Oct 18 21:09:33 +0000 2011,@Apple will be adding more carrier support to ...
2,apple,positive,126402758403305474,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,126397179614068736,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
4,apple,positive,126395626979196928,Tue Oct 18 20:34:00 +0000 2011,I just realized that the reason I got into twi...


In [24]:
# Keep the tweet text and its sentiment under the common column names.
df_apple = df_apple[['TweetText', 'Sentiment']].rename(
    columns={'TweetText': 'text', 'Sentiment': 'sentiment'}
)
df_apple.head()

Unnamed: 0,text,sentiment
0,Now all @Apple has to do is get swype on the i...,positive
1,@Apple will be adding more carrier support to ...,positive
2,Hilarious @youtube video - guy does a duet wit...,positive
3,@RIM you made it too easy for me to switch to ...,positive
4,I just realized that the reason I got into twi...,positive


In [25]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset:\n",df_apple['sentiment'].value_counts())

Valores dataset:
 neutral       2333
irrelevant    1689
negative       572
positive       519
Name: sentiment, dtype: int64


In [26]:
# Drop the rows labelled 'irrelevant', which do not apply to this task.
# The explicit copy makes the result an independent frame, so writing to its
# columns later does not raise pandas' SettingWithCopyWarning.
df_apple = df_apple[df_apple['sentiment']!='irrelevant'].copy()

# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset:\n",df_apple['sentiment'].value_counts())

Valores dataset:
 neutral     2333
negative     572
positive     519
Name: sentiment, dtype: int64


In [27]:
# Same numeric coding as the other datasets: positive = 2, neutral = 1, negative = 0.
# One explicit lookup table replaces the if/if/elif chain evaluated row by row.
sentiment_to_number = {'negative': 0, 'neutral': 1, 'positive': 2}

review = df_apple['sentiment'].map(sentiment_to_number)

# A label missing from the table used to be skipped silently, which only
# surfaced later as a length mismatch; report the offending labels instead.
if review.isna().any():
    raise ValueError("Valores de sentimiento no reconocidos: %s"
                     % df_apple.loc[review.isna(), 'sentiment'].unique().tolist())

# assign() returns a new frame, so nothing is written into a filtered slice.
df_apple = df_apple.assign(sentiment=review.astype('int64'))

df_apple.head()

Unnamed: 0,text,sentiment
0,Now all @Apple has to do is get swype on the i...,2
1,@Apple will be adding more carrier support to ...,2
2,Hilarious @youtube video - guy does a duet wit...,2
3,@RIM you made it too easy for me to switch to ...,2
4,I just realized that the reason I got into twi...,2


In [28]:
# Preview the first rows of the df_reviews dataframe.
df_reviews.head()

Unnamed: 0,0,1,2,3,4,5
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [29]:
# The file was read with header=None, so the columns are numbered 0-5;
# give them readable names that are easier to work with.
df_reviews.columns = ['sentiment','id','date','topic','user','text']

In [30]:
# Keep only the tweet text and its sentiment.
df_reviews = df_reviews.loc[:, ['text', 'sentiment']]
df_reviews.head()

Unnamed: 0,text,sentiment
0,@stellargirl I loooooooovvvvvveee my Kindle2. ...,4
1,Reading my kindle2... Love it... Lee childs i...,4
2,"Ok, first assesment of the #kindle2 ...it fuck...",4
3,@kenburbary You'll love your Kindle2. I've had...,4
4,@mikefish Fair enough. But i have the Kindle2...,4


In [31]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset:\n",df_reviews['sentiment'].value_counts())

Valores dataset:
 4    182
0    177
2    139
Name: sentiment, dtype: int64


In [32]:
# Same numeric coding as the other datasets: positive = 2, neutral = 1, negative = 0.
# This dataset uses 4 / 2 / 0 for positive / neutral / negative.
sentiment_to_number = {0: 0, 2: 1, 4: 2}

review = df_reviews['sentiment'].map(sentiment_to_number)

# A value missing from the table used to be skipped silently, which only
# surfaced later as a length mismatch; report the offending values instead.
if review.isna().any():
    raise ValueError("Valores de sentimiento no reconocidos: %s"
                     % df_reviews.loc[review.isna(), 'sentiment'].unique().tolist())

# assign() returns a new frame, so nothing is written into a column selection.
df_reviews = df_reviews.assign(sentiment=review.astype('int64'))

df_reviews.head()

Unnamed: 0,text,sentiment
0,@stellargirl I loooooooovvvvvveee my Kindle2. ...,2
1,Reading my kindle2... Love it... Lee childs i...,2
2,"Ok, first assesment of the #kindle2 ...it fuck...",2
3,@kenburbary You'll love your Kindle2. I've had...,2
4,@mikefish Fair enough. But i have the Kindle2...,2


In [33]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset:\n",df_reviews['sentiment'].value_counts())

Valores dataset:
 2    182
0    177
1    139
Name: sentiment, dtype: int64


In [34]:
# Preview the first rows of the next dataframe.
df_general.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [35]:
# Keep the tweet text and its sentiment under the common column names.
df_general = df_general[['SentimentText', 'Sentiment']].rename(
    columns={'SentimentText': 'text', 'Sentiment': 'sentiment'}
)
df_general.head()

Unnamed: 0,text,sentiment
0,is so sad for my APL frie...,0
1,I missed the New Moon trail...,0
2,omg its already 7:30 :O,1
3,.. Omgaga. Im sooo im gunna CRy. I'...,0
4,i think mi bf is cheating on me!!! ...,0


In [36]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset:\n",df_general['sentiment'].value_counts())

Valores dataset:
 1    56457
0    43532
Name: sentiment, dtype: int64


In [37]:
# Same numeric coding as the other datasets: positive = 2, neutral = 1, negative = 0.
# This dataset only has 1 (positive) and 0 (negative).
sentiment_to_number = {0: 0, 1: 2}

# Vectorised lookup instead of a Python-level loop over every row.
review = df_general['sentiment'].map(sentiment_to_number)

# A value missing from the table used to be skipped silently, which only
# surfaced later as a length mismatch; report the offending values instead.
if review.isna().any():
    raise ValueError("Valores de sentimiento no reconocidos: %s"
                     % df_general.loc[review.isna(), 'sentiment'].unique().tolist())

# assign() returns a new frame, so nothing is written into a column selection.
df_general = df_general.assign(sentiment=review.astype('int64'))

df_general.head()

Unnamed: 0,text,sentiment
0,is so sad for my APL frie...,0
1,I missed the New Moon trail...,0
2,omg its already 7:30 :O,2
3,.. Omgaga. Im sooo im gunna CRy. I'...,0
4,i think mi bf is cheating on me!!! ...,0


In [38]:
# Preview the first rows of the df_reviews_full dataframe.
df_reviews_full.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [39]:
# The file was read with header=None, so the columns are numbered 0-5;
# give them readable names that are easier to work with.
df_reviews_full.columns = ['sentiment','id','date','topic','user','text']

In [40]:
# Keep only the tweet text and its sentiment.
df_reviews_full = df_reviews_full.loc[:, ['text', 'sentiment']]
df_reviews_full.head()

Unnamed: 0,text,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [41]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset:\n",df_reviews_full['sentiment'].value_counts())

Valores dataset:
 4    800000
0    800000
Name: sentiment, dtype: int64


In [42]:
# Positive rows (4) become 2 and negative rows (0) stay 0.
sentiment_to_number = {0: 0, 4: 2}

# Vectorised lookup instead of a Python-level loop over every row of this
# large frame.
review = df_reviews_full['sentiment'].map(sentiment_to_number)

# A value missing from the table used to be skipped silently, which only
# surfaced later as a length mismatch; report the offending values instead.
if review.isna().any():
    raise ValueError("Valores de sentimiento no reconocidos: %s"
                     % df_reviews_full.loc[review.isna(), 'sentiment'].unique().tolist())

# assign() returns a new frame, so nothing is written into a column selection.
df_reviews_full = df_reviews_full.assign(sentiment=review.astype('int64'))

df_reviews_full.head()

Unnamed: 0,text,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [43]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset:\n",df_reviews_full['sentiment'].value_counts())

Valores dataset:
 2    800000
0    800000
Name: sentiment, dtype: int64


In [44]:
# Preview the first rows of df_weather.
df_weather.head()

Unnamed: 0,unit_id,canary,unit_state,trusted_judgments,last_judgment_at,what_emotion_does_the_author_express_specifically_about_the_weather,what_emotion_does_the_author_express_specifically_about_the_weather_confidence,gold_answer,tweet_id,tweet_text
0,314960380,,finalized,20,2013-08-24T00:21:00,Positive,0.8439,,81990560,Grilling kabobs on the grill last night was am...
1,314960381,,finalized,20,2013-08-24T00:49:00,Negative,0.6963,,84314377,The slowest day ever !! And the weather makes ...
2,314960382,,finalized,20,2013-08-24T00:55:00,Neutral / author is just sharing information,0.8802,,82846118,Fire Weather Watch issued May 17 at 4:21PM CDT...
3,314960383,,finalized,20,2013-08-24T00:48:00,Positive,0.6897,,82843785,Im going to lunch early today. The weather i...
4,314960384,,finalized,20,2013-08-24T01:19:00,Neutral / author is just sharing information,0.6153,,82840144,Weekend Weather Causes Delays In I-270 Bridge ...


In [45]:
# Replace the long original headers with short, readable column names
# (the emotion question becomes 'sentiment' and tweet_text becomes 'text').
df_weather.columns = ['id','canary','state','trusted','date','sentiment','value','answer','tweet_id','text']

In [46]:
# Keep only the tweet text and its sentiment.
df_weather = df_weather.loc[:, ['text', 'sentiment']]
df_weather.head()

Unnamed: 0,text,sentiment
0,Grilling kabobs on the grill last night was am...,Positive
1,The slowest day ever !! And the weather makes ...,Negative
2,Fire Weather Watch issued May 17 at 4:21PM CDT...,Neutral / author is just sharing information
3,Im going to lunch early today. The weather i...,Positive
4,Weekend Weather Causes Delays In I-270 Bridge ...,Neutral / author is just sharing information


In [47]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset:\n",df_weather['sentiment'].value_counts())

Valores dataset:
 Negative                                        271
Neutral / author is just sharing information    261
Tweet not related to weather condition          235
Positive                                        231
I can't tell                                      2
Name: sentiment, dtype: int64


In [48]:
# Translate the sentiment column to the coding used by the other datasets:
# 0 == negative, 1 == neutral, 2 == positive.
# The two labels that carry no usable sentiment get the placeholder 3.
sentiment_to_number = {
    'Negative': 0,
    'Neutral / author is just sharing information': 1,
    'Positive': 2,
    'Tweet not related to weather condition': 3,
    "I can't tell": 3,
}

review = df_weather['sentiment'].map(sentiment_to_number)

# A label missing from the table used to be skipped silently, which only
# surfaced later as a length mismatch; report the offending labels instead.
if review.isna().any():
    raise ValueError("Valores de sentimiento no reconocidos: %s"
                     % df_weather.loc[review.isna(), 'sentiment'].unique().tolist())

# assign() returns a new frame, so nothing is written into a column selection.
df_weather = df_weather.assign(sentiment=review.astype('int64'))

df_weather.head()

Unnamed: 0,text,sentiment
0,Grilling kabobs on the grill last night was am...,2
1,The slowest day ever !! And the weather makes ...,0
2,Fire Weather Watch issued May 17 at 4:21PM CDT...,1
3,Im going to lunch early today. The weather i...,2
4,Weekend Weather Causes Delays In I-270 Bridge ...,1


In [49]:
# Drop the rows coded as 3: irrelevant or undetermined sentiment is not used.
# The explicit copy makes the result an independent frame rather than a
# selection of the previous one.
df_weather = df_weather[df_weather['sentiment']!=3].copy()

# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset:\n",df_weather['sentiment'].value_counts())

Valores dataset:
 0    271
1    261
2    231
Name: sentiment, dtype: int64


#### Una vez tenemos todos los dataframes de datos en inglés, con el texto y sentimiento de cada tweet, vamos a concatenar los 7 dataframes, y generar el dataframe final que será el que guardemos como archivo csv y posteriormente nos servirá para entrenar los modelos de aprendizaje supervisado.

In [50]:
# Check every English dataframe for NaN values.
# Each label names the dataframe being printed; the previous labels had been
# copied from the Spanish datasets and did not match the frames shown.
print("Airline:\n%s\n" % (df_airline.isna().any()) )
print("Text emotion:\n%s\n" % (df_text_emotion.isna().any()) )
print("Apple:\n%s\n" % (df_apple.isna().any()) )
print("Reviews:\n%s\n" % (df_reviews.isna().any()) )
print("General:\n%s\n" % (df_general.isna().any()) )
print("Reviews full:\n%s\n" % (df_reviews_full.isna().any()) )
print("Weather:\n%s\n" % (df_weather.isna().any()) )

General train:
text         False
sentiment    False
dtype: bool

General test:
text         False
sentiment    False
dtype: bool

Politics:
text         False
sentiment    False
dtype: bool

Intertass development:
text         False
sentiment    False
dtype: bool

Intertass train:
text         False
sentiment    False
dtype: bool

Intertass development:
text         False
sentiment    False
dtype: bool

Intertass train:
text         False
sentiment    False
dtype: bool



In [51]:
# As a precaution, drop rows with empty or NaN values.
# dropna() returns a new dataframe and leaves the original untouched, so the
# result has to be assigned back for the rows to actually be removed.
df_airline = df_airline.dropna()
df_text_emotion = df_text_emotion.dropna()
df_apple = df_apple.dropna()
df_reviews = df_reviews.dropna()
df_general = df_general.dropna()
df_reviews_full = df_reviews_full.dropna()
df_weather = df_weather.dropna()

# Keep a preview of the last cleaned frame as the cell output.
df_weather.head(10)

Unnamed: 0,text,sentiment
0,Grilling kabobs on the grill last night was am...,2
1,The slowest day ever !! And the weather makes ...,0
2,Fire Weather Watch issued May 17 at 4:21PM CDT...,1
3,Im going to lunch early today. The weather i...,2
4,Weekend Weather Causes Delays In I-270 Bridge ...,1
5,Passing out now. working tonight. Storms toda...,2
6,US GAS: Warm-Weather Forecasts Lift Natural Ga...,1
7,@mention s friday at 6 at tha stadium ...if th...,0
8,I hate this weather. Good day for a movie mara...,0
9,TGif biatches! Here's to a sunny Friday regard...,2


In [52]:
# Concatenate the seven English dataframes into the final one.
dataframes_english = [df_airline, df_text_emotion, df_apple, df_reviews, df_general, df_reviews_full, df_weather ]
df_result_english = pd.concat(dataframes_english)

print("Dataframe english result: num_rows: %d\tColumnas: %d\n" % \
      (df_result_english.shape[0], df_result_english.shape[1]) )

# Every source frame is already coded as 0 / 1 / 2, so the column is verified
# rather than re-encoded: running LabelEncoder.transform over it would change
# nothing when all three classes are present and would silently renumber the
# labels if one of them were missing.
valores_inesperados = set(df_result_english["sentiment"].unique()) - {0, 1, 2}
if valores_inesperados:
    raise ValueError("Valores de sentimiento no reconocidos: %s" % sorted(valores_inesperados, key=str))

# The encoder is still fitted so the `labelencoder` name keeps describing the
# classes of the final English dataframe.
labelencoder = LabelEncoder().fit(df_result_english["sentiment"])

Dataframe english result: num_rows: 1759314	Columnas: 2



In [53]:
# Preview the first rows of the combined English dataframe.
df_result_english.head(10)

Unnamed: 0,text,sentiment
0,@VirginAmerica What @dhepburn said.,1
1,@VirginAmerica plus you've added commercials t...,2
2,@VirginAmerica I didn't today... Must mean I n...,1
3,@VirginAmerica it's really aggressive to blast...,0
4,@VirginAmerica and it's a really big bad thing...,0
5,@VirginAmerica seriously would pay $30 a fligh...,0
6,"@VirginAmerica yes, nearly every time I fly VX...",2
7,@VirginAmerica Really missed a prime opportuni...,1
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",2
9,"@VirginAmerica it was amazing, and arrived an ...",2


In [54]:
# Count of records per sentiment value in the final dataframe.
# Series.value_counts() is used because the top-level pd.value_counts()
# is deprecated in recent pandas releases and emits a FutureWarning.
print("Valores dataset:\n", df_result_english['sentiment'].value_counts())

Valores dataset:
 2    872864
0    871980
1     14470
Name: sentiment, dtype: int64


In [55]:
# Write the final CSV of sentiment-annotated English tweets used to train the models.
# index=False leaves the dataframe index out of the file.
df_result_english.to_csv(path_or_buf='./data/df_result_english.csv', index=False)

#### Vamos ahora a generar otros dos dataframes por idioma (inglés y español): uno con el mismo número de datos en cada clase (negativo, neutral y positivo), y otro sin datos de sentimiento neutro y con un equilibrio del 50% entre sentimiento negativo y sentimiento positivo.

Estos datasets también podremos probarlos con algún modelo, y así ver si influye entrenar y evaluar un modelo con datasets de solo 2 clases en lugar de 3, o con una distribución de datos más uniforme.

In [56]:
# Per-class record counts of the full dataset for each language (English first, then Spanish).
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated in recent pandas releases and emits a FutureWarning.
print("Valores dataset:\n", df_result_english['sentiment'].value_counts())
print("\n")
print("Valores dataset:\n", df_result_spanish['sentiment'].value_counts())

Valores dataset:
 2    872864
0    871980
1     14470
Name: sentiment, dtype: int64


Valores dataset:
 2    26204
0    19344
1     3110
Name: sentiment, dtype: int64


In [57]:
# Work on independent copies of each language's full dataframe.
# deep=True is the default, spelled out here for clarity.
df_english_copy = df_result_english.copy(deep=True)
df_spanish_copy = df_result_spanish.copy(deep=True)

In [58]:
# Remove the neutral-sentiment rows (label 1) from both datasets.
df_english_noNeutral = df_english_copy.loc[df_english_copy['sentiment'] != 1]
df_spanish_noNeutral = df_spanish_copy.loc[df_spanish_copy['sentiment'] != 1]

# Split each neutral-free frame into its negative (label 0) and positive (label 2) parts.
df_english_noNeutral_neg = df_english_noNeutral.loc[df_english_noNeutral['sentiment'] == 0]
df_english_noNeutral_pos = df_english_noNeutral.loc[df_english_noNeutral['sentiment'] == 2]
df_spanish_noNeutral_neg = df_spanish_noNeutral.loc[df_spanish_noNeutral['sentiment'] == 0]
df_spanish_noNeutral_pos = df_spanish_noNeutral.loc[df_spanish_noNeutral['sentiment'] == 2]

In [59]:
# Keep an equally sized sample per class, then concatenate the samples.
# Given the original per-class counts, keep 40,000 positive and 40,000 negative
# records for the balanced English frame, and 19,000 positive and 19,000 negative
# records for the Spanish one.
# A fixed seed makes the sampled rows the same on every run, so the datasets
# built from them are reproducible.
RANDOM_STATE = 42
df_sample_english_pos = df_english_noNeutral_pos.sample(n=40000, replace=False, random_state=RANDOM_STATE)
df_sample_english_neg = df_english_noNeutral_neg.sample(n=40000, replace=False, random_state=RANDOM_STATE)
df_sample_spanish_pos = df_spanish_noNeutral_pos.sample(n=19000, replace=False, random_state=RANDOM_STATE)
df_sample_spanish_neg = df_spanish_noNeutral_neg.sample(n=19000, replace=False, random_state=RANDOM_STATE)

# Concatenate the per-class samples into the final frames.
dataframes_english_noNeutral = [df_sample_english_pos, df_sample_english_neg]
df_result_english_noNeutral = pd.concat(dataframes_english_noNeutral)

dataframes_spanish_noNeutral = [df_sample_spanish_pos, df_sample_spanish_neg]
df_result_spanish_noNeutral = pd.concat(dataframes_spanish_noNeutral)

print("Dataframe english no neutral result: num_rows: %d\tColumnas: %d\n" % \
      (df_result_english_noNeutral.shape[0], df_result_english_noNeutral.shape[1]) )
print("\n")
print("Dataframe spanish no neutral result: num_rows: %d\tColumnas: %d\n" % \
      (df_result_spanish_noNeutral.shape[0], df_result_spanish_noNeutral.shape[1]) )
print("\n")
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset english no neutral:\n", df_result_english_noNeutral['sentiment'].value_counts())
print("\n")
print("Valores dataset spanish no neutral:\n", df_result_spanish_noNeutral['sentiment'].value_counts())

Dataframe english no neutral result: num_rows: 80000	Columnas: 2



Dataframe spanish no neutral result: num_rows: 38000	Columnas: 2



Valores dataset english no neutral:
 2    40000
0    40000
Name: sentiment, dtype: int64


Valores dataset spanish no neutral:
 2    19000
0    19000
Name: sentiment, dtype: int64


In [60]:
# Build one dataframe per language holding only its neutral rows (label 1).
df_english_neu = df_english_copy.loc[df_english_copy['sentiment'] == 1]
df_spanish_neu = df_spanish_copy.loc[df_spanish_copy['sentiment'] == 1]

In [61]:
# Keep an equally sized sample per class, then concatenate the samples.
# Given the original per-class counts, keep 14,000 positive, 14,000 negative and
# 14,000 neutral records for the balanced English frame, and 3,000 positive,
# 3,000 negative and 3,000 neutral records for the Spanish one.
# A fixed seed makes the sampled rows the same on every run, so the datasets
# built from them are reproducible.
RANDOM_STATE = 42
df_sample_english_pos = df_english_noNeutral_pos.sample(n=14000, replace=False, random_state=RANDOM_STATE)
df_sample_english_neg = df_english_noNeutral_neg.sample(n=14000, replace=False, random_state=RANDOM_STATE)
df_sample_english_neu = df_english_neu.sample(n=14000, replace=False, random_state=RANDOM_STATE)
df_sample_spanish_pos = df_spanish_noNeutral_pos.sample(n=3000, replace=False, random_state=RANDOM_STATE)
df_sample_spanish_neg = df_spanish_noNeutral_neg.sample(n=3000, replace=False, random_state=RANDOM_STATE)
df_sample_spanish_neu = df_spanish_neu.sample(n=3000, replace=False, random_state=RANDOM_STATE)

# Concatenate the per-class samples into the final frames.
dataframes_english_neutral = [df_sample_english_pos, df_sample_english_neg, df_sample_english_neu]
df_result_english_neutral = pd.concat(dataframes_english_neutral)

dataframes_spanish_neutral = [df_sample_spanish_pos, df_sample_spanish_neg, df_sample_spanish_neu]
df_result_spanish_neutral = pd.concat(dataframes_spanish_neutral)

print("Dataframe english neutral result: num_rows: %d\tColumnas: %d\n" % \
      (df_result_english_neutral.shape[0], df_result_english_neutral.shape[1]) )
print("\n")
print("Dataframe spanish neutral result: num_rows: %d\tColumnas: %d\n" % \
      (df_result_spanish_neutral.shape[0], df_result_spanish_neutral.shape[1]) )
print("\n")
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print("Valores dataset english neutral:\n", df_result_english_neutral['sentiment'].value_counts())
print("\n")
print("Valores dataset spanish neutral:\n", df_result_spanish_neutral['sentiment'].value_counts())

Dataframe english neutral result: num_rows: 42000	Columnas: 2



Dataframe spanish neutral result: num_rows: 9000	Columnas: 2



Valores dataset english neutral:
 2    14000
1    14000
0    14000
Name: sentiment, dtype: int64


Valores dataset spanish neutral:
 2    3000
1    3000
0    3000
Name: sentiment, dtype: int64


In [62]:
# Write the balanced datasets to disk. For each language there are two CSVs:
# one with a very similar share of positive, negative and neutral records, and
# one with the same number of positive and negative records (no neutral ones).
# The list keeps the write order explicit.
csv_outputs = [
    ('./data/df_result_english_neutral.csv', df_result_english_neutral),
    ('./data/df_result_english_noNeutral.csv', df_result_english_noNeutral),
    ('./data/df_result_spanish_neutral.csv', df_result_spanish_neutral),
    ('./data/df_result_spanish_noNeutral.csv', df_result_spanish_noNeutral),
]
for csv_path, frame in csv_outputs:
    frame.to_csv(csv_path, index=False)