In [2]:
import sys; import os; sys.path.insert(0, os.path.abspath('../extraction'))

In [3]:
import pandas as pd
import numpy as np
import re
import preprocessing

pd.set_option('max_colwidth', 100)
pd.set_option('display.width', 2000)

## Importation des données brutes et Prétraitement

### Lecture du DataFrame

In [5]:
df = preprocessing.read_txt_file()
df.head(5)

Unnamed: 0,Avis,Entreprise,Tweet
0,neu,apl,20 min line @apple store @short pump.
1,irr,msf,Nueva tecnología convierte cualquier superficie en una pantalla multitactil. http://t.co/EDibLL5...
2,neu,ggl,Some people should not post replies in #Google+ threads. Their posts only continue to weaken the...
3,neg,apl,I know a few others having same issue RT @Joelplane: 9% now on my second full charge of the day....
4,neg,msf,"#Microsoft - We put the """"backwards"""" into backwards compatibility. #instantfollowback"


### Traitement des Hashtages, mentions, liens hypertexte et emojis

In [6]:
df = preprocessing.preprocess_dataset(df)
df.head(5)



Unnamed: 0,Avis,Entreprise,Tweet,Hashtags,Mentions,Language,LanguageProbability,irr
0,neu,apl,20 min line apple store short pump.,[],"[apple, short]",en,0.670655,False
1,irr,msf,Nueva tecnología convierte cualquier superficie en una pantalla multitactil. _LINK_ Microsoft om...,"[Microsoft, omnitouch]",[],es,0.876679,True
2,neu,ggl,Some people should not post replies in Google+ threads. Their posts only continue to weaken thei...,[Google+],[],en,0.98997,False
3,neg,apl,I know a few others having same issue RT Joelplane: 9% now on my second full charge of the day. ...,[],"[Joelplane, Apple]",en,0.973644,False
4,neg,msf,"Microsoft - We put the """"backwards"""" into backwards compatibility. instantfollowback","[Microsoft, instantfollowback]",[],en,0.835478,False


## Classification

### Vectorisation des Tweets

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
#from tensorflow.keras.layers import Embedding, Flatten
#from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [6]:
_embed = hub.KerasLayer(
    "https://tfhub.dev/google/nnlm-en-dim128/2", input_shape=[], dtype=tf.string
)

In [10]:
vects = _embed(df.Tweet).numpy()
for i in range(vects.shape[1]):
    df["dim_"+str(i)] = vects[:,i]
    
df.head(2)

Unnamed: 0,Avis,Entreprise,Tweet,Hashtags,Mentions,Language,LanguageProbability,dim_0,dim_1,dim_2,...,dim_118,dim_119,dim_120,dim_121,dim_122,dim_123,dim_124,dim_125,dim_126,dim_127
0,neu,apl,20 min line apple store short pump.,[],"[apple, short]",en,0.670655,-0.068681,0.022498,-0.066325,...,-0.109218,0.038298,0.13709,-0.145238,0.034566,-0.015903,-0.129935,0.070626,0.227466,-0.086626
1,irr,msf,Nueva tecnología convierte cualquier superficie en una pantalla multitactil. _LINK_ Microsoft om...,"[Microsoft, omnitouch]",[],es,0.876679,-0.220029,-0.079786,0.149435,...,0.042704,-0.113385,0.155414,0.085385,0.07962,0.10596,0.050176,0.215923,0.173534,-0.009601


### Codification One-Hot des labels et des variables catégorielles

In [12]:
#Labels
df["is pos"] = df.Avis.apply(lambda x : True if x == "pos" else False)
df["is neg"] = df.Avis.apply(lambda x : True if x == "neg" else False)
df["is neu"] = df.Avis.apply(lambda x : True if x == "neu" else False)
df["is irr"] = df.Avis.apply(lambda x : True if x == "irr" else False)

#Variable : Entreprise
df["is apl"] = df.Entreprise.apply(lambda x : 1 if x == "apl" else 0)
df["is msf"] = df.Entreprise.apply(lambda x : 1 if x == "msf" else 0)
df["is ggl"] = df.Entreprise.apply(lambda x : 1 if x == "ggl" else 0)
df["is twt"] = df.Entreprise.apply(lambda x : 1 if x == "twt" else 0)

#Variable : Language == "en"
df["is eng"] = df.Language.apply(lambda x : 1 if x == "en" else 0)

df.head(2)

Unnamed: 0,Avis,Entreprise,Tweet,Hashtags,Mentions,Language,LanguageProbability,dim_0,dim_1,dim_2,...,dim_127,is pos,is neg,is neu,is irr,is apl,is msf,is ggl,is twt,is eng
0,neu,apl,20 min line apple store short pump.,[],"[apple, short]",en,0.670655,-0.068681,0.022498,-0.066325,...,-0.086626,False,False,True,False,1,0,0,0,1
1,irr,msf,Nueva tecnología convierte cualquier superficie en una pantalla multitactil. _LINK_ Microsoft om...,"[Microsoft, omnitouch]",[],es,0.876679,-0.220029,-0.079786,0.149435,...,-0.009601,False,False,False,True,0,1,0,0,0


### Préparation des données de Train/Test

In [14]:
df_train_test = df.copy()

#Variables explicatives : Vecteurs de Tweets + Entreprise + Tweet en langue anglaise
X = ["dim_"+str(i) for i in range(vects.shape[1])] + ["is apl", "is msf", "is ggl", "is twt", "is eng"]

#Variable à prédire : l'avis du Tweet
y = ["is pos", "is neg", "is neu", "is irr"]

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_train_test[X], df_train_test[y], test_size=0.2, random_state=1000)

### Construction du modèle

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

input_dim = X_train.shape[1]
lr = 0.0001

model = Sequential()
model.add(layers.Dense(50, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(30, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(len(y), activation='sigmoid'))

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 50)                6700      
_________________________________________________________________
dense_13 (Dense)             (None, 30)                1530      
_________________________________________________________________
dense_14 (Dense)             (None, 10)                310       
_________________________________________________________________
dense_15 (Dense)             (None, 4)                 44        
Total params: 8,584
Trainable params: 8,584
Non-trainable params: 0
_________________________________________________________________


### Entrainement

In [22]:
history = model.fit(X_train, y_train,
                    epochs=200,
                    verbose=False,
                    validation_split=0.2,
                    batch_size=32)

### Evaluation

In [23]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.8529
Testing Accuracy:  0.7509


## Prédictions

### Importation des données de test

In [24]:
file2 = open('../data/test.txt', 'r') 
lines = file2.readlines()

ids = []
revs = []
comps = []
txts = []

for line in lines:
    ids.append(line[1:5])
    revs.append(line[6:9])
    comps.append(line[10:13])
    txts.append(line[15:])
    
df_test_raw = pd.DataFrame({"Avis" : revs, "Entreprise" : comps, "Tweet" : txts}, index = ids)
df_test = preprocessing.preprocess_dataset(df_test_raw)
df_test.head(5)



Unnamed: 0,Avis,Entreprise,Tweet,Hashtags,Mentions,Language,LanguageProbability
4173,???,msf,RT EurekaStartups Un teclado táctil sobre cualquier superficie &gt; _LINK_ Microsoft Tecnología,"[Microsoft, Tecnología]",[EurekaStartups],es,0.912599
4174,???,apl,Cool apple / Steve Jobs infographic _LINK_ MickH87 - timeline history,[],"[apple, MickH87]",en,0.374458
4175,???,ggl,Virage Success Story - Harvard Business School pdf ebook: _LINK_ google,[google],[],en,0.538483
4176,???,msf,A Child’s-Eye View Of Microsoft’s Kinect For Kids _LINK_ via TechCrunch,"[Microsoft, Kinect, Kids]",[TechCrunch],en,0.667448
4177,???,apl,"Siri is kinda slow, tho impressed it got my accent and commands first time out. apple",[Siri],[apple],en,0.949407


In [25]:
vects = _embed(df_test.Tweet).numpy()

for i in range(vects.shape[1]):
    df_test["dim_"+str(i)] = vects[:,i]
    
#Variable : Entreprise
df_test["is apl"] = df_test.Entreprise.apply(lambda x : 1 if x == "apl" else 0)
df_test["is msf"] = df_test.Entreprise.apply(lambda x : 1 if x == "msf" else 0)
df_test["is ggl"] = df_test.Entreprise.apply(lambda x : 1 if x == "ggl" else 0)
df_test["is twt"] = df_test.Entreprise.apply(lambda x : 1 if x == "twt" else 0)

#Variable : Language == "en"
df_test["is eng"] = df_test.Language.apply(lambda x : 1 if x == "en" else 0)

### Prédiction des labels

In [27]:
#prédictions des labels
pred = model.predict_classes(df_test[X])
cm = {0 : "pos", 1 : "neg", 2 : "neu", 3 : "irr"}
df_test["pred"] = [cm[p] for p in pred]

#Nombre d'occurrences par classe
df_test["pred"].value_counts()

neu    518
irr    310
neg     97
pos     75
Name: pred, dtype: int64

### Exportation des résultats

In [28]:
#Enregistrer le résultat dans le meme format que test.txt
def to_txt(in_df, raw_df):
    res = ""
    for i in range(len(in_df)):
        res += "({0},{1},{2}) {3}".format(in_df.index[i], in_df.pred[i], in_df.Entreprise[i], raw_df.Tweet[i])
    return res

save_to = "../data/test_output.txt"

with open(save_to, "w") as text_file:
    text_file.write(to_txt(df_test, df_test_raw))