In [1]:
#Seccion de importacion
import pandas as pd
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib



In [2]:
#Rutas y Variables Globales
com_neg='./Dataset/tweets_neg_clean.txt'
com_pos='./Dataset/tweets_pos_clean.txt'
spanish_stopwords = stopwords.words('spanish')
stemmer = SnowballStemmer('spanish')
non_words = list(punctuation)
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))
stemmer = SnowballStemmer('spanish')

In [3]:
#Carga de DataSets
df1 = open(com_neg, "r",encoding="utf8")
neg = df1.readlines()
df1.close()

df2 = open(com_pos, "r",encoding="utf8")
pos = df2.readlines()
df2.close()

In [4]:
#Tratmiento para convertir DataFrame
def dfConverter(com,cla):
    dfG = pd.DataFrame(com) 
    dfG["Class"]=cla
    #dfG.rename(index={0: "Comments",1: "Comments"})
    return dfG
    

In [5]:
def dfUnion(df1,df2):
    dfFinal=df1.append(df2, ignore_index=True)
    #Renombramos bien las columnas
    dfFinal.columns=['Comments', 'Class']
    return dfFinal

In [6]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
def tokenize(text):
    text = ''.join([c for c in text if c not in non_words])
    tokens =  word_tokenize(text)
    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

In [7]:
#Voy a convertir a numero la clasificacion donde 0 sera negativo y 1 positivo

dfNeg = dfConverter(neg,0);
print(dfNeg.shape)

dfPos=dfConverter(pos,1);
print(dfPos.shape)

#Ahora unimos ambos dataSet
dfFinal=dfUnion(dfNeg,dfPos);
dfFinal.shape

(122216, 2)
(55360, 2)


(177576, 2)

In [8]:
dfFinal

Unnamed: 0,Comments,Class
0,Cordobés porque me la complicaste con el cosen...,0
1,Tengo fiebre :(\n,0
2,@sooooyderiver dame bola :(\n,0
3,Quiero bailar salsa :(\n,0
4,Metal Gear. :(\n,0
...,...,...
177571,Acá cocinandole a mi Hno :)\n,1
177572,"@nyazfthes Gracias por seguirme, en breve te d...",1
177573,Y ahora no podré dormir :)))))))\n,1
177574,En la Boca :) @ Estadio Boca Juniors https://t...,1


In [9]:
#Separamos dataset en train y test
X_train, X_test, y_train, y_test = train_test_split(
    dfFinal['Comments'], dfFinal['Class'], test_size = 0.20, stratify=dfFinal['Class'], random_state = 12)

In [10]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

65153     🎡 @Estrella_Pique 🎡.-5./Tu icon, ti bio, me pu...
165250                   Tal vez en la siguiente vida. :)\n
21427                      @belenrojas67 amiga te creo :(\n
86263     Estoy peor que una niña de 10 años cuando le g...
160265    Cortaste tan profundo... que te  amputaste de ...
                                ...                        
103856    Que ladilla no tener renta y que cuando agarra...
92569                            que pereza mas tareas :(\n
155855             El jueves es el cumpleaños de Aguus :)\n
116858    513. Bueno bueno, eres maja, te gusta Digimon,...
120516    Jovani, quitame el block :( https://t.co/I2kGw...
Name: Comments, Length: 142060, dtype: object
167493    @pupimartinezok @Agustin_Ocampo Es mucho traba...
37578     Jeje no he hecho desde antier :( https://t.co/...
172278             Se aceptan facturas por las mañanas :)\n
86717     #DomingoDeGanarSeguidores que quieran un monit...
81576                     @CRUSTACEO ay ya no me pelas

In [11]:
#Hacemos uso del TFIDF
vectorizer = TfidfVectorizer(analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

In [12]:
X_train_vectorizer = vectorizer.fit_transform(X_train)
X_test_vectorizer = vectorizer.transform(X_test)

  'stop_words.' % sorted(inconsistent))


In [13]:
#Creamos el modelos que sera una red neuronal multicapa dentro de un gridsearch
parametersMLP = [
              {'hidden_layer_sizes': [(10,)]}
              ]

clfMLP = MLPClassifier()

In [14]:
grid_searchMLP = GridSearchCV(estimator=clfMLP, param_grid=parametersMLP, cv=5, scoring='roc_auc')
grid_searchMLP.fit(X_train_vectorizer, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=200,
                                     momentum=0.9, n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=None, shuffle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'hidden_layer_sizes': [(10,)]}],
       

In [15]:
#Me interesa saber cual es el mejor parametro
grid_searchMLP.best_params_

{'hidden_layer_sizes': (10,)}

In [16]:
optimised_MLP = grid_searchMLP.best_estimator_
y_pred_opt_MLP = optimised_MLP.predict(X_test_vectorizer)

In [17]:
print(classification_report(y_test, y_pred_opt_MLP))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82     24444
           1       0.61      0.58      0.60     11072

    accuracy                           0.75     35516
   macro avg       0.71      0.71      0.71     35516
weighted avg       0.75      0.75      0.75     35516



In [52]:
def showPrediction(newcomments):
    tfidf_vect=vectorizer
    comments_new_counts = tfidf_vect.transform(newcomments)
    comments_new_tfidf = tfidf_vect.transform(newcomments)
    pred = optimised_MLP.predict(comments_new_tfidf)
    for com, classi in zip(newcomments, pred):
        if classi==0: 
            clasString='Negativo' 
        else: 
            clasString='Positivo'
        print('%r => %s \n' % (com, clasString))

In [55]:
#Creamos una lista con los nuevos comentarios para clasificar
newcomments = ["Te odio",
              "Estoy feliz",
              "Tengo hambre",
              "Me gustaria que la gente piense mas en el projimo y deje de pensar tanto en si mismo"]

#Y los mostramos con la funcion definida arriba
showPrediction(newcomments)

'Te odio' => Negativo 

'Estoy feliz' => Positivo 

'Tengo hambre' => Negativo 

'Me gustaria que la gente piense mas en el projimo y deje de pensar tanto en si mismo' => Positivo 



In [20]:
##Exportamos el modelo a pkl
joblib.dump(optimised_MLP, 'sentiment.pkl') 

['sentiment.pkl']