## Machine Learning

In [1]:
#Importamos las librerias
import pandas as pd
import numpy as np
import re
import category_encoders as ce
import spacy
import en_core_web_sm
from spacy import displacy
from wordcloud import WordCloud
from pandas import DataFrame
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

nlp = en_core_web_sm.load()

## Funciones

In [2]:
#Funcion para definir la locacion
def MeQuedoConLocacion(Primero,Segundo,Tercero,Cuarto):
    if(pd.isna(Primero)) & (pd.isna(Segundo)) & (pd.isna(Tercero)):
        return 'Sin Locacion'
    if(pd.isna(Primero)) & (pd.isna(Segundo)) & (pd.notna(Tercero)):
        return Cuarto
    if(pd.isna(Primero)):
        return Segundo
    if(pd.isna(Segundo)):
        return Primero

### Carga de los CSVs

In [3]:
#Cargamos el set de entrenamiento
train=pd.read_csv('Data/train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
#Cargamos el csv de las Ciudades
ciudades=pd.read_csv('Data/worldcities.csv')
ciudades=ciudades[['city','country']]
ciudades.head()

Unnamed: 0,city,country
0,Tokyo,Japan
1,New York,United States
2,Mexico City,Mexico
3,Mumbai,India
4,São Paulo,Brazil


In [5]:
#Cargamos el csv con los Paises del mundo
mundo=pd.read_csv('Data/Mundo.csv')
mundo=mundo[['name','iso_a3']]
mundo.head()

Unnamed: 0,name,iso_a3
0,Fiji,FJI
1,Tanzania,TZA
2,W. Sahara,ESH
3,Canada,CAN
4,United States,USA


In [6]:
#Cargamos el csv con los hashtags mas utilizados
hashtags=pd.read_csv('Data/Hashtags_Total.csv')
hashtags.set_index('Hashtag',inplace=True)
hashtags.head()

Unnamed: 0_level_0,Unnamed: 0
Hashtag,Unnamed: 1_level_1
News,1
Hot,0
Best,1609
Prebreak,1167
Nowplaying,3


In [7]:
#Cargamos el csv con las menciones mas utilizados
arrobas=pd.read_csv('Data/Arroba_Total.csv')
arrobas['Mencion']=arrobas['Mencion'].str.lower()
arrobas.set_index('Mencion',inplace=True)
arrobas.head()

Unnamed: 0_level_0,Unnamed: 0
Mencion,Unnamed: 1_level_1
youtube,0
arianagrande,5
potus,6
foxnews,7
change,1


## Set de Entrenamiento

## Generamos los Features

#### Analisis de Locacion

In [8]:
#Separo la locacion por las comas
separacion_coma=pd.concat([train, train['location'].str.split(',', expand=True)], axis=1)
separacion_coma.rename(columns={0:'Primero',1:'Segundo',2:'Tercero'},inplace=True)
separacion_coma.drop(columns={'location',3},inplace=True)
separacion_coma.head()

Unnamed: 0,id,keyword,text,target,Primero,Segundo,Tercero
0,1,,Our Deeds are the Reason of this #earthquake M...,1,,,
1,4,,Forest fire near La Ronge Sask. Canada,1,,,
2,5,,All residents asked to 'shelter in place' are ...,1,,,
3,6,,"13,000 people receive #wildfires evacuation or...",1,,,
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,,,


In [9]:
#Me quedo con la primer parte de la separación ya que vimos que es la más importante
#Mapeo para cada ciudad, su pais
primera_ciudad=separacion_coma[['id','Primero']].copy()
primera_ciudad.rename(columns={'Primero':'city'},inplace=True)
ciudad_pais=pd.merge(primera_ciudad,ciudades,on='city',how='left')
ciudad_pais.drop_duplicates(subset="id",keep="first",inplace=True)

In [10]:
#Mapeo para cada abreviacion, su pais
primera_abreviacion=separacion_coma[['id','Primero']].copy()
primera_abreviacion.rename(columns={'Primero':'iso_a3'},inplace=True)
abreviacion_pais=pd.merge(primera_abreviacion,mundo,on='iso_a3',how='left')
abreviacion_pais.drop_duplicates(subset="id",keep="first",inplace=True)

In [11]:
#Verifico si en la primer coma esta bien la locacion
primera_pais=separacion_coma[['id','Primero']].copy()
primera_pais.rename(columns={'Primero':'name'},inplace=True)
pais=pd.merge(primera_pais,mundo,on='name',how='left')
pais.drop_duplicates(subset="id",keep="first",inplace=True)
pais.rename(columns={'iso_a3':'abreviacion'},inplace=True)
pais.drop(columns={'name'},inplace=True)

In [12]:
junto=pd.merge(abreviacion_pais,ciudad_pais,on='id',how='inner')
junto=pd.merge(junto,pais,on='id',how='inner')
junto['Locacion']=junto.apply(lambda x: MeQuedoConLocacion(x['name'],x['country'],x['abreviacion'],x['iso_a3']),axis=1)
junto_final=junto[['id','Locacion']]

#Lo Joineo con el set de entrenamiento
train=pd.merge(train,junto_final,on='id',how='inner')
train.head()

Unnamed: 0,id,keyword,location,text,target,Locacion
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Sin Locacion
1,4,,,Forest fire near La Ronge Sask. Canada,1,Sin Locacion
2,5,,,All residents asked to 'shelter in place' are ...,1,Sin Locacion
3,6,,,"13,000 people receive #wildfires evacuation or...",1,Sin Locacion
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Sin Locacion


#### Analisis Sintáctico

In [13]:
#Parse del texto , donde cada token es una palabra

for i, row in train.iterrows():
  
    if(row["text"] and len(str(row["text"])) < 300):
        doc = nlp(str(row["text"]))
        
        adjectives = []
        nouns = []
        verbs = []
        lemmas = []
        simbolos = []
        alfanumericos = []

        for token in doc:
            
            lemmas.append(token.lemma_)
            
            if not token.text.isalpha():
                alfanumericos.append(token.lemma_)
            if token.pos_ == "ADJ":
                adjectives.append(token.lemma_)
            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
                nouns.append(token.lemma_)
            if token.pos_ == "VERB":
                verbs.append(token.lemma_)
            if token.pos_ == "SYM":
                simbolos.append(token.lemma_)
         
        train.at[i, "adjetivos"] = " ".join(adjectives)
        train.at[i, "verbos"] = " ".join(verbs)
        train.at[i, "longitud"] = len(row["text"])
        
print('Completado')

Completado


In [14]:
#Me quedo con la cantidad que considere necesario
top_20_keywords=train['keyword'].value_counts().head(10).index
top_20_verbos = train['verbos'].astype('str').value_counts().head(20).index
top_20_adjetivos = train['adjetivos'].astype('str').value_counts().head(20).index
top_50_locaciones = train['Locacion'].astype('str').value_counts().head(50).index
top_10_hashtags = hashtags.head(10).index
top_10_arrobas = arrobas.head(10).index

In [15]:
#Agrego un booleano con respecto a si tienen los simbolos enunciados
train['Tiene_arroba']=train['text'].str.contains('@')
train['Tiene_hashtag']=train['text'].str.contains('#')
train['Tiene_signo_pregunta']=train['text'].str.contains('\?|¿')
train['Tiene_signo_exclamacion']=train['text'].str.contains('!|¡')
train['Tiene_link']=train['text'].str.contains('https')

#### Aplicación de los procesamientos

In [16]:
for key_top in top_20_keywords:
    train[key_top]=train['keyword'].str.contains(key_top)

In [17]:
for verbo_top in top_20_verbos:
    train[verbo_top]=train['verbos'].str.contains(verbo_top)

In [18]:
for adjetivos_top in top_20_adjetivos:
    train[adjetivos_top]=train['adjetivos'].str.contains(adjetivos_top)

In [19]:
for locacion_top in top_50_locaciones:
    train[locacion_top]=train['Locacion'].str.contains(locacion_top)

In [20]:
for hastag_top in top_10_hashtags:
    train[hastag_top]=train['text'].str.contains('#'+hastag_top)

In [21]:
for arroba_top in top_10_arrobas:
    train[arroba_top]=train['text'].str.contains('@'+arroba_top)

In [22]:
#Quito las columnas que no necesito y relleno los NaNs con False
train.drop(columns={'location','Locacion','keyword','text','adjetivos','verbos','longitud'},inplace=True)
train.fillna(False,inplace=True)
train.head()

Unnamed: 0,id,target,Tiene_arroba,Tiene_hashtag,Tiene_signo_pregunta,Tiene_signo_exclamacion,Tiene_link,fatalities,deluge,armageddon,...,youtube,arianagrande,potus,foxnews,change,usatoday,emmerdale,justinbieber,djicemoon,stretcher
0,1,1,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,4,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,5,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,6,1,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,7,1,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Set de Test

Lo mismo que se realizo para el set de entrenamiento se hace para el de test

In [23]:
#Cargo el csv
test=pd.read_csv('Data/test.csv')
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


### Generamos los Features

#### Analisis de Locacion

In [24]:
#Separo la locacion por las comas
separacion_coma_test=pd.concat([test, test['location'].str.split(',', expand=True)], axis=1)
separacion_coma_test.rename(columns={0:'Primero',1:'Segundo',2:'Tercero'},inplace=True)
separacion_coma_test.drop(columns={'location',3},inplace=True)
separacion_coma_test.head()

Unnamed: 0,id,keyword,text,Primero,Segundo,Tercero
0,0,,Just happened a terrible car crash,,,
1,2,,"Heard about #earthquake is different cities, s...",,,
2,3,,"there is a forest fire at spot pond, geese are...",,,
3,9,,Apocalypse lighting. #Spokane #wildfires,,,
4,11,,Typhoon Soudelor kills 28 in China and Taiwan,,,


In [25]:
#Me quedo con la primer parte de la separación ya que vimos que es la más importante
#Mapeo para cada ciudad, su pais
primera_ciudad_test=separacion_coma_test[['id','Primero']].copy()
primera_ciudad_test.rename(columns={'Primero':'city'},inplace=True)
ciudad_pais_test=pd.merge(primera_ciudad_test,ciudades,on='city',how='left')
ciudad_pais_test.drop_duplicates(subset="id",keep="first",inplace=True)

In [26]:
#Mapeo para cada abreviacion, su pais
primera_abreviacion_test=separacion_coma_test[['id','Primero']].copy()
primera_abreviacion_test.rename(columns={'Primero':'iso_a3'},inplace=True)
abreviacion_pais_test=pd.merge(primera_abreviacion_test,mundo,on='iso_a3',how='left')
abreviacion_pais_test.drop_duplicates(subset="id",keep="first",inplace=True)

In [27]:
#Verifico si en la primer coma esta bien la locacion
primera_pais_test=separacion_coma_test[['id','Primero']].copy()
primera_pais_test.rename(columns={'Primero':'name'},inplace=True)
pais_test=pd.merge(primera_pais_test,mundo,on='name',how='left')
pais_test.drop_duplicates(subset="id",keep="first",inplace=True)
pais_test.rename(columns={'iso_a3':'abreviacion'},inplace=True)
pais_test.drop(columns={'name'},inplace=True)

In [28]:
junto_test=pd.merge(abreviacion_pais_test,ciudad_pais_test,on='id',how='inner')
junto_test=pd.merge(junto_test,pais_test,on='id',how='inner')
junto_test['Locacion']=junto_test.apply(lambda x: MeQuedoConLocacion(x['name'],x['country'],x['abreviacion'],x['iso_a3']),axis=1)
junto_final_test=junto_test[['id','Locacion']]

#Lo Joineo con el set de entrenamiento
test=pd.merge(test,junto_final_test,on='id',how='inner')
test.head()

Unnamed: 0,id,keyword,location,text,Locacion
0,0,,,Just happened a terrible car crash,Sin Locacion
1,2,,,"Heard about #earthquake is different cities, s...",Sin Locacion
2,3,,,"there is a forest fire at spot pond, geese are...",Sin Locacion
3,9,,,Apocalypse lighting. #Spokane #wildfires,Sin Locacion
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Sin Locacion


#### Analisis Sintactico

In [29]:
#Parse del texto , donde cada token es una palabra

for i, row in test.iterrows():
  
    if(row["text"] and len(str(row["text"])) < 300):
        doc = nlp(str(row["text"]))
        
        adjectives = []
        nouns = []
        verbs = []
        lemmas = []
        simbolos = []
        alfanumericos = []

        for token in doc:
            
            lemmas.append(token.lemma_)
            
            if not token.text.isalpha():
                alfanumericos.append(token.lemma_)
            if token.pos_ == "ADJ":
                adjectives.append(token.lemma_)
            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
                nouns.append(token.lemma_)
            if token.pos_ == "VERB":
                verbs.append(token.lemma_)
            if token.pos_ == "SYM":
                simbolos.append(token.lemma_)
         
        test.at[i, "adjetivos"] = " ".join(adjectives)
        test.at[i, "verbos"] = " ".join(verbs)
        test.at[i, "longitud"] = len(row["text"])
        
print('Completado')

Completado


In [30]:
#Agrego un booleano con respecto a si tienen los simbolos enunciados
test['Tiene_arroba']=test['text'].str.contains('@')
test['Tiene_hashtag']=test['text'].str.contains('#')
test['Tiene_signo_pregunta']=test['text'].str.contains('\?|¿')
test['Tiene_signo_exclamacion']=test['text'].str.contains('!|¡')
test['Tiene_link']=test['text'].str.contains('https')

#### Aplicación de los procesamientos

In [31]:
for key_top in top_20_keywords:
    test[key_top]=test['keyword'].str.contains(key_top)

In [32]:
for verbo_top in top_20_verbos:
    test[verbo_top]=test['verbos'].str.contains(verbo_top)

In [33]:
for adjetivos_top in top_20_adjetivos:
    test[adjetivos_top]=test['adjetivos'].str.contains(adjetivos_top)

In [34]:
for locacion_top in top_50_locaciones:
    test[locacion_top]=test['location'].str.contains(locacion_top)

In [35]:
for hastag_top in top_10_hashtags:
    test[hastag_top]=test['text'].str.contains('#'+hastag_top)

In [36]:
for arroba_top in top_10_arrobas:
    test[arroba_top]=test['text'].str.contains('@'+arroba_top)

In [37]:
test.drop(columns={'location','Locacion','keyword','text','adjetivos','verbos','longitud'},inplace=True)
test.fillna(False,inplace=True)
test.head()

Unnamed: 0,id,Tiene_arroba,Tiene_hashtag,Tiene_signo_pregunta,Tiene_signo_exclamacion,Tiene_link,fatalities,deluge,armageddon,harm,...,youtube,arianagrande,potus,foxnews,change,usatoday,emmerdale,justinbieber,djicemoon,stretcher
0,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,9,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,11,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Aplicacion de Algoritmos

### Dividimos el set de entrenamiento en Entrenamiento y Validación

In [38]:
datos=train.drop(columns={'target'})
precio=train['target']
datos.head()

Unnamed: 0,id,Tiene_arroba,Tiene_hashtag,Tiene_signo_pregunta,Tiene_signo_exclamacion,Tiene_link,fatalities,deluge,armageddon,harm,...,youtube,arianagrande,potus,foxnews,change,usatoday,emmerdale,justinbieber,djicemoon,stretcher
0,1,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,4,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,5,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,6,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,7,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [39]:
#Separamos el set de entrenamiento en set de entrenamiento y de validacion
test_size = 0.33
random_state = 0
datos_train , datos_test, target_train, target_test = train_test_split(datos,precio,test_size = test_size,random_state = random_state)

### Perceptron

In [40]:
perceptron = Perceptron(eta0 = 0.1,random_state = random_state)

In [41]:
perceptron.fit(datos_train,target_train)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=0.1,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [42]:
prediccion_perceptron = perceptron.predict(datos_test)

print('accuracy: {0:.2f}%'.format(accuracy_score(target_test,prediccion_perceptron)*100))

accuracy: 58.42%


In [43]:
prediccion_test_perceptron = perceptron.predict(test)

In [44]:
test_perceptron=test.copy()
test_perceptron['target']=prediccion_test_perceptron
test_perceptron.head()

Unnamed: 0,id,Tiene_arroba,Tiene_hashtag,Tiene_signo_pregunta,Tiene_signo_exclamacion,Tiene_link,fatalities,deluge,armageddon,harm,...,arianagrande,potus,foxnews,change,usatoday,emmerdale,justinbieber,djicemoon,stretcher,target
0,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
1,2,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
2,3,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
3,9,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
4,11,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0


In [45]:
entregable_perceptron=test_perceptron[['id','target']]
entregable_perceptron.set_index('id',inplace=True)
entregable_perceptron.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,0
3,0
9,0
11,0


In [46]:
entregable_perceptron.to_csv('Resultados/Perceptron_3.csv')

## Random Forest

In [47]:
clf = RandomForestClassifier(n_jobs=1,random_state=0)

In [48]:
clf.fit(datos_train,target_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [49]:
prediccion_RF = clf.predict(datos_test)

print('accuracy: {0:.2f}%'.format(accuracy_score(target_test,prediccion_RF)*100))

accuracy: 66.77%


In [50]:
prediccion_test_RF = clf.predict(test)

In [51]:
test_RF=test.copy()
test_RF['target']=prediccion_test_RF
test_RF.head()

Unnamed: 0,id,Tiene_arroba,Tiene_hashtag,Tiene_signo_pregunta,Tiene_signo_exclamacion,Tiene_link,fatalities,deluge,armageddon,harm,...,arianagrande,potus,foxnews,change,usatoday,emmerdale,justinbieber,djicemoon,stretcher,target
0,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
1,2,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
2,3,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
3,9,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
4,11,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1


In [52]:
entregable_RF=test_RF[['id','target']]
entregable_RF.set_index('id',inplace=True)
entregable_RF.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1


In [53]:
entregable_RF.to_csv('Resultados/RF_2.csv')

## Naive Bayes

In [54]:
gnb = GaussianNB()

In [55]:
gnb.fit(datos_train,target_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [56]:
prediccion_gnb = gnb.predict(datos_test)

print('accuracy: {0:.2f}%'.format(accuracy_score(target_test,prediccion_gnb)*100))

accuracy: 65.54%


In [57]:
prediccion_test_gnb = gnb.predict(test)

In [58]:
test_gnb=test.copy()
test_gnb['target']=prediccion_test_gnb
test_gnb.head()

Unnamed: 0,id,Tiene_arroba,Tiene_hashtag,Tiene_signo_pregunta,Tiene_signo_exclamacion,Tiene_link,fatalities,deluge,armageddon,harm,...,arianagrande,potus,foxnews,change,usatoday,emmerdale,justinbieber,djicemoon,stretcher,target
0,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
1,2,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
2,3,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
3,9,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
4,11,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1


In [59]:
entregable_gnb=test_gnb[['id','target']]
entregable_gnb.set_index('id',inplace=True)
entregable_gnb.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,0
3,0
9,0
11,1


In [60]:
entregable_gnb.to_csv('Resultados/NaiveBayes_2.csv')

## Logistic Regression

In [61]:
LogReg = LogisticRegression()

In [62]:
LogReg.fit(datos_train,target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
prediccion_LogReg = LogReg.predict(datos_test)

print('accuracy: {0:.2f}%'.format(accuracy_score(target_test,prediccion_LogReg)*100))

accuracy: 63.75%


In [64]:
prediccion_test_LogReg = LogReg.predict(test)

In [65]:
test_LogReg=test.copy()
test_LogReg['target']=prediccion_test_LogReg
test_LogReg.head()

Unnamed: 0,id,Tiene_arroba,Tiene_hashtag,Tiene_signo_pregunta,Tiene_signo_exclamacion,Tiene_link,fatalities,deluge,armageddon,harm,...,arianagrande,potus,foxnews,change,usatoday,emmerdale,justinbieber,djicemoon,stretcher,target
0,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
1,2,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
2,3,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
3,9,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
4,11,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1


In [66]:
entregable_LogReg=test_LogReg[['id','target']]
entregable_LogReg.set_index('id',inplace=True)
entregable_LogReg.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1


In [67]:
entregable_LogReg.to_csv('Resultados/LogReg_2.csv')

## SVM

In [68]:
SVM = svm.LinearSVC()

In [69]:
SVM.fit(datos_train,target_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [70]:
prediccion_SVM = SVM.predict(datos_test)

print('accuracy: {0:.2f}%'.format(accuracy_score(target_test,prediccion_SVM)*100))

accuracy: 50.46%


In [71]:
prediccion_test_SVM = SVM.predict(test)

In [72]:
test_SVM=test.copy()
test_SVM['target']=prediccion_test_SVM
test_SVM.head()

Unnamed: 0,id,Tiene_arroba,Tiene_hashtag,Tiene_signo_pregunta,Tiene_signo_exclamacion,Tiene_link,fatalities,deluge,armageddon,harm,...,arianagrande,potus,foxnews,change,usatoday,emmerdale,justinbieber,djicemoon,stretcher,target
0,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
1,2,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
2,3,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
3,9,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
4,11,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0


In [73]:
entregable_SVM=test_SVM[['id','target']]
entregable_SVM.set_index('id',inplace=True)
entregable_SVM.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,1
3,0
9,1
11,0


In [74]:
entregable_SVM.to_csv('Resultados/SVM_2.csv')