# <span style="color:#9c8f8f"> 75.06/95.58 Organización de Datos</span>
# <span style="color:#9c8f8f"> Análisis exploratorio: Real or Not? NLP with Disaster Tweets</span>

# <center>FEATURE ENGINEERING</center>
# <center>Random Forest</center>

In [1]:
# Cargo librerias

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split

# I) Auxiliary Functions

In [160]:
#Analisis de features y la importancia de c/u
def get_eficiencia_features_de (df, cantidad_arboles, profundidad_cada_arbol):
    # "desde_col" toma la columna desde la cual se van a evaluar los features
    # "hasta_col" toma la columna hasta la cual se van a evaluar los features
    # "on_col" toma la unica columna para la cual se van a calcular los features que van entre (desde_col,hasta_col)
    vector_aux = []
    on_col = 1
    desde_col = on_col + 1
    hasta_col = -1 
    #X,y = df.iloc[:,1:7] , df.iloc[:,1]
    X,y = df.drop(columns=["target","target_x","target_y","id"]) , df.target
    X_train, X_test, y_train, y_test = train_test_split (X,y, test_size=0.2, random_state = 123)
    rf_model = RandomForestClassifier(random_state = 1, n_estimators = cantidad_arboles, max_depth = profundidad_cada_arbol)
    rf_model.fit(X_train,y_train)
    predicciones = rf_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test,predicciones)) 
    feature_importances = rf_model.feature_importances_
    feature_estimators = rf_model.estimators_[0]
    vector_aux.append(X_train)
    vector_aux.append(rmse)
    vector_aux.append(feature_importances)
    vector_aux.append(feature_estimators)
    return vector_aux

In [3]:
#Plot de feature importance con RF
def bar_plot_feature_importance (X_train, feature_importance):
    plt.bar(X_train.columns, feature_importance)
    plt.xlabel ('features')
    plt.ylabel ('importancia')
    plt.title ('importancia features con RF')
    plt.rcParams["figure.figsize"] = (6,4)
    plt.xticks(rotation='vertical')
    plt.show()

In [4]:
#Plot de grafica de estimadores y diagrama de arbol
def bar_plot_feature_estimators (X_train, feature_estimators):
    plt.bar(X_train.columns, feature_estimators.feature_importances_)
    plt.xlabel('features')
    plt.ylabel('importancia')
    plt.title('importancia features del arbol')
    plt.xticks(rotation='vertical')
    plt.show
    fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize=(10,10), dpi = 200)
    tree.plot_tree(feature_estimators, feature_names = X_train.columns, filled = True)

In [118]:
#Standariza a cierto formato los csv input
def csv_to_df_standarized(doc_csv):
    df = pd.read_csv(doc_csv)
    if df.columns[0] == 'Unnamed: 0':
        df = df.drop(df.columns[0],axis=1)
    return df

In [116]:
#Toma todos los csv con features numericos y los procesa
def analisis_de_todos_los_features_numericos(docs):
    for doc in docs:
        df_actual = csv_to_df_standarized(doc)
        vectorcito_actual = get_eficiencia_features_de(df_actual,3,6)
        print('RMSE: %f' % vectorcito_actual[1])
        puntaje_ordenado_descendente = np.sort(vectorcito_actual[2])[::-1]
        print(puntaje_ordenado_descendente[:20])
        bar_plot_feature_importance(vectorcito_actual[0],puntaje_ordenado_descendente)
        #bar_plot_feature_importance(vectorcito_actual[0],vectorcito_actual[2])
        bar_plot_feature_estimators(vectorcito_actual[0],vectorcito_actual[3])

In [7]:
#Guarda los features mas importantes y los valores de feature_importance
def feature_importance_chart(X_train,feature_importance):
    featureImpList= []

    for feat, importance in zip(X_train.columns, feature_importance):  
        temp = [feat, importance*100]
        featureImpList.append(temp)

    fT_df = pd.DataFrame(featureImpList, columns = ['Feature', 'Importance'])
    final = fT_df.sort_values('Importance',ascending = False)
    return final

In [8]:
#Devuelve el numero de columna que representa el feature mas importante - de mayor a menor
def col_number_by_feature_importance_descending(feature_importance):
    important_features_dict = {}
    for x,i in enumerate(feature_importance):
        important_features_dict[x]=i


    important_features_list = sorted(important_features_dict,
                                     key=important_features_dict.get,
                                     reverse=True)
    resultado = ('Most important features: %s' %important_features_list)
    return resultado


In [9]:
def csv_final_test_and_train(importance_features_chart,cant_features,test_set, train_set):
    importance_features_chart = importance_features_chart.iloc[0:cant_features,]
    lista_columnas_elegidas = importance_features_chart['Feature'].to_list()
    lista_columnas_elegidas.append("id")
    termino = ['target']
    lista_columnas_elegidas_train = (lista_columnas_elegidas + termino)
    
    final_test = test_set[test_set.columns.intersection(lista_columnas_elegidas)]
    final_train = train_set[train_set.columns.intersection(lista_columnas_elegidas_train)]
    return (final_train, final_test)

In [12]:
def standarize_multiple_dataframes(lista_dataframes):
    lista_df_standarized = []
    for df in lista_dataframes:
        df_standarized = csv_to_df_standarized(df)
        lista_df_standarized.append(df_standarized)
    return lista_df_standarized

# II) Random Forest

## a. TRAIN SET

In [24]:
lista_train_sets = ['train/features_city_encoded.csv', 'train/features_city_in_text_encoded.csv', 'train/features_continent_encoded.csv',\
                   'train/features_continent_in_text_encoded.csv','train/features_country_encoded.csv','train/features_country_in_text_encoded.csv',\
                   'train/features_domain_tf_idf.csv','train/features_hashtags_numerical.csv','train/features_links_numerical.csv','train/features_location_in_text_numerical.csv',\
                   'train/features_location_numerical.csv','train/features_state_encoded.csv','train/features_state_in_text_encoded.csv','train/features_tags_tf_idf.csv',\
                   'train/keywords_categorical_features_encoded_train - Copy.csv','train/keywords_numerical_features - Copy.csv','train/text_general_numerical_features_train - Copy.csv']

In [119]:
df_standarized = standarize_multiple_dataframes(lista_train_sets)

In [122]:
#8,9 a la mierda
df_standarized[9]

Unnamed: 0,id,target,has_country,has_city,has_county,has_state,has_continent
0,1,,0,0,0,0,0
1,4,,1,0,0,0,1
2,5,,0,0,0,0,0
3,6,,1,0,0,1,1
4,7,,1,0,0,1,1
...,...,...,...,...,...,...,...
7608,10869,,0,0,0,0,0
7609,10870,,1,0,0,1,1
7610,10871,,1,0,0,1,1
7611,10872,,1,0,0,0,1


In [123]:
from functools import reduce

train_set_final = reduce(lambda x,y: pd.merge(x,y, on=['id'], how = 'left'), df_standarized)


In [129]:
train_set_final = train_set_final.fillna(0)
train_set_final

Unnamed: 0,id,city_Abuja,city_Adelaide,city_Alameda,city_Albany,city_Albuquerque,city_Alexandria,city_Alvin,city_Amarillo,city_Ames,...,#caracteres_especiales,#palabras_binned,#palabras_unicas_binned,#caracteres_binned,#stopwords_binned,#puntuacion_binned,#capitalize_binned,#mayusculas_binned,#silabas_binned,#caracteres_especiales_binned
0,1,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,2,0,1,0
1,4,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,2,0,0,0
2,5,0,0,0,0,0,0,0,0,0,...,3,3,3,4,2,1,1,0,3,1
3,6,0,0,0,0,0,0,0,0,0,...,2,0,0,1,0,0,0,0,1,0
4,7,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,0,1,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,1,0
7609,10870,0,0,0,0,0,0,0,0,0,...,2,2,2,3,2,0,1,0,2,0
7610,10871,0,0,0,0,0,0,0,0,0,...,6,0,0,0,0,2,1,0,0,2
7611,10872,0,0,0,0,0,0,0,0,0,...,5,3,3,4,1,2,1,0,3,2


In [171]:
vector_train_set = get_eficiencia_features_de(train_set_final,175,200)

In [172]:
train_set_feature_importance = feature_importance_chart(vector_train_set[0],vector_train_set[2])
train_set_feature_importance.describe()

Unnamed: 0,Importance
count,2724.0
mean,0.036711
std,0.176376
min,0.0
25%,0.000531
50%,0.002713
75%,0.022452
max,4.207249


In [177]:
train_set_feature_importance[train_set_feature_importance.Importance >= 0.1]

Unnamed: 0,Feature,Importance
2713,promedio_len_word,4.207249
2707,#caracteres,2.710948
2712,#silabas,2.457792
2704,keyword_frequency,2.401377
2702,keywords_mean,2.234268
...,...,...
2429,tag_severe,0.100879
1248,domain_facebook,0.100678
2314,tag_injured,0.100496
277,city_London_x,0.100211


In [180]:
lista_test_sets = ['test/features_city_encoded.csv', 'test/features_city_in_text_encoded.csv', 'test/features_continent_encoded.csv',\
                   'test/features_continent_in_text_encoded.csv','test/features_country_encoded.csv','test/features_country_in_text_encoded.csv',\
                   'test/features_domain_tf_idf.csv','test/features_hashtags_numerical.csv','test/features_links_numerical.csv','test/features_location_in_text_numerical.csv',\
                   'test/features_location_numerical.csv','test/features_state_encoded.csv','test/features_state_in_text_encoded.csv','test/features_tags_tf_idf.csv',\
                   'test/test_categorical_keywords_encoded - Copy.csv','test/keywords_numerical_features_test - Copy.csv','test/text_general_numerical_features_test - Copy.csv']

test_set = standarize_multiple_dataframes(lista_test_sets)

In [181]:
test_set_final = reduce(lambda x,y: pd.merge(x,y, on=['id'], how = 'left'), test_set)
test_set_final

Unnamed: 0,id,city_Abuja,city_Adelaide,city_Alameda,city_Albany,city_Albuquerque,city_Alexandria,city_Alvin,city_Amarillo,city_Ames,...,#caracteres_especiales,#palabras_binned,#palabras_unicas_binned,#caracteres_binned,#stopwords_binned,#puntuacion_binned,#capitalize_binned,#mayusculas_binned,#silabas_binned,#caracteres_especiales_binned
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,3,0,0,1,0,1,0,0,1,1
2,3,0,0,0,0,0,0,0,0,0,...,2,3,3,3,2,0,0,0,1,0
3,9,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,1,1,0,0,1
4,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3259,10865,0,0,0,0,0,0,0,0,0,...,5,3,3,4,1,2,2,0,3,2
3260,10868,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3261,10874,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,1,0,0,0


In [182]:
final_train,final_test = csv_final_test_and_train(train_set_feature_importance,174,train_set_final,test_set_final)

In [183]:
final_train

Unnamed: 0,id,city_London_x,continent_AF_x,continent_AS_x,continent_EU_x,continent_NA_x,continent_OC_x,continent_AF_y,continent_AS_y,continent_NA_y,...,promedio_len_word,#caracteres_especiales,#palabras_binned,#palabras_unicas_binned,#caracteres_binned,#stopwords_binned,#puntuacion_binned,#capitalize_binned,#silabas_binned,#caracteres_especiales_binned
0,0,0,0,0,0,0,0,0,0,0,...,4.833333,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,6.222222,3,0,0,1,0,1,0,1,1
2,3,0,0,0,0,0,0,0,0,0,...,4.105263,2,3,3,3,2,0,0,1,0
3,9,0,0,0,0,0,0,0,0,0,...,9.250000,3,0,0,0,0,1,1,0,1
4,11,0,0,0,0,0,0,0,1,0,...,4.750000,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,0,0,0,0,0,0,0,0,0,...,6.428571,0,0,0,0,0,0,0,0,0
3259,10865,0,0,0,0,0,0,0,0,0,...,5.086957,5,3,3,4,1,2,2,3,2
3260,10868,0,0,0,0,0,0,0,0,0,...,5.600000,0,0,0,0,0,0,1,0,0
3261,10874,0,0,0,0,0,0,0,0,0,...,6.166667,2,0,0,0,0,0,1,0,0


In [184]:
final_test

Unnamed: 0,id,city_London_x,continent_AF_x,continent_AS_x,continent_EU_x,continent_NA_x,continent_OC_x,continent_AF_y,continent_AS_y,continent_NA_y,...,promedio_len_word,#caracteres_especiales,#palabras_binned,#palabras_unicas_binned,#caracteres_binned,#stopwords_binned,#puntuacion_binned,#capitalize_binned,#silabas_binned,#caracteres_especiales_binned
0,1,0,0,0,0,0,0,0,0,0,...,4.384615,1,1,1,1,1,0,2,1,0
1,4,0,0,0,0,0,0,0,0,1,...,4.571429,1,0,0,0,0,0,2,0,0
2,5,0,0,0,0,0,0,0,0,0,...,5.090909,3,3,3,4,2,1,1,3,1
3,6,0,0,0,0,0,0,0,0,1,...,7.125000,2,0,0,1,0,0,0,1,0
4,7,0,0,0,0,0,0,0,0,1,...,4.500000,2,2,2,2,2,0,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0,0,0,0,0,0,0,0,0,...,5.100000,0,0,1,1,0,0,0,1,0
7609,10870,0,0,0,0,0,0,0,0,1,...,4.555556,2,2,2,3,2,0,1,2,0
7610,10871,0,0,0,0,0,0,0,0,1,...,5.142857,6,0,0,0,0,2,1,0,2
7611,10872,0,0,0,0,0,0,0,0,0,...,6.263158,5,3,3,4,1,2,1,3,2


In [186]:
final_train.to_csv("train_feature_selection_RF.csv")
final_test.to_csv("test_feature_selection_RF.csv")