# <span style="color:#9c8f8f"> 75.06/95.58 Organización de Datos</span>
# <span style="color:#9c8f8f"> Análisis exploratorio: Real or Not? NLP with Disaster Tweets</span>

# <center>FEATURE ENGINEERING</center>
# <center>Decision trees</center>

In [1]:
# Cargo librerias

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import tree

In [2]:
# Cargo train and test

dtype_train = {"id": np.int32, "keyword": "category", "target" : int}
train = pd.read_csv("original_data/train.csv", dtype = dtype_train, encoding='UTF_8')

In [3]:
# Creo dataframe con target

target_train = train[["id","target"]]

In [4]:
DIR_CSV = "features/"

# I) Understanding decision trees

* In the decision tree chart, each internal node has a decision rule that splits the data.

* Gini referred as Gini ratio, which measures the impurity of the node. You can say a node is pure when all of its records belong to the same class, such nodes known as the leaf node.

* There is no need to normalize columns.

# II) Auxiliary Functions

In [5]:
dtype_train = {"id": np.int32, "keyword": "category", "target" : int}
train = pd.read_csv("original_data/train.csv", dtype = dtype_train, encoding='UTF_8')

target_train = train[["id","target"]]
    
def get_clean_values(df):
    
    if("target" in list(df.columns)):
        del df["target"]
    df = df.merge(target_train, how="inner", on="id")   
    
    X = df.copy().fillna(0)
    y = df["target"].copy()
    
    if("target" in list(X.columns)):
        del X['target']
    if("id" in list(X.columns)):
        del X['id']
    if("Unnamed: 0" in list(X.columns)):
        del X["Unnamed: 0"]
    
    return X,y

In [6]:
def decision_regressor_feature_search(X,y,m):
    
    # Model creation
    
    clf = tree.DecisionTreeRegressor()     # DecisionTreeClassifier
    clf = clf.fit(X, y)
    
    # Feature importance
    feature_importances = clf.feature_importances_

    # get index of m most important features sorted
    m_index = np.argpartition(feature_importances, -m)[-m:]
    m_index = m_index[np.argsort(feature_importances[m_index])][::-1]
    
    # Get the m most important features of locations
    m_feature_importances = feature_importances[m_index]
    
    # Get m feature name
    m_feature_name = X.columns[m_index]
    
    return feature_importances, m_index, m_feature_importances, m_feature_name

In [7]:
# m cantidad de features con más relevancia

def decision_regressor_files(previous_dir, files, m_value):
    
    # contiene una lista por archivo con feature_importances, m_index, m_feature_importances, m_feature_name
    locations_others_info = []
    
    for i in range(len(files)):
        
        # cargo archivo
        df = pd.read_csv(previous_dir + files[i], low_memory=False)

        # Separo mi data
        X, y = get_clean_values(df)
        
        # Verifico m
        m = m_value if (m_value < len(X.columns)) else len(X.columns)

        # Aplico desition tree
        feature_importances, m_index, m_feature_importances, m_feature_name = decision_regressor_feature_search(X,y,m)
        locations_others_info.append([feature_importances, m_index, m_feature_importances, m_feature_name])

        # Imprimo informacion
        print_feature_search(m_feature_name, m_feature_importances, files[i])
    
    return locations_others_info

In [8]:
def plot_decision_tree(X,y,depth):
    
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf= clf.fit(X, y)

    plt.figure(1, figsize=(10,10))
    tree.plot_tree(clf, filled=True)
    
    plt.show()

In [9]:
def print_feature_search(feature, importance, file):
    
    print("{}\n".format(file.upper()))
    print("{: >20} {: >20}\n".format("feature","importance"))
    for i in range(len(feature)):
        print("{: >20}\t\t{: >20}".format(feature[i],importance[i]))
    print("\n\n")

# III) Desition trees

In [10]:
m = 30

## 1. Keywords

In [11]:
files_keywords = ["features_keywords_in_text_encoded.csv", "features_keywords_numerical.csv"]
DT_keywords = decision_regressor_files(DIR_CSV + 'train/' ,files_keywords, m)

FEATURES_KEYWORDS_IN_TEXT_ENCODED.CSV

             feature           importance

              K_fire		 0.05661728198438555
              K_bomb		 0.05199125994731499
             K_storm		 0.03685953808300767
           K_typhoon		 0.02589693991662052
            K_debris		0.024404951233193688
            K_derail		0.022208328518238923
            K_police		 0.02138398189071544
          K_wreckage		0.020289812809413856
             K_flood		0.020237075357704127
          K_outbreak		0.020022068256730207
             K_crash		0.019444017503033564
         K_oil spill		0.018819244115054035
            K_attack		0.017718994187464907
          K_disaster		0.016765026275420866
        K_earthquake		0.016631669122568037
           K_rescuer		 0.01658511859227331
          K_accident		0.015402460588731208
         K_terrorism		0.015361899029710298
          K_hellfire		0.013377229786418357
       K_mass murder		  0.0132641809047112
   K_bridge collapse		0.011366075265816006
         K_coll

## 2. Text 

In [12]:
files_text = [
    'features_tagged.csv',
    'features_tagged_smooth.csv',
    'features_hashtags.csv',
    'features_hashtags_smooth.csv',
    'features_text_numerical.csv',
    'features_arrobas_numerical.csv',
    'features_hashtags_numerical.csv'
]

DT_text = decision_regressor_files(DIR_CSV + 'train/', files_text, m)

FEATURES_TAGGED.CSV

             feature           importance

             youtube		0.010582048365137323
        arianagrande		0.004724382605659408
             foxnews		0.004310331898460457
              usagov		0.003708145382859913
           djicemoon		0.003000759840098269
       unsuckdcmetro		0.0029690335847644086
                  ap		0.002965865770857226
       mikeparractor		0.0025693168744599926
           sharethis		0.002233303543016643
             nbcnews		0.0022315109247883626
                 smh		0.0022297204640301586
       nasahurricane		0.0022279321572820903
       mnpdnashville		0.0022261460010894415
             invalid		0.002132933655997382
            usatoday		0.002064692404274339
             falphil		0.0017234574851221873
         lonewolffur		0.001715755468856011
      local_arsonist		0.001713913541353336
       worldnetdaily		0.001712074578326068
     realdonaldtrump		0.001710238573416528
       diamondkesawn		0.0017084055202827091
         barackobama		0.0

## 3. Locations

In [13]:
files_location = [
    'features_location_bow.csv',
    'features_location_tf_idf.csv',
    'features_location_numerical.csv',
    'features_location_in_text_numerical.csv',
    'features_location_in_text_numerical.csv'
]

DT_location = decision_regressor_files(DIR_CSV + 'train/' , files_location, 10)

FEATURES_LOCATION_BOW.CSV

             feature           importance

          country_in		 0.11136710508725352
          country_de		 0.04050403418691573
          country_ph		0.035706586953601537
          country_pk		 0.03306242103190026
          country_ie		 0.03245270967868139
          country_it		0.030874738475341924
          country_pt		 0.02814441567364848
          country_ne		 0.02649798251473739
          country_be		 0.02628551625937181
          country_pe		 0.02626522248112751



FEATURES_LOCATION_TF_IDF.CSV

             feature           importance

          country_ph		 0.05067904679247153
          country_pk		 0.04962403467896274
          country_pt		 0.04132740981083982
          country_kr		 0.03802966740102323
          country_pe		0.038000933236004036
          country_vn		 0.03797223162479723
          country_ke		0.035075571409536925
          country_gh		 0.03082696752081513
          country_ro		0.029314981021507616
          country_ne		 0.028685799105

In [14]:
files_location_places = ['features_city_encoded.csv',
    'features_city_in_text_encoded.csv',
    'features_country_encoded.csv',
    'features_country_in_text_encoded.csv',
    'features_county_encoded.csv',
    'features_county_in_text_encoded.csv',
    'features_state_encoded.csv',
    'features_state_in_text_encoded.csv',
    'features_continent_encoded.csv',
    'features_continent_in_text_encoded.csv']

DT_location_places = decision_regressor_files(DIR_CSV + 'train/' , files_location_places, m)

FEATURES_CITY_ENCODED.CSV

             feature           importance

         city_Mumbai		0.024130735826127757
     city_Washington		0.023943222328165075
           city_York		0.021547079685354395
     city_Sacramento		0.012510485874967316
        city_Karachi		0.009170525971629376
       city_Coventry		0.007721289501625104
         city_Oregon		0.007579538694094621
        city_Calgary		0.006908095728359463
        city_Jakarta		0.006899548099176785
         city_Dundee		0.006897543565439925
        city_Concord		0.006891929720974993
     city_Birmingham		0.006888352077922535
      city_Nashville		0.0064121196747666045
       city_Oklahoma		0.006396158832887941
        city_Orlando		0.005722531824631753
        city_Memphis		 0.00521137097713341
     city_Pittsburgh		 0.00513011209163457
      city_Edinburgh		0.005124510003628129
          city_Dubai		0.004864341833943509
 city_Corpus Christi		0.004630410725532537
          city_Selma		0.004627870382245118
      city_Bangalore		0.00

FEATURES_STATE_IN_TEXT_ENCODED.CSV

             feature           importance

    state_California		  0.4961074525511288
        state_Hawaii		 0.05711911206584368
       state_Alabama		0.056061722932534634
      state_Oklahoma		 0.05112350262978203
            state_AL		 0.03666658891180527
    state_Washington		 0.03411316507677217
      state_Maryland		0.026302154670002218
            state_NC		 0.02111266832404009
            state_CA		0.021089926793594813
       state_Georgia		  0.0210672219874664
      state_Colorado		  0.0182850946435405
       state_Vermont		 0.01589419527449522
      state_Illinois		0.015881313311553104
     state_Tennessee		0.011498056412031546
      state_Michigan		0.011482512196246361
       state_Arizona		0.007996262805361804
            state_NY		0.005324617225432336
      state_Arkansas		0.0053231728697533485
  state_South Dakota		0.005321729101681555
        state_Kansas		0.005320285920910415
         state_Maine		0.005318843327112242
state_North Carol

## 4. Links

In [15]:
files_links = ['features_links_numerical.csv', "features_domain_bow.csv", "features_domain_tf_idf.csv", "features_domain_smooth_tf_idf.csv"]
DT_links = decision_regressor_files(DIR_CSV + 'train/' , files_links, m)

FEATURES_LINKS_NUMERICAL.CSV

             feature           importance

          links_cant		  0.8897528194199258
   cant_failed_links		  0.1102471805800743
                id.1		                 0.0



FEATURES_DOMAIN_BOW.CSV

             feature           importance

         domain_ebay		  0.0512237262884467
          domain_bbc		0.028344309024258493
         domain_news		0.027502150165272007
      domain_youtube		0.018201937085652382
      domain_twitter		0.015858354576239048
          domain_cue		 0.01576063041811373
      domain_latimes		0.015562245113309775
  domain_feedsportal		0.012090302911725921
      domain_abcnews		 0.01200395749289994
          domain_gov		0.011884700269472852
        domain_wired		 0.01096572017935843
    domain_billboard		0.009887723677175764
       domain_amazon		0.009712076902145955
    domain_instagram		0.009198231569809102
    domain_careerarc		0.008828618322410569
       domain_tumblr		0.008671780796415746
           domain_co		0.008118500832497

## 5. Tags

In [16]:
files_tags = ["features_tags_bow.csv", "features_tags_tf_idf.csv"]
DT_tags = decision_regressor_files(DIR_CSV + 'train/' , files_tags, m)

FEATURES_TAGS_BOW.CSV

             feature           importance

            tag_fire		 0.02531172565654727
       tag_hiroshima		0.022525718673642364
         tag_suicide		0.022401693126487286
              tag_mh		0.018473834147994972
           tag_train		0.018353270198961207
        tag_wildfire		  0.0177667809728171
          tag_killed		 0.01629354837726697
         tag_typhoon		0.014789121951377274
           tag_storm		0.012337473646370202
         tag_migrant		0.012051594398332654
           tag_spill		0.011867982649404833
     tag_legionnaire		0.011732570408719679
          tag_police		0.010950678418018377
          tag_people		0.010354849863703731
        tag_accident		   0.009982311790658
           tag_japan		0.009449557657171533
      tag_earthquake		0.009233879415076382
         tag_israeli		0.008688675542163073
         tag_airport		 0.00800283959889477
        tag_collapse		0.007834305833797671
        tag_building		0.006836657940898787
            tag_love		0.0064544

# IV) Conclusion

In [17]:
list_of_files = [
    files_keywords,
    files_text,
    files_location,
    files_location_places,
    files_links,
    files_tags,
]

In [18]:
# Information of desition trees for m or less features

DT = [
    DT_keywords,
    DT_text,
    DT_location,
    DT_location_places,
    DT_links,
    DT_tags,
]

In [19]:
# Function that filters those features with lesser importance than min_importance

def filter_importance(DT, min_importance):
    feature_importances, m_index, m_feature_importances, m_feature_name = DT
    
    res = []
    for i in range(len(m_feature_importances)):
        if(m_feature_importances[i]>= min_importance):
            res.append(m_feature_name[i])
    return res

In [20]:
# Create a dataframe with all the m features for a csv filtered 

min_importance = 0.00001

df_total = []

for i in range(len(list_of_files)):
    
    for j in range(len(list_of_files[i])):
        
        df = pd.read_csv(DIR_CSV + 'train/'+ list_of_files[i][j] ,header=0)
        
        cols = list(filter_importance(DT[i][j], min_importance))
        
        cols.append("id")
                               
        df_total.append(df[cols])

In [21]:
for df in df_total:
    print(list(df.columns),'\n')

['K_fire', 'K_bomb', 'K_storm', 'K_typhoon', 'K_debris', 'K_derail', 'K_police', 'K_wreckage', 'K_flood', 'K_outbreak', 'K_crash', 'K_oil spill', 'K_attack', 'K_disaster', 'K_earthquake', 'K_rescuer', 'K_accident', 'K_terrorism', 'K_hellfire', 'K_mass murder', 'K_bridge collapse', 'K_collision', 'K_emergency', 'K_dead', 'K_evacuation', 'K_massacre', 'K_casualty', 'K_death', 'K_sinkhole', 'K_drought', 'id'] 

['keyword_frequency', 'keywords_quantity', 'text_contains_keyword', 'keywords_mean', 'keyword_is_hashtag', 'id'] 

['youtube', 'arianagrande', 'foxnews', 'usagov', 'djicemoon', 'unsuckdcmetro', 'ap', 'mikeparractor', 'sharethis', 'nbcnews', 'smh', 'nasahurricane', 'mnpdnashville', 'invalid', 'usatoday', 'falphil', 'lonewolffur', 'local_arsonist', 'worldnetdaily', 'realdonaldtrump', 'diamondkesawn', 'barackobama', 'potus', 'blutz10', 'cityofcalgary', 'wordpressdotcom', 'gofundme', 'tflbusalerts', 'newyorker', 'abc', 'id'] 

['youtube', 'arianagrande', 'foxnews', 'usagov', 'djicemoon

In [22]:
df_final = train[["id", "target"]]

In [23]:
# Mergeo todos los dataframe

for df in df_total:
    df_final = df_final.merge(df, on="id", how="left")

len(df_final.columns)

598

In [24]:
X,y = get_clean_values(df_final)
DT_final = decision_regressor_feature_search(X,y,m)

In [25]:
print_feature_search(DT_final[-2],DT_final[-1],"{} features finales más importantes".format(m))

30 FEATURES FINALES MÁS IMPORTANTES

             feature           importance

 0.08532302647289022		       has_continent
  0.0849331096856288		   promedio_len_word
 0.06448291844041716		   keyword_frequency
  0.0583634649299587		       keywords_mean
0.050955708400987426		         #caracteres
 0.04911678897261174		            #silabas
0.038178528812039696		         #capitalize
   0.031696020493479		          #stopwords
 0.03161784762783094		          links_cant
 0.02856401267391948		    #palabras_unicas
 0.02028319486354765		         #puntuacion
0.018743028166076155		         #mayusculas
0.018591845651323945		#caracteres_especiales
0.017411468209324644		           #palabras
 0.01503432347964608		       #tagged_users
0.012742944822664418		              K_bomb
0.011788543866478311		text_contains_keyword
0.010378452635272274		             K_storm
0.009619645924085773		           #hashtags
0.008548272212785807		  #puntuacion_binned
0.007071574064695315		            tag_fire
0.006978431884

In [26]:
# m = 30 y min_importance = 0.00001
df_final.to_csv('features/feature_selection/features_decition_tree1.csv', encoding='utf-8')