In [1]:
import pandas as pd
import numpy as np
import mmh3
from sklearn.feature_extraction import FeatureHasher
from keras.utils import to_categorical
import time
import datetime
import progressbar

Using TensorFlow backend.


## Load data

In [2]:
# read_csv treats the first line of ftrs.txt as the header, so the parsed
# frame's column labels are the feature names; keep them as a plain list.
col_names = pd.read_csv('/Users/andraz/DataScienceSchool/summer_school/ftrs.txt').columns.tolist()
# Disabled variant that would prepend an extra "index" label:
#col_names = ["index"] + col_names

In [3]:
def load_data(filename, data_dir='/Users/andraz/DataScienceSchool/summer_school/'):
    """Read one CSV file and label its columns with the global ``col_names``.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str, optional
        Directory that holds the file. The default is the directory this
        notebook was written against, so existing ``load_data(filename)``
        calls read exactly the same path as before.

    Returns
    -------
    pandas.DataFrame
        The parsed file with its columns renamed, by position, to
        ``col_names``.

    Raises
    ------
    ValueError
        Raised by pandas when the number of parsed columns differs from
        ``len(col_names)``.

    Notes
    -----
    ``read_csv`` is called with its default header handling, so the first
    line of the file is consumed as a header and then overwritten by
    ``col_names``.
    NOTE(review): if the ctr_*.csv files carry no header row, their first
    record is silently lost here -- confirm against the data files.
    """
    import os  # local import: the notebook's import cell does not provide os

    data = pd.read_csv(os.path.join(data_dir, filename))
    data.columns = col_names
    return data

## Select columns

In [4]:
def remove_cols(data, selected_feature_names):
    """Return ``data`` without the known columns that were not selected.

    The labels to discard are every name in the global ``col_names`` that
    is absent from ``selected_feature_names``. ``drop`` returns a new frame,
    so the frame passed in is not modified. Selected names that do not occur
    in ``col_names`` have no effect.
    """
    unwanted = set(col_names).difference(selected_feature_names)
    return data.drop(columns=list(unwanted))

## Encode yes/no

In [5]:
def yesno_to_int(attr):
    """Return 1 when ``attr`` equals the exact string 'yes', otherwise 0."""
    return 1 if attr == 'yes' else 0

def yesno(column, feature_name):
    """Encode one yes/no column of a DataFrame as 0/1 integers.

    Parameters
    ----------
    column : pandas.DataFrame
        The frame that contains the column. Despite its name this is the
        whole frame, not a single column; it is indexed by ``feature_name``.
    feature_name : str
        Label of the column to encode.

    Returns
    -------
    pandas.Series
        int64 Series on the same index as ``column``: 1 where the cell equals
        the exact string 'yes', 0 for everything else (including missing
        values). This is the same rule ``yesno_to_int`` applies to a single
        value.

    Raises
    ------
    KeyError
        If ``feature_name`` is not a column of ``column``.
    """
    # One vectorised comparison over the column. The previous row-wise
    # ``apply(axis=1)`` materialised a Series for every row just to read a
    # single cell from it.
    return (column[feature_name] == 'yes').astype('int64')

def encode_yes_no(data, features_to_encode):
    """Replace each listed yes/no column of ``data`` with its 0/1 encoding.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place: every listed column is
        overwritten with the result of ``yesno``.
    features_to_encode : sequence of str
        Labels of the columns to encode. Must support ``len``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    print("*** Encoding yes/no ***")
    pbar = progressbar.ProgressBar(maxval=len(features_to_encode)).start()

    # enumerate supplies the running count directly. Looking the position up
    # with list.index() rescans the list on every step and reports the wrong
    # progress when a feature name occurs more than once.
    for n_encoded, feature in enumerate(features_to_encode, start=1):
        data[feature] = yesno(data, feature)
        pbar.update(n_encoded)
    # Close the bar so its final state is flushed before later output.
    pbar.finish()
    return data

## Hash

In [6]:
def hash_col(column, feature_name, n_buckets=2**20):
    """Hash every value of one DataFrame column into integer buckets.

    Each value is converted with ``str`` and hashed with
    ``mmh3.hash(..., signed=False)``; the result is reduced modulo
    ``n_buckets``.

    Parameters
    ----------
    column : pandas.DataFrame
        The frame that contains the column. Despite its name this is the
        whole frame, not a single column; it is indexed by ``feature_name``.
    feature_name : str
        Label of the column to hash.
    n_buckets : int, optional
        Size of the hash space. Defaults to 2**20, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.Series
        Bucket ids in ``[0, n_buckets)`` on the same index as ``column``.

    Raises
    ------
    KeyError
        If ``feature_name`` is not a column of ``column``.

    Notes
    -----
    The values are mapped straight from the column. The previous row-wise
    ``apply(axis=1)`` built one Series per row, which is slow and gives each
    row a common dtype across all columns: in an all-numeric frame an integer
    such as 1 could reach ``str`` as 1.0 and hash as "1.0". Hashing the
    column directly makes the result depend only on the column itself.
    """
    def to_bucket(value):
        return mmh3.hash(str(value), signed=False) % n_buckets

    return column[feature_name].map(to_bucket)

## Process data

In [2]:
# Columns kept by remove_cols; every other name listed in col_names is dropped.
# NOTE(review): 'imo_adslots' and 'dailiy_budget' look like possible
# misspellings (compare the 'imp_*' names). remove_cols ignores selected names
# that are not in col_names, so a misspelt entry would be dropped without any
# error -- confirm both against ftrs.txt.
selected_feature_names = [
    'click', 'account_id', 'adgroup_id', 'titlehash', 'urldomain',
    'os2', 'devicemodel', 'browser', 'country',
    'imo_adslots', 'imp_adunit', 'imp_layout', 'imp_seq', 'imp_num', 'impcount',
    'nslashes', 'devicetype', 'conntype',
    'fp_nclicks_dom_1h', 'fp_nseen_all_1h', 'fp_nclicks_all_1h', 'fp_nseen_dom_1h',
    'metro', 'bid_floor', 'dailiy_budget', 'win_price',
    'slideshow', 'pictures', 'gallery', 'listicle', 'quiz', 'video',
    'has_zuid', 'has_clicked',
]

# Columns whose 'yes' cells become 1 and all other cells 0.
yesno_features_to_encode = [
    'slideshow', 'pictures', 'listicle', 'quiz',
    'video', 'has_zuid', 'has_clicked', 'gallery',
]

# Columns replaced by a hash bucket id.
features_to_hash = [
    'browser', 'os2', 'urldomain', 'account_id',
    'titlehash', 'country', 'adgroup_id', 'devicemodel',
]

# File numbers to process: range(start_index, end_index), so end_index is
# exclusive and files 1 through 25 are handled.
start_index = 1
end_index = 26

In [10]:
# Process ctr_NNN.csv for every file number in [start_index, end_index):
# load, keep the selected columns, encode yes/no columns, hash the listed
# columns, then write the result next to the input file.
for fileindex in range(start_index, end_index):
    start = time.time()
    filename = "ctr_{:03d}.csv".format(fileindex)
    print("Processing {} [{}/{}]".format(filename, fileindex, end_index-1))

    # Load data
    print("Loading data ...")
    data = load_data(filename)
    print("Done")

    # Select columns
    data = remove_cols(data, selected_feature_names)
    print("Removed features")

    # encode yes/no
    data = encode_yes_no(data, yesno_features_to_encode)

    # Hash
    print('*** Hashing ***')
    pbar2 = progressbar.ProgressBar(maxval=len(features_to_hash)).start()

    # enumerate gives the running count directly; list.index() rescans the
    # list on every step and misreports progress if a name is repeated.
    for n_hashed, feature in enumerate(features_to_hash, start=1):
        data[feature] = hash_col(data, feature)
        pbar2.update(n_hashed)
    # Close the bar so its final state is flushed before the next prints.
    pbar2.finish()

    # Save data
    # The output name is the input name plus a suffix, e.g.
    # "ctr_001.csv_processed.csv"; kept unchanged so existing output files
    # and anything reading them keep working.
    data.to_csv('/Users/andraz/DataScienceSchool/summer_school/' + filename + '_processed.csv', index=False)
    print(filename, "done")
    end = time.time()
    elapsed = end - start
    print("Elapsed: ", str(datetime.timedelta(seconds=elapsed)))
    print("----------------------------")
    print()
    

Processing ctr_001.csv [1/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_001.csv done
Elapsed:  0:05:34.042295
----------------------------

Processing ctr_002.csv [2/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_002.csv done
Elapsed:  0:04:32.647845
----------------------------

Processing ctr_003.csv [3/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_003.csv done
Elapsed:  0:06:08.049908
----------------------------

Processing ctr_004.csv [4/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_004.csv done
Elapsed:  0:06:07.118241
----------------------------

Processing ctr_005.csv [5/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_005.csv done
Elapsed:  0:06:10.107860
----------------------------

Processing ctr_006.csv [6/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_006.csv done
Elapsed:  0:06:00.025540
----------------------------

Processing ctr_007.csv [7/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_007.csv done
Elapsed:  0:05:41.787830
----------------------------

Processing ctr_008.csv [8/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_008.csv done
Elapsed:  0:06:31.354910
----------------------------

Processing ctr_009.csv [9/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_009.csv done
Elapsed:  0:06:30.712692
----------------------------

Processing ctr_010.csv [10/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_010.csv done
Elapsed:  0:06:39.162799
----------------------------

Processing ctr_011.csv [11/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_011.csv done
Elapsed:  0:07:03.845140
----------------------------

Processing ctr_012.csv [12/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_012.csv done
Elapsed:  0:07:34.044458
----------------------------

Processing ctr_013.csv [13/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_013.csv done
Elapsed:  0:07:09.550672
----------------------------

Processing ctr_014.csv [14/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_014.csv done
Elapsed:  0:07:40.916862
----------------------------

Processing ctr_015.csv [15/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_015.csv done
Elapsed:  0:06:47.988171
----------------------------

Processing ctr_016.csv [16/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_016.csv done
Elapsed:  0:05:50.654484
----------------------------

Processing ctr_017.csv [17/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_017.csv done
Elapsed:  0:05:21.041876
----------------------------

Processing ctr_018.csv [18/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_018.csv done
Elapsed:  0:06:20.289765
----------------------------

Processing ctr_019.csv [19/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_019.csv done
Elapsed:  0:05:41.806459
----------------------------

Processing ctr_020.csv [20/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_020.csv done
Elapsed:  0:05:11.983247
----------------------------

Processing ctr_021.csv [21/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_021.csv done
Elapsed:  0:05:34.881354
----------------------------

Processing ctr_022.csv [22/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_022.csv done
Elapsed:  0:05:30.593956
----------------------------

Processing ctr_023.csv [23/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_023.csv done
Elapsed:  0:04:47.389536
----------------------------

Processing ctr_024.csv [24/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_024.csv done
Elapsed:  0:05:09.752571
----------------------------

Processing ctr_025.csv [25/25]
Loading data ...
Done


  0% |                                                                        |

Removed features
*** Encoding yes/no ***


  0% |                                                                        |

*** Hashing ***


100% |########################################################################|

ctr_025.csv done
Elapsed:  0:05:50.839121
----------------------------



In [9]:
# Show the row with index label 0 of whatever frame `data` currently holds
# (after the processing loop has run, that is the last file it handled).
data.loc[0]

click                     0.000000
account_id           819181.000000
adgroup_id           777402.000000
titlehash            110162.000000
imp_adunit                2.000000
imp_layout                7.000000
imp_seq                   0.000000
imp_num                   0.000000
impcount                  1.000000
urldomain            527445.000000
nslashes                  4.000000
slideshow                 0.000000
pictures                  0.000000
gallery                   0.000000
listicle                  0.000000
quiz                      0.000000
video                     0.000000
devicetype                2.000000
os2                   60742.000000
devicemodel          430763.000000
conntype                  0.000000
browser              831090.000000
has_zuid                  1.000000
has_clicked               0.000000
fp_nclicks_dom_1h         0.000000
fp_nseen_all_1h           9.000000
fp_nclicks_all_1h         0.000000
fp_nseen_dom_1h           0.000000
country             

In [3]:
# Number of entries in the list of column names the pipeline keeps.
len(selected_feature_names)

34