In [2]:
"""
Script to extract key words/ build feature matrix/ build weighted random forest model to predict browser family.
Author: Yue Wem
"""

import sys
import time
import pandas as pd
import numpy as np
import collections
import random
import operator
import re
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

# input parameters
#training_path_of_training_data = "datasets/data_coding_exercise.txt"
training_path_of_training_data = str(sys.argv[1])
#test_path_of_test_data = "datasets/test_data_coding_exercise.txt"
est_path_of_test_data = str(sys.argv[2])
#prediction_results_path_of_output_data = "datasets/prediction_results_path_of_output_data.txt"
prediction_results_path_of_output_data  = str(sys.argv[3])

# function
def find_str(search_str,target_str):
    """
    determine whether the target_str is in the search_str
    
    Parameters
    ----------
    search_str: str
        the str to be searched
    target_str: str
        the target str
    
    Returns
    -------
    bool:
        whether the target str is in the search  str
    """
    index = search_str.find(target_str)
    if index == -1:
        return False
    else:
        return True

def is_int(string):
    """
    determine whether a string is int
    
    Parameters
    ----------
    string: str

    Returns
    -------
    bool:
        whether the string can be transferred to integer
    """
    try: 
        int(string)
        return True
    except:
        return False
    

def find_pairs(agent_str,stop_words = ["Mozilla"]):
    """
    Find all the sequencial(word,num) pair in a string
    
    Parameters:
    -----------
    agent_str: str
        a str descripes user agent
    stop_words: list
        a list of words that will not appear in the final results
    Returns:
    -------
    dict:
        return the (word,list) pair as a dictionary, using list is to record 
        different num with the same word.
    """
    results = collections.defaultdict(list)
    word_lst = re.split("\W+",agent_str.replace("_","."))
    for index,word in enumerate(word_lst):
        if is_int(word):
            if not is_int(word_lst[index-1]) and word_lst[index-1] not in stop_words:
                results[ word_lst[index-1] ].append(word)
    
    return results


# extract key_words with high frequency within each family to build feature matrix for building feature matrix
def count_key_words(family_dict):
    """
    For each family,extract key words(with num right behind), and the occurences
    
    Parameters:
    -----------
    family_dict: dict
        a dictionary of which key is family(str),
        value is dataframe with three files of this family

    Returns:
    -------
    dict:
        return the (family,dict) pair as a dictionary, with each dictionary like 
        (key_word,count of occurences)
    """
    
    family_key_word_cnt_dict = collections.defaultdict(dict)
    for family in list(family_dict):
        # run over the dataframe of a certain family row by row
        family_key_word_cnt_dict[family] = dict()
        for i  in range(len(family_dict[family])):
            row = family_dict[family].iloc[i]
            agent_str = row.agents
            version = row.version

            word_num_pair = find_pairs(agent_str)

            # serach the key_word, version pair
            key_words = [key for key,values in word_num_pair.items() for value in values if value == version]

            for key_word in key_words:
                if key_word not in family_key_word_cnt_dict[family].keys():
                     family_key_word_cnt_dict[family][key_word] = 1
                else:
                    family_key_word_cnt_dict[family][key_word] += 1
    return family_key_word_cnt_dict

#from key words build feature matrix
def build_feature_matrix(key_words_lst,data):
    """
    build feature matrix based on key_words, with each column reprenting the exisitence of one key word
    
    Parameters
    ----------
    data: pd.DataFrame
        original data set format with three fields: agents, family, version
    key_words: list/set
        a list/set of key words that will be built to features
    
    Returns
    -------
    pd.DataFrame:
        a pandas dataframe that contains the features indicating whether the the agent str contains the key word
    """

    feature_matrix = data.copy()
    for word in key_words_lst:
        feature_matrix[word] = feature_matrix.agents.apply(lambda x: find_str(x,word)).astype("int")
    del feature_matrix["agents"]
    del  feature_matrix["version"]
    return feature_matrix

# split label  and feature for building random forest model
def feature_label_seperate(feature_matrix,encode_dict):
    """
    split feature matrix to features and label
    Parameters
    ----------
    feature_matrix: pd.DataFrame
        feature_matrix of which the 0th columnn represents label family
    encode_dict: dict
        dictionary used to encode the label to number

    Returns
    -------
    np.array,np.array:
        feature matrix and label as np.array
    
    """
    X = feature_matrix.iloc[:,1:].values
    
    y = feature_matrix.family
    y.replace(encode_dict, inplace = True)
    return X,y

#predict_version
def predict_version(agent_str,family,sorted_family_key_word_cnt_dict):
    """
    Given agent_str, family, predict the version of the agent
    
    Parameters:
    -----------
    agent_str: str
    family: str
    sorted_family_key_word_cnt_dict:dict
        key,value: (family,list), value is a list of key words,which is sorted from high to low by the occurences
        out of all the agents with the same family   
    Returns:
    -------
    str:
        the predicted version of the agent
    """
    candidates_words_lst = sorted_family_key_word_cnt_dict[family]
    words_num_pairs =  find_pairs(agent_str)
    try:    
        for candidate in candidates_words_lst:
            if candidate in set(words_num_pairs.keys()):
                return words_num_pairs[candidate][0]
        return "None"
    except:
        return "None"
                

#  Main function:  pipline  - main part
def predict_browser_and_version(training_path_of_training_data,test_path_of_test_data,prediction_results_path_of_output_data,verbose = True):
    """
    The is the pipeline function from extract key words, build RF model to predict browser and version.
    
    Parameters:
    -----------
    agent_str: str
    family: str
    sorted_family_key_word_cnt_dict:dict
        key,value: (family,list), value is a list of key words,which is sorted from high to low by the occurences
        out of all the agents with the same family   
    Returns:
    -------
    str:
        the predicted version of the agent
    """
        
    # read train and test data
    schema = ["agents","family","version"]
    data = pd.read_csv(training_path_of_training_data,delimiter = "\t",header= None, names = schema)
    test = pd.read_csv(test_path_of_test_data,delimiter = "\t",header= None, names = schema)

    # track time used
    tic = time.time()

    # remove missing datad
    data = data[data.version != "None"].reset_index(drop = True)
    #extract all the 28 family
    data_family = set(data.family.unique())
    # create a dictionary to put the sub-dataframe of each family
    family_dict = {}
    for family in data_family:
        family_dict[family] = data[data.family == family]

    # generate the sorted key word list for each family
    key_word_cnt_dict = count_key_words(family_dict)

    #sort the key_word by its occurences from to high to low
    sorted_key_word_cnt_dict = collections.defaultdict(list)
    sorted_key_word_dict = collections.defaultdict(list)
    for family in list(key_word_cnt_dict.keys()):
        sorted_key_word_cnt_dict[family] =[ (key,value) for key,value in sorted(key_word_cnt_dict[family].items(), 
                                                                                         key = lambda x: x[1], reverse = True)]
        sorted_key_word_dict[family] =[ key for key,value in  sorted_key_word_cnt_dict[family]]

    family_cnt = dict(data.groupby("family").count()["version"])
    key_words = set()
    for family in list(sorted_key_word_cnt_dict.keys()):
        filter_noise_word = set([ key for key,cnt in sorted_key_word_cnt_dict[family] if cnt >= 0.01 * family_cnt[family]])
        key_words = key_words.union(filter_noise_word) 


    toc1 = time.time()
    if verbose:
        print('It takes {} seconds to extract key words.'.format( toc1-tic))

    # build feature matrix
    feature_matrix = data.copy()
    feature_matrix =  build_feature_matrix(key_words.union({"FB","Mobile Safari"}),data)


    # feature, label split
    encode_dictionary = dict((family,i)  for i,family in enumerate(list(data_family)))
    decode_dictionary = dict((value,key) for key,value in encode_dictionary.items())
    X, y = feature_label_seperate(feature_matrix, encode_dictionary)

    model_rfc = RandomForestClassifier(max_depth = None, class_weight = "balanced",n_estimators = 100, min_samples_leaf = 5,random_state = 0)
    model_rfc.fit(X, y)
    y_pred  = model_rfc.predict(X)
    
    if verbose:
        print("Browser family prediction: \nThe accuracy of the model on training set is {}".format(model_rfc.score( X, y)))
        print("The claissification report is as  follows:")
        print(classification_report(y,y_pred))

    # predict on test data set:
    feature_matrix_test = build_feature_matrix(key_words.union({"FB","Mobile Safari"}),
                                               test)
    X_test, y_test = feature_label_seperate(feature_matrix_test, encode_dictionary)
    y_pred_test = model_rfc.predict(X_test)
    
    if verbose:
        print("Browser family prediction:\nThe accuracy of the model on test set is {}.".format(model_rfc.score( X_test, y_test)))
        print("The claissification report is as  follows:")
        print(classification_report(y_test,y_pred_test))

    toc2 = time.time()
    if verbose:
        print('It takes {} seconds to build RF model and  predict the browser family.'.format( toc2-toc1))

    # predict the results
    predicted_version_using_actual_family =  data.apply(lambda x: 
                                    predict_version(x.agents,x.family,sorted_key_word_dict),1)

    accuracy_using_actual_family = np.mean(predicted_version_using_actual_family == data["version"])
    
    if verbose:
        print("Version prediction: \nThe accuracy using actual browser family on the training  set is {}".format(accuracy_using_actual_family))


    #transfer predicted result from number to str
    y_pred_decode = pd.Series(y_pred).apply( lambda x: decode_dictionary[x] )
    data["predicted_family"] = y_pred_decode

    predicted_version_using_predicted_family =  data.apply(lambda x: 
                                    predict_version(x.agents,x["predicted_family"],sorted_key_word_dict),1)

    accuracy_using_predicted_family = np.mean(predicted_version_using_predicted_family == data["version"])
    
    if verbose:
        print("The accuracy using predicted browser family on the training  set is {}".format(accuracy_using_predicted_family))

    data["predicted_version"] = predicted_version_using_predicted_family

    # predict_test
    y_pred_test_decode = pd.Series(y_pred_test).apply( lambda x:decode_dictionary[x] )
    test["predicted_family"] = y_pred_test_decode
    y_pred_version_test_actual =  test.apply(lambda x: 
                                    predict_version(x.agents,x.family,sorted_key_word_dict),1)

    test_accuracy_actual = np.mean(y_pred_version_test_actual == test["version"].astype(str))
    
    if verbose:
        print("The accuracy using actual browser family on the test set is {}".format(test_accuracy_actual))

    y_pred_test_decode = pd.Series(y_pred_test).apply( lambda x:decode_dictionary[x] )
    test["predicted_family"] = y_pred_test_decode
    y_pred_version_test_pred =  test.apply(lambda x: 
                                    predict_version(x.agents,x.predicted_family,sorted_key_word_dict),1)

    test_accuracy_pred = np.mean(y_pred_version_test_actual == test["version"].astype(str))
    
    if verbose:
        print("The accuracy using predicted browser family on the test set is {}".format(test_accuracy_actual))
    
    test["predicted_version"] = y_pred_version_test_pred 

    toc = time.time()
    if  verbose:
        print('It takes {} seconds to predict the agent version.'.format( toc-toc2))
        print('In total, it takes {} seconds.'.format( toc-tic))

    test.to_csv(prediction_results_path_of_output_data,header = None, index=None, sep='\t')

predict_browser_and_version(training_path_of_training_data,test_path_of_test_data,prediction_results_path_of_output_data,verbose = True)

It takes 87.16698551177979 seconds to extract key words.
Browser family prediction: 
The accuracy of the model on training set is 0.9959290891990233
The claissification report is as  follows:
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       107
          1       1.00      1.00      1.00      5595
          2       1.00      1.00      1.00       911
          3       0.55      1.00      0.71       120
          4       0.99      1.00      1.00       305
          5       0.97      1.00      0.99      1281
          6       1.00      1.00      1.00    308656
          7       1.00      1.00      1.00     17426
          8       0.97      0.99      0.98       275
          9       0.68      1.00      0.81      1744
         10       1.00      0.84      0.91      2378
         11       1.00      1.00      1.00       116
         12       1.00      0.96      0.98       248
         13       1.00      1.00      1.00       224
         14 