# Description
* DataLoad: loads individual raw train/test tables
* Preprocessing: all table preprocessing, final merge, returns train/test_dataset
* Train: loads train/test_dataset, trains model, train_test_split, data, evaluates using f1-score, returns model
* Predict: loads model, train_dataset and test_dataset, compares fills missing columns with zeros, .predict, returns predictions_array
* Submit: loads predictions_array, generates submission_df, saves to csv in submissions folder
* Pipeline: Wrapper for whole process

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datetime import datetime

In [2]:
TODAY = datetime.today().strftime('%Y-%m-%d')
TARGET = 'upgrade'
ID = 'line_id'
ROOT_PATH = 's3://tf-trachack-notebooks/'+'9417-brhuang-unsw'+'/jupyter/jovyan/'
DATA_PATH = 's3://tf-trachack-data/212/'
SUBMISSION_PATH = ROOT_PATH+f"submission/{TODAY}.csv"

## DataLoad Class

In [3]:
class DataLoad:
    def get_upgrades(training:bool) -> pd.DataFrame:
        training_string = "dev" if training else "eval"
        return pd.read_csv(DATA_PATH+f"data/{training_string}/upgrades.csv")

    def get_customer_info(training:bool) -> pd.DataFrame:
        training_string = "dev" if training else "eval"
        return pd.read_csv(DATA_PATH+f"data/{training_string}/customer_info.csv")

    def get_redemptions(training:bool) -> pd.DataFrame:
        training_string = "dev" if training else "eval"
        return pd.read_csv(DATA_PATH+f"data/{training_string}/redemptions.csv")
    
    def get_deactivations(training:bool) -> pd.DataFrame:
        training_string = "dev" if training else "eval"
        return pd.read_csv(DATA_PATH+f"data/{training_string}/deactivations.csv")
    
    def get_reactivations(training:bool) -> pd.DataFrame:
        training_string = "dev" if training else "eval"
        return pd.read_csv(DATA_PATH+f"data/{training_string}/reactivations.csv")
    
    def get_suspensions(training:bool) -> pd.DataFrame:
        training_string = "dev" if training else "eval"
        return pd.read_csv(DATA_PATH+f"data/{training_string}/suspensions.csv")
    
    def get_phone_info(training:bool) -> pd.DataFrame:
        training_string = "dev" if training else "eval"
        return pd.read_csv(DATA_PATH+f"data/{training_string}/phone_info.csv")

    def get_network_usage_domestic(training:bool) -> pd.DataFrame:
        training_string = "dev" if training else "eval"
        return pd.read_csv(DATA_PATH+f"data/{training_string}/network_usage_domestic.csv")
        
    def get_lrp_points(training:bool) -> pd.DataFrame:
        training_string = "dev" if training else "eval"
        return pd.read_csv(DATA_PATH+f"data/{training_string}/lrp_points.csv")

    def get_lrp_enrollment(training:bool) -> pd.DataFrame:
        training_string = "dev" if training else "eval"
        return pd.read_csv(DATA_PATH+f"data/{training_string}/lrp_enrollment.csv")

## Helper Class

In [18]:
class Helper:
    def drop_infrequent_categories(df: pd.DataFrame, cols: list, thresh: int = 10000, replace_with: str = "other"):
            for col in cols:
                value_counts = df[col].value_counts()
                to_remove = value_counts[value_counts <= thresh].index
                df[col].replace(to_remove, replace_with, inplace=True)
                
    def create_var_sum(df, save=None):
        """
        Input: 
          df = data frame to summarise
          save = string name of dataframe you wish to save
        Returns:
          A dataframe that breaks down all variables in a dataframe by unique, missing, datatype, and frequency,
          Option to save the file
        """
        result = pd.DataFrame({
            'Col':          df.columns,
            'Unique':       [df[col].nunique() for col in df.columns],
            'Missing':      [len(df[col]) - df[col].count() for col in df.columns],
            'Datatype':     list(df.dtypes),
            'Most Freq':    [list(df[col].value_counts().index[:5]) for col in df.columns],
            'Least Freq':   [list(df[col].value_counts().index[-5:]) for col in df.columns]})

        if save != None:
            result.to_excel(f"{WRKDIR}/Data/new/'VariableSummary{save}.xls", index = False)

        return result
    
    def get_nonmissing_cols_and_warn(df: pd.DataFrame, cols, verbose=False):
        missing_cols = [x for x in cols if x not in df.columns]
        if len(missing_cols) > 0:
            if verbose:
                print("The following columns are not in the source data:\n", missing_cols)
            cols = [x for x in cols if x not in missing_cols]
        return cols

## Preprocessing Class

In [6]:
class Preprocessing:
    def preprocess_customer_status(training:bool) -> pd.DataFrame: ## works!
        # loading the data
        upgrades_df = DataLoad.get_upgrades(training=training)
        customer_info_df = DataLoad.get_customer_info(training=training)
        redemptions_df = DataLoad.get_redemptions(training=training)
        deactivations_df = DataLoad.get_deactivations(training=training)
        reactivations_df = DataLoad.get_reactivations(training=training)
        suspensions_df = DataLoad.get_suspensions(training=training)
        
        # cleaning upgrades
        if training: 
            upgrades_df.upgrade.replace(('yes', 'no'), (1, 0), inplace=True)
        upgrades_df.drop(['date_observed'],axis=1, inplace=True)
        
        # cleaning customer_info ## WORKS!
        customer_info_df.replace('[NULL]',np.NaN, inplace=True)
        customer_info_df.drop(['plan_subtype'], axis = 1, inplace=True)
        customer_info_df['plan_name'].fillna("not_given", inplace=True)
        customer_info_df.drop(['redemption_date', 'first_activation_date'], axis=1,inplace=True)
        ci_cat_features_df = customer_info_df.loc[:,['line_id','carrier', 'plan_name']]
        ci_cat_features_df = pd.get_dummies(ci_cat_features_df, columns=['carrier', 'plan_name'])
        clean_customer_info_df = pd.merge(customer_info_df, ci_cat_features_df, on='line_id', how='inner')
        clean_customer_info_df.drop(['plan_name', 'carrier'], axis = 1, inplace=True)
        
        # cleaning redemptions ## WORKS!
        redemptions_df.drop(['channel','redemption_type','revenue_type', 'gross_revenue'], axis=1, inplace=True)
        redemption_count_per_line_id = redemptions_df.groupby(['line_id'])['redemption_date'].count()
        redemptions_df = pd.merge(redemptions_df,redemption_count_per_line_id, on='line_id',how='inner')
        redemptions_df.drop(['redemption_date_x'], axis=1, inplace=True)
        redemptions_df.rename(columns = {'redemption_date_y':'total_redemptions'}, inplace=True)
        clean_redemptions_df = redemptions_df.drop_duplicates()
        
        # cleaning suspensions ## WORKS!
        suspensions_cat_df = suspensions_df.groupby('line_id')
        suspensions_total_df = suspensions_cat_df.count()
        rename_dict = {'suspension_start_date':'total_suspensions',
                       'suspension_end_date':'total_unsuspensions',
                      }
        clean_suspensions_df = suspensions_total_df.rename(columns=rename_dict)
        
        # clean deactivations ## works !
        deactivations_total_df = deactivations_df[['line_id', 'deactivation_date']].groupby('line_id')
        deactivations_total_df = deactivations_total_df.count()
        clean_deactivations_df = deactivations_total_df.rename(columns={'deactivation_date':'total_deactivations'})
        
        # clean reactivations ## works !
        reactivations_total_df = reactivations_df[['line_id', 'reactivation_date']].groupby('line_id')
        reactivations_total_df = reactivations_total_df.count()
        reactivations_total_df.rename(columns={'reactivation_date':'total_reactivations'}, inplace=True)
        
        reactivations_dropping = reactivations_df.copy(deep = True)
        Helper.drop_infrequent_categories(reactivations_dropping, ['reactivation_channel'], thresh = 2000)
        reactivations_cat_features_df = reactivations_dropping.loc[:,['line_id','reactivation_channel']]
        reactivations_cat_features_df = pd.get_dummies(reactivations_cat_features_df, columns=['reactivation_channel'])
        reactivations_cat_features_df = reactivations_cat_features_df.groupby('line_id')
        reactivations_cat_features_df = reactivations_cat_features_df.sum()
        
        clean_reactivations_df = pd.merge(reactivations_total_df, reactivations_cat_features_df,on='line_id',how='inner')
        
        # merging ## works!
        final = clean_customer_info_df
        
        if training: 
            final = pd.merge(upgrades_df,final,on='line_id',how='inner')
        
        final = pd.merge(final, clean_redemptions_df,on='line_id',how='left')
        final = pd.merge(final, clean_suspensions_df,on='line_id',how='left')
        final = pd.merge(final, clean_deactivations_df,on='line_id',how='left')
        final = pd.merge(final, clean_reactivations_df,on='line_id',how='left')
        final.fillna(0, inplace=True)
        return final
        
    def preprocess_lrp(training:bool) -> pd.DataFrame:
        lrp_points = DataLoad.get_lrp_points(training=training)
        lrp_enrollment = DataLoad.get_lrp_enrollment(training=training)
        upgrades = DataLoad.get_upgrades(training=training)
        
        if training: 
            upgrades.upgrade.replace(('yes', 'no'), (1, 0), inplace=True)
        
        lrpp = lrp_points.copy()
        lrp_combine = lrpp.merge(lrp_enrollment, on='line_id', how='outer')
        lrp_combine.drop('status', axis=1, inplace=True)
        lrp_combine['quantity'].replace({np.nan:0},inplace=True)
        lrp_combine['total_quantity'].replace({np.nan:0},inplace=True)
        now = datetime.now()
        lrp_combine['length_of_membership'] = now - pd.to_datetime(lrp_combine['lrp_enrollment_date'], format='%Y-%m-%d')
        lrp_combine['length_of_membership'] = lrp_combine['length_of_membership'].apply(lambda x: x.days)
        lrp_combine['update_date'] = lrp_combine.apply(lambda x: x['lrp_enrollment_date'] if pd.isna(x['update_date']) else x['update_date'], axis=1)
        lrp_combine['last_interaction'] = (now - pd.to_datetime(lrp_combine['update_date'], format='%Y-%m-%d')).apply(lambda x: x.days)
        lrp_combine.drop(['update_date','lrp_enrollment_date'], axis=1, inplace=True)
        
        # joining with upgrade
        up_lrp = upgrades.merge(lrp_combine, on='line_id', how='left')
        up_lrp.lrp_enrolled.replace(('Y', np.nan), (1, 0), inplace=True)
        up_lrp['total_quantity'] = up_lrp.apply(lambda x: 0 if x['lrp_enrolled'] == 0 else x['total_quantity'], axis=1)
        up_lrp['quantity'] = up_lrp.apply(lambda x: 0 if x['lrp_enrolled'] == 0 else x['quantity'], axis=1)
        up_lrp['length_of_membership'] = up_lrp.apply(lambda x: 0 if x['lrp_enrolled'] == 0 else x['length_of_membership'], axis=1)
        up_lrp['last_interaction'] = up_lrp.apply(lambda x: 0 if x['lrp_enrolled'] == 0 else x['last_interaction'], axis=1)
        
        # removing dups
        up_lrp = up_lrp[up_lrp.groupby('line_id')['length_of_membership'].transform('min') == up_lrp['length_of_membership']]
        up_lrp.drop('date_observed', axis=1, inplace=True)
        return up_lrp
        
    def preprocess_phone_info(training:bool) -> pd.DataFrame:
        phone_info = DataLoad.get_phone_info(training=training)
        upgrades = DataLoad.get_upgrades(training=training)
        
        # cleaning upgrades
        if training: 
            upgrades.upgrade.replace(('yes', 'no'), (1, 0), inplace=True)
        upgrades.drop(['date_observed'],axis=1, inplace=True)
        
        # cleaning phone_info
            # numerical fillnas
        num_cols = ['expandable_storage', 'lte', 'lte_advanced', 'lte_category', 
                    'touch_screen', 'wi_fi', 'year_released']
        for col in num_cols:
            phone_info[col].fillna(-1.0, inplace=True)
        
        cat_cols = ['cpu_cores', 'manufacturer', 'os_family', 'os_name', 'os_vendor', 'os_version', 
                    'sim_size', 'total_ram', 'gsma_device_type', 'gsma_model_name', 'gsma_operating_system', 
                    'internal_storage_capacity']
        for col in cat_cols:
            phone_info[col].fillna("none", inplace=True)
        
        # should also drop gsma_operating_system
        drop_cols = ['lte_advanced','wi_fi','touch_screen','gsma_model_name','os_family','os_vendor', 
                     'year_released', 'lte', 'sim_size']
        phone_info.drop(drop_cols, axis =1, inplace = True)
        
        Helper.drop_infrequent_categories(phone_info, ['lte_category'], thresh = 2000)
        Helper.drop_infrequent_categories(phone_info, ['manufacturer'], thresh = 900)
        Helper.drop_infrequent_categories(phone_info, ['os_name'], thresh = 10000)
        Helper.drop_infrequent_categories(phone_info, ['os_version'], thresh = 1000)
        Helper.drop_infrequent_categories(phone_info, ['total_ram'], thresh = 2000)
        Helper.drop_infrequent_categories(phone_info, ['cpu_cores'], thresh = 2000)
        Helper.drop_infrequent_categories(phone_info, ['gsma_device_type'], thresh = 11000)
        phone_info['gsma_operating_system'].replace(['Not Known', 'NONE'], ['none', 'none'], inplace = True)
        Helper.drop_infrequent_categories(phone_info, ['gsma_operating_system'], thresh = 12000)
        Helper.drop_infrequent_categories(phone_info, ['internal_storage_capacity'], thresh = 6000)
        
        # one hot encode data
#         cat_features = list(phone_info.columns).remove(ID)
        pi_cat_features = pd.get_dummies(phone_info, columns=['cpu_cores', 'expandable_storage','gsma_device_type','gsma_operating_system','internal_storage_capacity','lte_category','manufacturer','os_name','os_version','total_ram'])
        clean_phone_info = pd.merge(upgrades,pi_cat_features,on='line_id',how='inner')
        
        return clean_phone_info
    
    def preprocess_network_usage_domestic(training:bool) -> pd.DataFrame:
        network_usage_domestic = DataLoad.get_network_usage_domestic(training=training)
        upgrades = DataLoad.get_upgrades(training=training)
        last_date = network_usage_domestic[['line_id','date']].groupby(['line_id']).max()
        first_date = network_usage_domestic[['line_id','date']].groupby(['line_id']).min()
        network_mean = network_usage_domestic.groupby(['line_id']).mean()
        network_mean['voice_min_total'] = network_mean.apply(lambda row: row.voice_min_in + row.voice_min_out, axis=1)
        network_mean['mms_total'] = network_mean.apply(lambda row: row.mms_in + row.mms_out, axis=1)
        network_mean['sms_total'] = network_mean.apply(lambda row: row.sms_in + row.sms_out, axis=1)
        network_mean['voice_count_out'] = network_mean.apply(lambda row: row.voice_count_total - row.voice_count_in, axis=1)
        network_mean = network_mean.merge(upgrades, left_on='line_id', right_on='line_id')
        network_mean = network_mean.merge(first_date, left_on='line_id', right_on='line_id')
        network_mean = network_mean.merge(last_date, left_on='line_id', right_on='line_id')
        now = datetime.now()
        network_mean['length_of_connection'] = now - pd.to_datetime(network_mean['date_x'], format='%Y-%m-%d')
        network_mean['length_of_connection'] = network_mean['length_of_connection'].apply(lambda x: x.days)
        feature_list = ['line_id','hotspot_kb','total_kb','voice_count_total',
                        'voice_min_total','mms_total','sms_total','length_of_connection']
        if training:
            feature_list.append('upgrade')
        
        final_network = network_mean[feature_list]
        final_network['used_network'] = 1
        
        total = set(upgrades['line_id'])
        net = set(final_network['line_id'])
        left_out = list(total.difference(net))
        
        not_connect = final_network
        not_connect = not_connect[0:0]
        not_connect['line_id'] = left_out
        not_connect.fillna(0, inplace=True)
        
        if training:
            not_connect.drop(['upgrade'], axis=1, inplace = True)
        
        not_connect = not_connect.merge(upgrades, left_on='line_id', right_on='line_id')
        
        final_network = pd.concat([final_network, not_connect])
        final_network.drop(['date_observed'], axis=1, inplace = True)
        
        if training:
            final_network.upgrade.replace(('yes', 'no'), (1, 0), inplace=True)
        
        return final_network
    
    def merge(training:bool,clean_network, clean_customer, clean_lrp, clean_phone_info) -> pd.DataFrame: # nortons merge goes here
        # drop upgrade column
        if training:
            clean_network.drop(['upgrade'], axis=1, inplace=True)
            clean_customer.drop(['upgrade'], axis=1, inplace=True)
            clean_phone_info.drop(['upgrade'], axis=1, inplace=True)
        
        df = clean_lrp.merge(clean_customer, on='line_id', how='left')
        df = df.merge(clean_phone_info, on='line_id', how='left')
        df = df.merge(clean_network, on='line_id', how='left')
        return df

## Train Class

In [9]:
class Train:
    def run_training(training_data:pd.DataFrame) -> RandomForestClassifier:
        X, y = Train.split_x_y(training_data)
        model = Train.train_model(X, y)
        return model

    def split_x_y(training_data:pd.DataFrame) -> (np.array, np.array):
        X = training_data.drop([TARGET, ID], axis=1).values
        y = training_data[TARGET].values
        return X, y

    def train_model(X:np.array, y:np.array) -> RandomForestClassifier:
        model = RandomForestClassifier(
            bootstrap=False, max_depth=65,max_features='sqrt', 
            min_samples_leaf=1,min_samples_split=6,n_estimators=1260
            )
        model.fit(X, y)
        return model

## Predict Class

In [35]:
class Predict:
    def run_prediction(model:RandomForestClassifier, test_data:pd.DataFrame, training_data:pd.DataFrame) -> (np.array,np.array):
        reshaped_test_data = Predict.reshape_test_data(training_data, test_data)
        predictions = Predict.get_predictions(model, reshaped_test_data)
        return predictions

    def reshape_test_data(training_data:pd.DataFrame, test_data:pd.DataFrame) -> pd.DataFrame:
        training_data_cols = set(training_data.columns)
        test_data_cols = set(test_data.columns)
        cols_to_add = list(training_data_cols - test_data_cols)
        cols_to_drop = list(test_data_cols - training_data_cols)
        
        reshaped_test_data = test_data
        cols_to_drop = Helper.get_nonmissing_cols_and_warn(test_data, cols_to_drop)
        if len(cols_to_drop) > 0:
            reshaped_test_data = test_data.drop(cols_to_drop, axis=1)
        
        padding = pd.DataFrame(0, index=np.arange(len(reshaped_test_data)), columns=list(cols_to_add))
        padding.drop('upgrade', axis=1, inplace=True)
        
        reshaped_test_data = pd.concat([reshaped_test_data, padding], axis=1)
        return reshaped_test_data
        
    def get_predictions(model: RandomForestClassifier, test_data: pd.DataFrame) -> np.array:
        X = test_data.drop([ID], axis=1).values
        predictions = model.predict(X)
        return predictions

## Submit

In [55]:
class Submit:
    def make_submission(ids, predictions: np.array) -> None:
        submission_df = Submit.generate_submission_df(ids, predictions)
        Submit.save_csv(submission_df)
        print('Submitted')

    def generate_submission_df(ids, predictions: np.array) -> pd.DataFrame:
        submission_dict = {
            ID     : ids,
            TARGET : predictions
        }
        submissions_df = pd.DataFrame(submission_dict)
        return submissions_df
    
    def save_csv(submission_df):
        submission_df.to_csv(SUBMISSION_PATH,header=True,index=None)

## Pipeline Class

In [63]:
def main():
    '''
    This is the main training and testing pipeline for the model note that this draws from the results of 
    all of the data exploration from other scripts.
    '''
    train_clean_network = Preprocessing.preprocess_network_usage_domestic(training=True) # works!
    train_clean_customer = Preprocessing.preprocess_customer_status(training=True) # works!
    train_clean_lrp = Preprocessing.preprocess_lrp(training=True) # works !
    train_clean_phone_info = Preprocessing.preprocess_phone_info(training=True) # works !
    training_dataset = Preprocessing.merge(True, train_clean_network, train_clean_customer, train_clean_lrp, train_clean_phone_info)

    model = Train.run_training(training_dataset)
    
    test_clean_network = Preprocessing.preprocess_network_usage_domestic(training=False) # works!
    test_clean_customer = Preprocessing.preprocess_customer_status(training=False) # works!
    test_clean_lrp = Preprocessing.preprocess_lrp(training=False) # works !
    test_clean_phone_info = Preprocessing.preprocess_phone_info(training=False) # works !
    test_dataset = Preprocessing.merge(False, test_clean_network, test_clean_customer, test_clean_lrp, test_clean_phone_info)

    predictions = Predict.run_prediction(model, test_dataset, training_dataset)
    Submit.make_submission(test_dataset[ID].values, predictions)


In [64]:
main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

Submitted
