**Predicting Problematic Internet Use using Multiple Linear Regression**

The goal of this notebook is to predict problematic internet use for the Child Mind Institute's Kaggle competition.

This work was done by
Aaron Weinberg, Emilie Wiesner, and Dan Visscher at Ithaca College

All code that was used to develop this model is available on GitHub: https://github.com/aarondweinberg/CMI_problematic_internet_use

**Import Packages and Classes**

In [40]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os #reading and writing files
from imblearn.over_sampling import SMOTE
from sklearn.metrics import cohen_kappa_score
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

**Our Custom Imputer and Zone Computer**

This is an iterative imputer used to fill in missing values in the predictors.

This section also includes functions to compute "Zone" values for the FitnessGram and PAQ variables using documentation found online

In [2]:
## Define custom iterative imputer ("MICE")
class Custom_MICE_Imputer(BaseEstimator, TransformerMixin):
    # Class Constructor 
    # This allows you to initiate the class when you call Custom_KNN_Imputer
    def __init__(self):
        # I want to initiate each object with both a KNNImputer and StandardScaler object/method
        self.MICEImputer = IterativeImputer(max_iter=10, random_state=497)

    
    # For my fit method I'm just going to "steal" IterativeImputers's fit method using a curated collection of predictors
    def fit(self, Z, y = None):
        feature_list = Z.columns.tolist()
        if 'id' in feature_list:
            feature_list.remove('id')
        if 'sii' in feature_list:
            feature_list.remove('sii')
        feature_list = [x for x in feature_list if 'PCIAT' not in x]
        feature_list = [x for x in feature_list if 'Zone' not in x]
        feature_list = [x for x in feature_list if 'Season' not in x]
        Z = Z.reset_index(drop=True)
        self.MICEImputer.fit(Z[feature_list])
        return self
    
    # Now I want to transform the columns in feature list and return it with imputed values that have been un-transformed
    def transform(self, Z, y = None):
        feature_list = Z.columns.tolist()
        if 'id' in feature_list:
            feature_list.remove('id')
        if 'sii' in feature_list:
            feature_list.remove('sii')
        feature_list = [x for x in feature_list if 'PCIAT' not in x]
        feature_list = [x for x in feature_list if 'Zone' not in x]
        feature_list = [x for x in feature_list if 'Season' not in x]
        copy_Z = Z.copy()
        copy_Z = copy_Z.reset_index(drop=True)
        df2 = self.MICEImputer.transform(copy_Z[feature_list])
        df3 = pd.DataFrame(df2, columns=feature_list)
        copy_Z[feature_list]=copy_Z[feature_list].fillna(df3[feature_list])
        return copy_Z
    

####Now defining zone functions.

# Compute values for the 'FGC-FGC_SR_Zone' that is equal to 1 if any of the following are true:
# Basic_Demos-Sex==0 and FGC-FGC_SR >= 8
# Basic_Demos-Sex==1 and FGC-FGC_SR >= 9 and Basic_Demos-Age is between 5 and 10
# Basic_Demos-Sex==1 and FGC-FGC_SR >= 10 and Basic_Demos-Age is between 11 and 14
# Basic_Demos-Sex==1 and FGC-FGC_SR >= 12 and Basic_Demos-Age is at least 15
# Note that Basic_Demos-Sex is coded as 0=Male and 1=Female

def sitreachzone(sex, age, sr):
    try:
        if np.isnan(sr) or np.isnan(sex) or np.isnan(age):
            return np.nan
        elif sex == 0 and sr>=8:
            return 1
        elif sex == 1 and age >= 15 and sr >= 12:
            return 1
        elif sex == 1 and age >= 11 and sr >= 10:
            return 1
        elif sex == 1 and age >= 5 and sr >= 9:
            return 1
        else:
            return 0
    except:
        return np.nan

# Compute values for the 'FGC-FGC_CU_Zone' that is equal to 1 if any of the following are true:
# Basic_Demos-Sex==0 and FGC-FGC_CU >= 2 and Basic_Demos-Age is between 5 and 6
# Basic_Demos-Sex==0 and FGC-FGC_CU >= 4 and Basic_Demos-Age is 7
# Basic_Demos-Sex==0 and FGC-FGC_CU >= 6 and Basic_Demos-Age is 8
# Basic_Demos-Sex==0 and FGC-FGC_CU >= 9 and Basic_Demos-Age is 9
# Basic_Demos-Sex==0 and FGC-FGC_CU >= 12 and Basic_Demos-Age is 10
# Basic_Demos-Sex==0 and FGC-FGC_CU >= 15 and Basic_Demos-Age is 11
# Basic_Demos-Sex==0 and FGC-FGC_CU >= 18 and Basic_Demos-Age is 12
# Basic_Demos-Sex==0 and FGC-FGC_CU >= 21 and Basic_Demos-Age is 13
# Basic_Demos-Sex==0 and FGC-FGC_CU >= 24 and Basic_Demos-Age is at least 14
# Basic_Demos-Sex==1 and FGC-FGC_CU >= 2 and Basic_Demos-Age is between 5 and 6
# Basic_Demos-Sex==1 and FGC-FGC_CU >= 4 and Basic_Demos-Age is 7
# Basic_Demos-Sex==1 and FGC-FGC_CU >= 6 and Basic_Demos-Age is 8
# Basic_Demos-Sex==1 and FGC-FGC_CU >= 9 and Basic_Demos-Age is 9
# Basic_Demos-Sex==1 and FGC-FGC_CU >= 12 and Basic_Demos-Age is 10
# Basic_Demos-Sex==1 and FGC-FGC_CU >= 15 and Basic_Demos-Age is 11
# Basic_Demos-Sex==1 and FGC-FGC_CU >= 18 and Basic_Demos-Age is at least 12

def curlupzone(sex, age, cu):
    try:
        if np.isnan(sex) or np.isnan(age) or np.isnan(cu):
            return np.nan
        elif sex == 0:
            if (age >= 14 and cu >= 24) or (age == 13 and cu >= 21) or (age == 12 and cu >= 18) or (age == 11 and cu >= 15) or (age == 10 and cu >= 12) or (age == 9 and cu >= 9) or (age == 8 and cu >= 6) or (age == 7 and cu >= 4) or (age <= 6 and cu >= 2):
                return 1
            else:
                return 0
        elif sex == 1:
            if (age >= 12 and cu >= 18) or (age == 11 and cu >= 15) or (age == 10 and cu >= 12) or (age == 9 and cu >= 9) or (age == 8 and cu >= 6) or (age == 7 and cu >= 4) or (age <= 6 and cu >= 2):
                return 1
            else:
                return 0
    except:
        return np.nan

# Compute values for the 'FGC-FGC_PU_Zone' that is equal to 1 if any of the following are true:
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 3 and Basic_Demos-Age is between 5 and 6
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 4 and Basic_Demos-Age is 7
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 5 and Basic_Demos-Age is 8
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 6 and Basic_Demos-Age is 9
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 7 and Basic_Demos-Age is 10
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 8 and Basic_Demos-Age is 11
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 10 and Basic_Demos-Age is 12
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 12 and Basic_Demos-Age is 13
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 14 and Basic_Demos-Age is 14
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 16 and Basic_Demos-Age is 15
# Basic_Demos-Sex==0 and FGC-FGC_PU >= 18 and Basic_Demos-Age is at least 16
# Basic_Demos-Sex==1 and FGC-FGC_PU >= 3 and Basic_Demos-Age is between 5 and 6
# Basic_Demos-Sex==1 and FGC-FGC_PU >= 4 and Basic_Demos-Age is 7
# Basic_Demos-Sex==1 and FGC-FGC_PU >= 5 and Basic_Demos-Age is 8
# Basic_Demos-Sex==1 and FGC-FGC_PU >= 6 and Basic_Demos-Age is 9
# Basic_Demos-Sex==1 and FGC-FGC_PU >= 7 and Basic_Demos-Age is at least 10

def pullupzone(sex, age, pu):
    try:
        if np.isnan(sex) or np.isnan(age) or np.isnan(pu):
            return np.nan
        elif sex == 0:
            if (age >= 16 and pu >= 18) or (age == 15 and pu >= 16) or (age == 14 and pu >= 14) or (age == 13 and pu >= 12) or (age == 12 and pu >= 10) or (age == 11 and pu >= 8) or (age == 10 and pu >= 7) or (age == 9 and pu >= 6) or (age == 8 and pu >= 5) or (age == 7 and pu >= 4) or (age <= 6 and pu >= 2):
                return 1
            else:
                return 0
        elif sex == 1:
            if (age >= 10 and pu >= 7) or (age == 9 and pu >= 6) or (age == 8 and pu >= 5) or (age == 7 and pu >= 4) or (age <= 6 and pu >= 3):
                return 1
            else:
                return 0
    except:
        return np.nan

# Comtlte values for the 'FGC-FGC_TL_Zone' that is equal to 1 if any of the following are true:
# FGC-FGC_TL >= 6 and Basic_Demos-Age is between 5 and 9
# FGC-FGC_TL >= 9 and Basic_Demos-Age is at least 10

def tlzone(age, tl):
    try:
        if np.isnan(tl) or np.isnan(age):
            return np.nan
        elif (age >= 10 and tl >= 9) or (age <= 9 and tl >= 6):
            return 1
        else:
            return 0
    except:
        return np.nan

# Comtlte values for the 'PAQ_MVPA' that is equal to 1 if any of the following are true:
# PAQ_Total >= 2.73 and Basic_Demos-Age is between 5 and 13
# PAQ_Total >= 2.75 and Basic_Demos-Age is at least 14

def paqzone(age, paq):
    try:
        if np.isnan(paq) or np.isnan(age):
            return np.nan
        elif (age >= 14 and paq >= 2.75) or (age <= 13 and paq >= 2.73):
            return 1
        else:
            return 0
    except:
        return np.nan

###Custom encoder function
# 
def zone_encoder(df):
    df_copy = df.copy()

    if 'FGC-FGC_SR_Zone' in df_copy.columns:
        if 'Basic_Demos-Age' in df_copy.columns and 'Basic_Demos-Sex' in df_copy.columns and 'FGC-FGC_SR' in df_copy.columns:
            df_copy['FGC-FGC_SR_Zone'] = df_copy.apply(lambda x: sitreachzone(x['Basic_Demos-Sex'], x['Basic_Demos-Age'], x['FGC-FGC_SR']), axis=1)
        else:
            df_copy['FGC-FGC_SR_Zone'] = df_copy['FGC-FGC_SR_Zone'].fillna(df_copy['FGC-FGC_SR_Zone'].mean())
    if 'FGC-FGC_CU_Zone' in df_copy.columns:
        if 'Basic_Demos-Age' in df_copy.columns and 'Basic_Demos-Sex' in df_copy.columns and 'FGC-FGC_CU' in df_copy.columns:
            df_copy['FGC-FGC_CU_Zone'] = df_copy.apply(lambda x: curlupzone(x['Basic_Demos-Sex'], x['Basic_Demos-Age'], x['FGC-FGC_CU']), axis=1)
        else:
            df_copy['FGC-FGC_CU_Zone'] = df_copy['FGC-FGC_CU_Zone'].fillna(df_copy['FGC-FGC_CU_Zone'].mean())
    if 'FGC-FGC_PU_Zone' in df_copy.columns:
        if 'Basic_Demos-Age' in df_copy.columns and 'Basic_Demos-Sex' in df_copy.columns and 'FGC-FGC_PU' in df_copy.columns:
            df_copy['FGC-FGC_PU_Zone'] = df_copy.apply(lambda x: pullupzone(x['Basic_Demos-Sex'], x['Basic_Demos-Age'], x['FGC-FGC_PU']), axis=1)
        else:
            df_copy['FGC-FGC_PU_Zone'] = df_copy['FGC-FGC_PU_Zone'].fillna(df_copy['FGC-FGC_PU_Zone'].mean())
    if 'FGC-FGC_TL_Zone' in df_copy.columns:
        if 'Basic_Demos-Age' in df_copy.columns and 'FGC-FGC_TL' in df_copy.columns:
            df_copy['FGC-FGC_TL_Zone'] = df_copy.apply(lambda x: tlzone(x['Basic_Demos-Age'], x['FGC-FGC_TL']), axis=1)
        else:
            df_copy['FGC-FGC_TL_Zone'] = df_copy['FGC-FGC_TL_Zone'].fillna(df_copy['FGC-FGC_TL_Zone'].mean())
    if 'PAQ_Zone' in df_copy.columns:
        if 'Basic_Demos-Age' in df_copy.columns and 'PAQ_Total' in df_copy.columns:
            df_copy['PAQ_Zone'] = df_copy.apply(lambda x: tlzone(x['Basic_Demos-Age'], x['PAQ_Total']), axis=1)
        else:
            df_copy['PAQ_Zone'] = df_copy.apply(lambda x: paqzone(x['Basic_Demos-Age'], x['PAQ_Total']), axis=1)
    return df_copy   

**Code the Accelerometer Data**

We segment each actigraphy file into 5-minute bouts

For each bout, we indicate whether the mean ENMO value is above thresholds identified in the research literature

We also identify how often the anglez was positive

Then we create a dataframe that records these values for each participant ID

In [3]:
#ENMO cutoffs in mg for MVPA
mvpa_cutoff1 = 0.192
mvpa_cutoff2 = 0.110

# Number of 'active' bouts required for a day to count as 'active'
active_bout_cutoff = 150

# Specify the length of the bouts
boutlength = '5min'

# Maximum number of 5-minute bouts that can be imputed as zeroes to account for the accelerometer not collected data when at rest
impute_max = 6

# Minimum number of 5-second intervals (within a 5-minute bout) that need to have data for the bout to be counted
impute_sec_min = 29

# Create a new data frame with columns 'ID', 'ENMO_Avg_Active_Days_MVPA192', 'ENMO_Avg_Active_Days_MVPA110', and 'Positive_Anglez_Active_Days
accel = pd.DataFrame(columns=['ID', 'ENMO_Avg_Active_Days_MVPA192', 'ENMO_Avg_Active_Days_MVPA110', 'Positive_Anglez_Active_Days'])


# Walk through the files in the directory
# For testing purposes, we'll just do this for test data
for dirname, _, filenames in os.walk('/kaggle/input/child-mind-institute-problematic-internet-use'):
#for dirname, _, filenames in os.walk('/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        #Check to see if filename is a parquet file; if it is, read the file and extract the ID from the directory name
        if filename.endswith('.parquet'):
            data = pd.read_parquet(os.path.join(dirname, filename))
            id = dirname[-8:]

            # Remove any rows where the variable non-wear_flag is nonzero
            data = data[data['non-wear_flag'] == 0]

            # Change the time_of_day variable to a datetime and make it into the index
            data['dt'] = pd.to_datetime(data['time_of_day'])
            data['dt_mod'] = data['dt'] + pd.to_timedelta(data['relative_date_PCIAT'], unit='D')
            data.set_index('dt_mod', inplace=True)

            # Create a new data frame that counts the number of valid data points within each 5-minute ('boutlength') interval
            # This will later be used to exclude intervals that had fewer than 30 (out of 60) valid data points
            data['count'] = 1
            number_of_data_points = data.resample(boutlength).agg({'count':'sum'})
            data.drop('count', axis=1, inplace=True)

            # Create 5-minute "bouts" of averaged data and incorporate the number of valid data points within each interval as a new variable 'count'
            data_resampled_5min = data.resample(boutlength).mean()
            data_resampled_5min = data_resampled_5min.merge(number_of_data_points, left_index=True, right_index=True)

            # Some of the accelerometers stopped collecting data if they were stationary (but still on/worn)
            # This next section is an attempt to identify and fill in these seemingly missing values with "0" for the enmo value
            # It does this by identifying the length of each sequence of NaN values and filling them with 0 if thery are at most 30 minutes long
            # This also restricts this process to 5-minute bouts that had data for at least 30 of the 5-second-intervals within the bout
            data_resampled_5min['enmogroup'] = data_resampled_5min['enmo'].notna().cumsum()
            enmogroupcount = data_resampled_5min.groupby(by=["enmogroup"]).size().to_frame()
            enmogroupcount = enmogroupcount.rename(columns={0: 'enmogroupsize'})
            data_resampled_5min = data_resampled_5min.merge(enmogroupcount, how='left', left_on='enmogroup', right_index=True)
            data_resampled_5min['smallinterval'] = (data_resampled_5min['enmogroupsize'] < impute_max+2) & (data_resampled_5min['count']>impute_sec_min)
            data_resampled_5min['filled_enmo'] = np.where(data_resampled_5min.smallinterval, data_resampled_5min.enmo.fillna(0), data_resampled_5min.enmo)

            # Also fill in only anglez values where the count is large enough
            data_resampled_5min['filled_anglez'] = np.where(data_resampled_5min['count']>impute_sec_min, data_resampled_5min.anglez, np.nan)

            # The next code chunk will create a new data frame that lists the total number of valid bouts for the participant
            # and will count the number of bouts with filled_enmo values over a particular threshold
            # and then count the number of bouts with positive anglez values

            # Start by counting the number of valid bouts in each day as a data frame
            boutcount_filled = data_resampled_5min.groupby(data_resampled_5min.index.date).count()['filled_enmo'].to_frame()
            boutcount_filled = boutcount_filled.rename(columns={'filled_enmo': 'valid_bouts'})

            # Count the number of bouts in each day with filled_enmo at least mvpa_cutoff1
            boutcount_MVPA1 = data_resampled_5min[data_resampled_5min['filled_enmo'] >= mvpa_cutoff1].groupby(data_resampled_5min[data_resampled_5min['filled_enmo'] >= mvpa_cutoff1].index.date).count()['filled_enmo'].to_frame()
            boutcount_MVPA1 = boutcount_MVPA1.rename(columns={'filled_enmo': 'MVPA_bouts_over_cutoff1'})
            boutcount = boutcount_filled.merge(boutcount_MVPA1, how='left', left_index=True, right_index=True)

            # Count the number of bouts in each day with filled_enmo at least mvpa_cutoff2
            boutcount_MVPA2 = data_resampled_5min[data_resampled_5min['filled_enmo'] >= mvpa_cutoff2].groupby(data_resampled_5min[data_resampled_5min['filled_enmo'] >= mvpa_cutoff2].index.date).count()['filled_enmo'].to_frame()
            boutcount_MVPA2 = boutcount_MVPA2.rename(columns={'filled_enmo': 'MVPA_bouts_over_cutoff2'})
            boutcount = boutcount.merge(boutcount_MVPA2, how='left', left_index=True, right_index=True)

            # Count the number of bouts in each day with anglez at least 0
            boutcount_anglez = data_resampled_5min[data_resampled_5min['filled_anglez'] > 0].groupby(data_resampled_5min[data_resampled_5min['filled_anglez'] > 0].index.date).count()['filled_anglez'].to_frame()
            boutcount_anglez = boutcount_anglez.rename(columns={'filled_anglez': 'Positive_Anglez_Bouts'})
            boutcount = boutcount.merge(boutcount_anglez, how='left', left_index=True, right_index=True)

            # Compute a new variable 'included_day' to be True if valid_bouts is at least active_bout_cutoff
            boutcount['included_day'] = boutcount['valid_bouts'] >= active_bout_cutoff

            # Compute the mean of MVPA bouts over each cutoff
            # Note: We are only using the "included day" data in our final analysis, so we'll restrict the output accordingly
            MVPA_mean1 = boutcount[boutcount['included_day'] == True]['MVPA_bouts_over_cutoff1'].mean()
            MVPA_mean2 = boutcount[boutcount['included_day'] == True]['MVPA_bouts_over_cutoff2'].mean()
            Anglez_mean1 = boutcount[boutcount['included_day'] == True]['Positive_Anglez_Bouts'].mean()

            new_row = pd.DataFrame({"ID": [id], "ENMO_Avg_Active_Days_MVPA192": [MVPA_mean1], "ENMO_Avg_Active_Days_MVPA110": [MVPA_mean2], "Positive_Anglez_Active_Days": [Anglez_mean1]})

            # Replace any NaN values in df with 0
            new_row.fillna(0, inplace=True)
            
            #Create a new row in accel where 'ID'=id, 'ENMO_Avg_Active_Days_MVPA192'=MVPA_mean1, 'ENMO_Avg_Active_Days_MVPA110'=MVPA_mean2, and 'Positive_Anglez_Active_Days'=Anglez_mean1
            accel = pd.concat([accel, new_row], ignore_index=True)

  accel = pd.concat([accel, new_row], ignore_index=True)


**Load the Data and Join with Accelerometer Data**

We noticed that the versions of these files stored in /kaggle/input included some duplicate rows, so we drop these.

In [30]:
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')

# Join train/test and accel  on the 'id' column and accel on the 'ID' column
test_original = test.join(accel.set_index('ID'), on='id', how='left')
train_original = train.join(accel.set_index('ID'), on='id', how='left')

# Remove duplicate rows
test = test_original.drop_duplicates()
train = train_original.drop_duplicates()


**Data Cleaning**

This section creates some new variables
* Sit & Reach is the average of the "Left" and "Right" Sit & Reach predictors
* A Zone variable for the combined Sit & Reach variable
* Combining the PAQ-A and PAQ-C into a single PAQ predictor
* Creating a "Zone" for PAQ based on cutoffs in the research literature
* Combining the minutes and seconds of the Fitness Endurance variable
* Dropping SDS_T, which seems to be a duplicate of SDS
* Removing negative values from numerical predictors
* Removing 0 values from physical predictors
* Removing outliers

In [31]:
###################################
#Now we do a variety of data cleaning.
# Create a new variable 'FGC-FGC_SR' that is the mean of FGC-FGC_SRL and FGC-FGC_SRR
train['FGC-FGC_SR'] = train[['FGC-FGC_SRL', 'FGC-FGC_SRR']].mean(axis=1)
test['FGC-FGC_SR'] = test[['FGC-FGC_SRL', 'FGC-FGC_SRR']].mean(axis=1)

# Remove the old sit & reach variables
train = train.drop(columns=['FGC-FGC_SRL', 'FGC-FGC_SRR', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR_Zone'])
test = test.drop(columns=['FGC-FGC_SRL', 'FGC-FGC_SRR', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR_Zone'])

# Create a new variable 'FGC-FGC_SR_Zone' that is equal to 1 if any of the following are true:
# Basic_Demos-Sex==0 and FGC-FGC_SR >= 8
# Basic_Demos-Sex==1 and FGC-FGC_SR_Zone >= 9 and Basic_Demos-Age is between 5 and 10
# Basic_Demos-Sex==1 and FGC-FGC_SR_Zone >= 10 and Basic_Demos-Age is between 11 and 14
# Basic_Demos-Sex==1 and FGC-FGC_SR_Zone >= 12 and Basic_Demos-Age is at least 15

# One way to do this is to define a function that would take sex, age, and SR value as inputs and output 1 or 0
def sitreachzone(sex, age, sr):
    try:
        if np.isnan(sr):
            return np.nan
        elif sex == 0 and sr>=8:
            return 1
        elif sex == 1 and age >= 15 and sr >= 12:
            return 1
        elif sex == 1 and age >= 11 and sr >= 10:
            return 1
        elif sex == 1 and age >= 5 and sr >= 9:
            return 1
        else:
            return 0
    except:
        return np.nan

# Apply sitreachzone to create a new column using the columns Basic_Demos-Sex, Basic_Demos-Age, and FGC-FGC_SR as inputs
train['FGC-FGC_SR_Zone'] = train.apply(lambda x: sitreachzone(x['Basic_Demos-Sex'], x['Basic_Demos-Age'], x['FGC-FGC_SR']), axis=1)
test['FGC-FGC_SR_Zone'] = test.apply(lambda x: sitreachzone(x['Basic_Demos-Sex'], x['Basic_Demos-Age'], x['FGC-FGC_SR']), axis=1)

# Create a new variable that is 1 when PAQA/C Total is at least 2.75/2.73, 0 if it's less than these cutoffs, and NaN if PAQA/C is NaN
train['PAQA_Zone'] = np.where(train['PAQ_A-PAQ_A_Total']>=2.75, 1, 0)
train['PAQA_Zone'] = np.where(train['PAQ_A-PAQ_A_Total'].isnull(), np.nan, train['PAQA_Zone'])
train['PAQC_Zone'] = np.where(train['PAQ_C-PAQ_C_Total']>=2.73, 1, 0)
train['PAQC_Zone'] = np.where(train['PAQ_C-PAQ_C_Total'].isnull(), np.nan, train['PAQC_Zone'])
test['PAQA_Zone'] = np.where(test['PAQ_A-PAQ_A_Total']>=2.75, 1, 0)
test['PAQA_Zone'] = np.where(test['PAQ_A-PAQ_A_Total'].isnull(), np.nan, test['PAQA_Zone'])
test['PAQC_Zone'] = np.where(test['PAQ_C-PAQ_C_Total']>=2.73, 1, 0)
test['PAQC_Zone'] = np.where(test['PAQ_C-PAQ_C_Total'].isnull(), np.nan, test['PAQC_Zone'])

# Create new variables that merge the three PAQA/C variables
train['PAQ_Total']=train['PAQ_C-PAQ_C_Total']
train.loc[train['PAQ_Total'].isnull(),'PAQ_Total']=train['PAQ_A-PAQ_A_Total']
test['PAQ_Total']=test['PAQ_C-PAQ_C_Total']
test.loc[test['PAQ_Total'].isnull(),'PAQ_Total']=test['PAQ_A-PAQ_A_Total']

train['PAQ_Season']=train['PAQ_C-Season']
train.loc[train['PAQ_Season'].isnull(),'PAQ_Season']=train['PAQ_A-Season']
test['PAQ_Season']=test['PAQ_C-Season']
test.loc[test['PAQ_Season'].isnull(),'PAQ_Season']=test['PAQ_A-Season']

train['PAQ_Zone']=train['PAQC_Zone']
train.loc[train['PAQ_Zone'].isnull(),'PAQ_Zone']=train['PAQA_Zone']
test['PAQ_Zone']=test['PAQC_Zone']
test.loc[test['PAQ_Zone'].isnull(),'PAQ_Zone']=test['PAQA_Zone']

# Drop the PAQ variables we no longer need
train=train.drop(columns=['PAQ_C-PAQ_C_Total', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_A-Season', 'PAQA_Zone', 'PAQC_Zone'])
test=test.drop(columns=['PAQ_C-PAQ_C_Total', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_A-Season', 'PAQA_Zone', 'PAQC_Zone'])

# Combine the minutes and seconds of Fitness_Endurance into a single number (total number of seconds)
train['Fitness_Endurance_Total_Time_Sec'] = train['Fitness_Endurance-Time_Mins'] * 60 + train['Fitness_Endurance-Time_Sec']
test['Fitness_Endurance_Total_Time_Sec'] = test['Fitness_Endurance-Time_Mins'] * 60 + test['Fitness_Endurance-Time_Sec']

train=train.drop(columns=['Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec'])
test=test.drop(columns=['Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec'])

# Remove the SDS-SDS_Total_T variable from train
train=train.drop(columns=['SDS-SDS_Total_T'])
test=test.drop(columns=['SDS-SDS_Total_T'])

# Remove the FGC-FGC_GSND, FGC-FGC_GSND_Zone, FGC-FGC_GSD, and FGC-FGC_GSD_Zone variables
train=train.drop(columns=['FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone'])
test=test.drop(columns=['FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone'])

# Create a list of numerical columns of type float. Note that these columns include the "Zone" variables which are really categorical/ordinal:
float_columns = test.select_dtypes(include=['float']).columns

# Change negative values to NaN
train[train[float_columns] < 0] = np.nan
test[test[float_columns] < 0] = np.nan

# For each variable that starts with 'Physical-' replace any values that are 0 with NaN
for column in train.columns:
    if column.startswith('Physical-'):
        train[column] = train[column].replace(0, np.nan)
for column in test.columns:
    if column.startswith('Physical-'):
        test[column] = test[column].replace(0, np.nan)

# For each column in float_columns, identify entries that are 5 standard deviations above or below the mean and replace them with NaN
for column in float_columns:
    train[column] = train[column].mask(train[column] > train[column].mean() + 5 * train[column].std())
    train[column] = train[column].mask(train[column] < train[column].mean() - 5 * train[column].std())
    test[column] = test[column].mask(test[column] > test[column].mean() + 5 * test[column].std())
    test[column] = test[column].mask(test[column] < test[column].mean() - 5 * test[column].std())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['FGC-FGC_SR'] = train[['FGC-FGC_SRL', 'FGC-FGC_SRR']].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['FGC-FGC_SR'] = test[['FGC-FGC_SRL', 'FGC-FGC_SRR']].mean(axis=1)


In [32]:
# Rename the files for future code compatibility

train_cleaned = train
test_cleaned= test

**Feature Reduction**

We noticed a large correlation (>0.98) between some predictors. We'll remove these, as well as the season variables, which we won't use in our predictions.

In [33]:
# Remove some variables we won't need

# Remove the variable 'BIA-BIA_BMI'
train_cleaned = train_cleaned.drop(['BIA-BIA_BMI'], axis=1)

# Remove FGC-FGC_CU_Zone, FGC-FGC_PU_Zone, and FGC-FGC_TL_Zone
train_cleaned = train_cleaned.drop(['FGC-FGC_CU_Zone', 'FGC-FGC_PU_Zone', 'FGC-FGC_TL_Zone'], axis=1)

# Remove the following variables from train: BIA-BIA_BMR, BIA-BIA_TBW, BIA-BIA_ECW, BIA-BIA_LDM, BIA-BIA_ICW, BIA-BIA_SMM, BIA-BIA_DEE, BIA-BIA_LST, and BIA-BIA_BMC
train_cleaned=train_cleaned.drop(columns=['BIA-BIA_BMR', 'BIA-BIA_TBW', 'BIA-BIA_ECW', 'BIA-BIA_LDM', 'BIA-BIA_ICW', 'BIA-BIA_SMM', 'BIA-BIA_DEE', 'BIA-BIA_LST', 'BIA-BIA_BMC'])

# Remove the Fitness_Endurance-Max_Stage variable (based on previous exploration)
train_cleaned = train_cleaned.drop(['Fitness_Endurance-Max_Stage'], axis=1)

# Remove all variables with Season in their name
train_cleaned = train_cleaned.loc[:,~train_cleaned.columns.str.contains('Season')]

#We also remove these variables from the test data.
test_cleaned = test_cleaned.drop(['BIA-BIA_BMI'], axis=1)
test_cleaned = test_cleaned.drop(['FGC-FGC_CU_Zone', 'FGC-FGC_PU_Zone', 'FGC-FGC_TL_Zone'], axis=1)
test_cleaned = test_cleaned.drop(columns=['BIA-BIA_BMR', 'BIA-BIA_TBW', 'BIA-BIA_ECW', 'BIA-BIA_LDM', 'BIA-BIA_ICW', 'BIA-BIA_SMM', 'BIA-BIA_DEE', 'BIA-BIA_LST', 'BIA-BIA_BMC'])
test_cleaned = test_cleaned.drop(['Fitness_Endurance-Max_Stage'], axis=1)
test_cleaned = test_cleaned.loc[:,~test_cleaned.columns.str.contains('Season')]

**Impute Missing Outcome Values**

Some participants have missing PCIAT scores. We'll impute these using KNN

In [34]:
#First we'll create a list of columns that hold the PCIAT values
pciats = [col for col in train_cleaned.columns if 'PCIAT' in col]
pciats.remove('PCIAT-PCIAT_Total')

#Create a new copy of the data frame for imputation. Remove rows where all values in pciats are NaN
train_imp_KNN = train_cleaned.copy()
train_imp_KNN['pciatsnotna_sum'] = train_imp_KNN[pciats].notna().sum(axis=1)
train_imp_KNN = train_imp_KNN[train_imp_KNN['pciatsnotna_sum'] != 0]
train_imp_KNN.reset_index(drop=True, inplace=True)

#Remove the pciatsnotna_sum variable
train_imp_KNN.drop(columns=['pciatsnotna_sum'], inplace=True)

#Identify the rows with at least one NaN value
train_imp_KNN['nan_rows'] = train_imp_KNN[pciats].isnull().any(axis=1)

# Create a copy of train_imp_KNN
train_imp_KNN2 = train_imp_KNN.copy()
# define imputer
Number_Neighbors=5
imputer = KNNImputer(n_neighbors=Number_Neighbors, weights='uniform', metric='nan_euclidean')

#The imputer.fit_transform function outputs a numpy array. So first I do the fitting, then convert the output back to a pandas dataframe.

imputations=imputer.fit_transform(train_imp_KNN[pciats])
df2 = pd.DataFrame(imputations, columns=pciats)

#Next take the result and insert into the original dataframe. 

train_imp_KNN[pciats]=train_imp_KNN[pciats].fillna(df2[pciats])

#Remove the nan_rows variable
train_imp_KNN.drop(columns=['nan_rows'], inplace=True)

#Recalculate the PCIAT total score.
train_imp_KNN['PCIAT-PCIAT_Total'] = train_imp_KNN[pciats].sum(axis=1)

#Now we can calculate a new sii score with the imputed values. 
bins = [0, 30, 49,79,101]
labels = [0,1,2,3]
#train_imp_KNN['sii'] = pd.cut(train_imp_KNN['PCIAT-PCIAT_Total'], bins=bins, labels=labels, right=False)
train_imp_KNN['sii'] = np.digitize(train_imp_KNN['PCIAT-PCIAT_Total'], bins=bins)-1

**The Model**

The process:
1. Create a list of predictors
2. Impute missing predictor values for the predictor variables
3. Use SMOTE to oversample the minority class sii=3
4. Use a tuned gradient boosting regressor to predict PCIAT scores
5. Use a "tuned" set of bins to convert PCIAT scores to sii scores
6. Output the prediction

In [41]:
#Create an initial list of predictor columns
predictors = train_cleaned.columns.tolist()
if 'id' in predictors:
    predictors.remove('id')
if 'sii' in predictors:
    predictors.remove('sii')
predictors = [x for x in predictors if 'PCIAT' not in x]
predictors = [x for x in predictors if 'Season' not in x]

# Create an augmented list that will be used for oversampling
predictors_plus = predictors + ['PCIAT-PCIAT_Total']

# Create a list of "key features" based on a previously-run random forest
keyfeatures = ['Basic_Demos-Age',
 'Physical-Height',
 'PreInt_EduHx-computerinternet_hoursday',
 'BIA-BIA_FFM',
 'SDS-SDS_Total_Raw',
 'Physical-Weight',
 'ENMO_Avg_Active_Days_MVPA110',
 'FGC-FGC_CU']


# Impute missing values
mice = Custom_MICE_Imputer()
train_cleaned_imputed = mice.fit_transform(train_imp_KNN)
train_cleaned_imputed = zone_encoder(train_cleaned_imputed)
test_cleaned_imputed = mice.fit_transform(test_cleaned)
test_cleaned_imputed = zone_encoder(test_cleaned_imputed)

# Set up SMOTE to create 148 (rather than 37) instances of sii=3
siiratios = {0: 1530, 1: 765, 2:403, 3:148}
oversample = SMOTE(sampling_strategy=siiratios)

# Oversample with SMOTE
X, y = oversample.fit_resample(train_cleaned_imputed[predictors_plus], train_cleaned_imputed['sii'])

# Create the MLR model
mlr_key_pipe = Pipeline([
                ('selector', ColumnTransformer([('selector', 'passthrough', keyfeatures)], remainder="drop")),
                ('linear', LinearRegression())])

# Fit and make predictions
mlr_key_pipe.fit(X[predictors], X['PCIAT-PCIAT_Total'])
pred = mlr_key_pipe.predict(test_cleaned_imputed[predictors])

# "Tuned" bins.
bins_mod = [0, 27, 39, 79, 100]
pred_bin_mod = np.digitize(pred, bins_mod)-1

# Add predictions to test_cleaned_imputed
test_cleaned_imputed['sii'] = pred_bin_mod

# Create final output df
predicted_sii = test_cleaned_imputed[['id','sii']]



In [36]:
# Save the predicted sii values
predicted_sii.to_csv('submission.csv', index=False)