# In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler


def optimize_floats(dataframe):
    '''
    Downcast the float64 columns of `dataframe` with
    `pd.to_numeric(..., downcast='float')`.

    The frame is updated in place and the very same object is returned.
    Columns of any other dtype are left untouched.
    '''
    wide_cols = list(dataframe.select_dtypes(include=['float64']).columns)
    narrowed = dataframe[wide_cols].apply(pd.to_numeric, downcast='float')
    dataframe[wide_cols] = narrowed
    return dataframe


def read_data(path="dataset/train.csv"):
    '''
    Load the CSV at `path` and pass it through `optimize_floats` to shrink
    its float columns.

    Returns a tuple `(data, data_orig)`:
      * `data_orig` - the frame as read (after float downcasting);
      * `data`      - the same rows without the "Recording_time_ID" column.
    '''
    full_frame = optimize_floats(pd.read_csv(path))
    trimmed = full_frame.drop(columns=["Recording_time_ID"])
    return trimmed, full_frame


def label_encode(to_encode, train, inv=False):
    '''
    Convert the string "Action" labels (e.g. "Response-1") to integer
    codes, or convert integer codes back to the original labels.

    The encoder is fitted on `train["Action"]` on every call, so the same
    `train` frame must be supplied for encoding and for decoding.

    Parameters
    ----------
    to_encode : when `inv` is False, a DataFrame with an "Action" column;
        that column is overwritten IN PLACE with integer codes.
        When `inv` is True, an array-like of integer codes.
    train : DataFrame whose "Action" column defines the label vocabulary.
    inv : if True, decode instead of encode.

    Returns
    -------
    The (mutated) `to_encode` frame when encoding, or an array of the
    original labels when decoding.
    '''
    label_encoder = LabelEncoder()
    # LabelEncoder operates on 1-D label arrays. Selecting the column as a
    # Series (rather than a one-column frame) avoids handing it an (n, 1)
    # array, which scikit-learn only accepts with a DataConversionWarning.
    label_encoder.fit(train["Action"].values)

    if inv:
        return label_encoder.inverse_transform(to_encode)

    codes = label_encoder.transform(to_encode["Action"].values)
    # Assign through the plain column key: setting a 1-D array through a
    # one-element column *list* is shape-sensitive in pandas (the array's
    # last axis is matched against the number of listed columns).
    to_encode["Action"] = codes.astype('int')
    return to_encode


def _unit_rows(matrix):
    '''Scale each row of a 2-D float array to unit L2 norm; all-zero rows stay zero.'''
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Dividing by 1 keeps an all-zero row at zero, so its similarity with
    # anything is 0 instead of NaN.
    norms[norms == 0.0] = 1.0
    return matrix / norms


def feature_creation(data, c1, c2):
    '''
    Row-wise cosine similarity between two groups of columns,
    e.g. left-hand vs right-hand coordinates.

    Parameters
    ----------
    data : DataFrame holding the columns named in `c1` and `c2`.
    c1, c2 : lists of column names; both lists must have the same length.

    Returns
    -------
    A single-column DataFrame (column label 0, default integer index) with
    one similarity value per row of `data`, computed in float64.

    Only the similarity of row i of `c1` with row i of `c2` is computed.
    Building the full n x n pairwise matrix just to read its diagonal costs
    O(n^2) time and memory for the same values. NaN inputs yield NaN.
    '''
    left = np.asarray(data[c1], dtype=float)
    right = np.asarray(data[c2], dtype=float)

    # Dot product of the two unit vectors in each row == cosine similarity.
    row_similarity = np.einsum('ij,ij->i', _unit_rows(left), _unit_rows(right))
    return pd.DataFrame(row_similarity)
    
def feature_generator(data):
    '''
    Add one similarity feature per pair of related x/y/z column triples.

    Each pair is passed to `feature_creation`; the resulting column is
    named after its dictionary key and placed in front of the columns
    accumulated so far, so later pairs end up further to the left.
    The input frame itself is not modified.
    '''
    # Column triples, named after the prefixes used in the feature keys.
    hd = ['hx', 'hy', 'hz']
    sp = ['sx', 'sy', 'sz']
    lh = ['lhx', 'lhy', 'lhz']
    rh = ['rhx', 'rhy', 'rhz']
    lw = ['lwx', 'lwy', 'lwz']
    rw = ['rwx', 'rwy', 'rwz']
    vlh = ['vlhx', 'vlhy', 'vlhz']
    vrh = ['vrhx', 'vrhy', 'vrhz']
    vlw = ['vlwx', 'vlwy', 'vlwz']
    vrw = ['vrwx', 'vrwy', 'vrwz']
    alh = ['alhx', 'alhy', 'alhz']
    arh = ['arhx', 'arhy', 'arhz']
    alw = ['alwx', 'alwy', 'alwz']
    arw = ['arwx', 'arwy', 'arwz']

    col_dict = {
        'df_lh_rh': (lh, rh),
        'df_hd_lh': (hd, lh),
        'df_hd_rh': (hd, rh),
        'df_hd_sp': (hd, sp),
        'df_lh_lw': (lh, lw),
        'df_rh_rw': (rh, rw),
        'df_vlh_vrh': (vlh, vrh),
        'df_vlw_vrw': (vlw, vrw),
        'df_alh_arh': (alh, arh),
        'df_alw_arw': (alw, arw),
    }

    augmented = data
    for feature_name, (first_cols, second_cols) in col_dict.items():
        similarity = feature_creation(data, first_cols, second_cols)
        named = similarity.rename(columns={0: feature_name})
        augmented = pd.concat([named, augmented], axis=1)

    return augmented
        

def feature_scaling(X_train, x):
    '''
    Standardise `x` with a StandardScaler fitted on `X_train` only.

    A fresh scaler is created on every call; the transformed version of
    `x` is returned.
    '''
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(x)
    
def refit_al(test_data, data_label_encoded, model, thres=0.9, minority_classes=(1, 3, 4)):
    '''
    Refit `model` on the training data plus confidently pseudo-labelled
    test rows of the under-represented classes.

    Test rows whose highest predicted class probability is strictly above
    `thres` are treated as correctly labelled. Of those, only rows
    predicted as one of `minority_classes` are appended to the training
    data, and the model is then fitted again on the enlarged set.

    Parameters
    ----------
    test_data : DataFrame of unlabelled feature rows.
    data_label_encoded : training DataFrame including the integer-encoded
        "Action" column.
    model : fitted classifier exposing `predict`, `predict_proba`, `fit`.
    thres : probability a prediction must exceed to be trusted.
    minority_classes : ENCODED "Action" values whose confident rows are
        added. The default (1, 3, 4) is presumably Response-2, Response-4
        and Response-5 after label encoding - verify against the encoder.

    Returns
    -------
    The same `model` object, refitted in place.
    '''
    preds = np.asarray(model.predict(test_data))
    max_prob = np.amax(model.predict_proba(test_data), axis=1)
    confident_rows = np.flatnonzero(max_prob > thres)

    # Keep the per-column dtypes of the selected rows (no round trip through
    # a plain ndarray) and attach the predicted label as the "Action" column.
    df_confident = (
        test_data.iloc[confident_rows]
        .reset_index(drop=True)
        .assign(Action=preds[confident_rows])
    )

    # One slice per requested class, appended in the order given.
    minority_frames = [
        df_confident[df_confident['Action'] == label]
        for label in minority_classes
    ]
    train_data_new = pd.concat([data_label_encoded, *minority_frames], axis=0)

    y = train_data_new['Action'].values
    X = train_data_new.drop(['Action'], axis=1)

    model.fit(X, y)
    return model
    
def parameter_tuning(X_train, y_train):
    '''
    Randomised hyper-parameter search for a RandomForestClassifier.

    Runs 100 sampled parameter combinations with 3-fold cross-validation
    (fixed random_state=42, all cores) and returns the fitted
    RandomizedSearchCV object; callers read `best_estimator_` from it.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.
    '''
    # NOTE(review): the search - and therefore `best_estimator_` - is fitted
    # on standardised features, so inputs passed to it later need the same
    # scaling. Confirm callers do that.
    x_train = feature_scaling(X_train, X_train)

    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
    # 'auto' was an alias of 'sqrt' for classifiers and is rejected by
    # recent scikit-learn releases, so two distinct valid options are used.
    max_features = ['sqrt', 'log2']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)  # None lets the trees grow without a depth limit
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]

    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap,
    }
    rf = RandomForestClassifier()
    rf_random = RandomizedSearchCV(
        estimator=rf,
        param_distributions=random_grid,
        n_iter=100,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    ).fit(x_train, y_train)
    return rf_random

def model_fitting(X, y, smote=False):
    '''
    Fit and return a random-forest classifier.

    Parameters
    ----------
    X : training features.
    y : training labels; a column vector is flattened before fitting the
        fixed-parameter forest.
    smote : when truthy, the data is first passed through
        `smote_data_generation` (defined outside this section) and the
        model is chosen by `parameter_tuning`; otherwise a forest with
        fixed hyper-parameters is fitted directly on `X`, `y`.

    Returns
    -------
    The fitted estimator.
    '''
    if smote:
        X_train, y_train = smote_data_generation(X, y)
        rf_random = parameter_tuning(X_train, y_train)
        return rf_random.best_estimator_

    model = RandomForestClassifier(
        n_estimators=500,
        min_samples_split=2,
        max_depth=40,
        min_samples_leaf=1,
        bootstrap=False,
    )
    # Fit exactly once, on the data that exists in this branch; X_train and
    # y_train are only defined on the SMOTE path.
    model.fit(X, np.ravel(y))
    return model


# NOTE: an unfinished `def traini` header stood here; it had no signature or
# body and made the file unparsable, so the pipeline is kept as plain
# top-level statements.
data, data_orig = read_data()
data_label_encoded = label_encode(to_encode=data, train=data_orig)
# NOTE(review): the similarity features are computed but the model below is
# trained on `data_label_encoded`, not on this frame - confirm the intent.
data_feature_extracted = feature_generator(data_label_encoded)

# 1-D label vector, which is the shape scikit-learn estimators expect.
y = data_label_encoded['Action'].values
X = data_label_encoded.drop(['Action'], axis=1)

model = model_fitting(X, y, smote=False)

# When SMOTE is not used, good test results can still be obtained with
# Active Learning. It is applied only in the non-SMOTE case, because only
# rows of the less frequent classes are added to the training set.

test_data, test_orig = read_data('dataset/test.csv')
# Four rounds of refitting on confidently predicted test rows.
for _ in range(4):
    model = refit_al(test_data, data_label_encoded, model, thres=0.96)

predictions = model.predict(test_data)
predicted_actions = label_encode(to_encode=predictions, train=data_orig, inv=True)
df_pred = pd.DataFrame(data=predicted_actions).rename(columns={0: 'Action'})

pd.concat([test_orig[['Recording_time_ID']], df_pred], axis=1).to_csv('submission.csv', index=False)