In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import bernoulli
import pandas as pd
import torch

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from utils import *
from src.model import BoundaryDetectorSimple, BoundaryDetectorAttention

### Define directories

In [3]:
train_dir = '/home/jolteon/eluvio_challenge/data/train/'
val_dir = '/home/jolteon/eluvio_challenge/data/val/'
test_dir = '/home/jolteon/eluvio_challenge/data/test/'

# Random Forest Boundary detection

In [4]:
data_train = make_all_dot_product_features_df(train_dir)
data_val =  make_all_dot_product_features_df(test_dir)
#Combine, since we are going to do crossval
data_train = pd.concat([data_train,data_val])
#seperate X,y
X_train = data_train[['place_dp','cast_dp','action_dp','audio_dp']]
y_train = data_train['boundary_truth']

In [5]:
data_train

Unnamed: 0,place_dp,cast_dp,action_dp,audio_dp,boundary_truth
0,91.192154,0.000000,0.000000,0.584730,0
1,489.716553,0.000000,0.000000,0.231411,0
2,445.407837,0.000000,0.000000,0.364196,0
3,292.007690,0.000000,0.000000,0.850917,0
4,350.747498,0.000000,0.000000,0.737919,0
...,...,...,...,...,...
1868,465.193115,1.518258,4138.864258,0.256540,1
1869,346.203339,0.560668,2765.230713,0.594189,1
1870,265.108948,0.575942,2732.739258,0.731078,1
1871,286.565979,0.641870,3133.891602,0.502469,0


### Easiest Baseline: Logistic Regression Grid on dot product features

In [6]:
%%time
param_grid = {'C' : [.01,.1,.1,5,10,15,20]
             }
clf = GridSearchCV(
        LogisticRegression(max_iter=1000),param_grid,scoring='average_precision')
clf.fit(X_train, y_train)                 


CPU times: user 6.41 s, sys: 939 µs, total: 6.41 s
Wall time: 6.41 s


GridSearchCV(estimator=LogisticRegression(max_iter=1000),
             param_grid={'C': [0.01, 0.1, 0.1, 5, 10, 15, 20]},
             scoring='average_precision')

Fitting an Logistic model is extremely fast

In [7]:
clf.cv_results_

{'mean_fit_time': array([0.16861439, 0.17224422, 0.18458252, 0.18493276, 0.17202072,
        0.16369491, 0.159337  ]),
 'std_fit_time': array([0.04111839, 0.04607965, 0.06543112, 0.05023923, 0.0319545 ,
        0.03295975, 0.03188476]),
 'mean_score_time': array([0.0053525 , 0.00531969, 0.00588722, 0.0071476 , 0.00530114,
        0.00532899, 0.00520134]),
 'std_score_time': array([1.12844398e-04, 2.08141064e-04, 1.28733064e-03, 2.76115696e-03,
        9.81238419e-05, 1.20976942e-04, 2.68637545e-05]),
 'param_C': masked_array(data=[0.01, 0.1, 0.1, 5, 10, 15, 20],
              mask=[False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.01},
  {'C': 0.1},
  {'C': 0.1},
  {'C': 5},
  {'C': 10},
  {'C': 15},
  {'C': 20}],
 'split0_test_score': array([0.14066426, 0.14109236, 0.14109236, 0.14113767, 0.14113692,
        0.14113731, 0.14113739]),
 'split1_test_score': array([0.14654727, 0.14638424, 0.14638424, 0.1464069 , 0.14

In [8]:
best_model = clf.best_estimator_
print(best_model)

LogisticRegression(C=10, max_iter=1000)


In [10]:
output_dir = '/home/jolteon/eluvio_challenge/logistic_dot_product/'
generate_predictions_dir_LR(best_model,test_dir,output_dir)

# Random Forest / Logistic Regression on difference of embeddings features

In [11]:
X_train,y_train= make_all_embedding_difference_features_df(train_dir)
X_val,y_val =  make_all_embedding_difference_features_df(test_dir)
#Combine, since we are going to do crossval
#data_train = pd.concat([data_train,data_val])
#seperate X,y
X = np.vstack([X_train,X_val])
y = np.concatenate([y_train,y_val])


In [12]:
print(X.shape)
print(y.shape)

(93539, 3584)
(93539,)


In [13]:
%%time
param_grid = {'n_estimators': [100],
              'min_samples_leaf': [100],
              'max_depth': [10,20,30]
              }

clf = GridSearchCV(
        RandomForestClassifier(n_jobs=12),param_grid,scoring='average_precision')
clf.fit(X, y)                 


CPU times: user 27min 49s, sys: 23 s, total: 28min 12s
Wall time: 31min 32s


GridSearchCV(estimator=RandomForestClassifier(n_jobs=12),
             param_grid={'max_depth': [10, 20, 30], 'min_samples_leaf': [100],
                         'n_estimators': [100]},
             scoring='average_precision')

In [14]:
clf.cv_results_

{'mean_fit_time': array([ 78.32106948, 123.61873899, 147.31787372]),
 'std_fit_time': array([5.38901323, 6.8204253 , 3.11310523]),
 'mean_score_time': array([0.24950681, 0.32321181, 0.31185203]),
 'std_score_time': array([0.04327643, 0.08382156, 0.05364636]),
 'param_max_depth': masked_array(data=[10, 20, 30],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[100, 100, 100],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[100, 100, 100],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 10, 'min_samples_leaf': 100, 'n_estimators': 100},
  {'max_depth': 20, 'min_samples_leaf': 100, 'n_estimators': 100},
  {'max_depth': 30, 'min_samples_leaf': 100, 'n_estimators': 100}],
 'split0_test_score': array([0.24337825, 0.24777856, 0.2536017 ]),
 'split

In [17]:
best_model = clf.best_estimator_

In [18]:
output_dir = '/home/jolteon/eluvio_challenge/RF_difference_of_features/'
generate_predictions_dir_RF(best_model,test_dir,output_dir)

# Check how we did

In [21]:
!python3 make_baseline.py

Mean Transition Percent is: 0.07655792141298438


### Bernoulli random guess for each prediction based on the mean number of boundaries

In [18]:
!python3 evaluate_sceneseg.py baseline_random/

# of IMDB IDs: 8
Scores: {
    "AP": 0.0903779250550157,
    "mAP": 0.09798375588604646,
    "Miou": 0.30419257642839564,
    "Precision": 0.0974340223226487,
    "Recall": 0.07676671243899885,
    "F1": 0.08329692954096599
}


### Predictions provided by Eluvio

In [17]:
!python3 evaluate_sceneseg.py baseline_preliminary/

# of IMDB IDs: 8
Scores: {
    "AP": 0.4799354967433886,
    "mAP": 0.49328420987394367,
    "Miou": 0.4797450602748557,
    "Precision": 0.3380879775551314,
    "Recall": 0.6925031762816138,
    "F1": 0.44427621620186347
}


### Logistic Regression, Dot product of features

In [19]:
!python3 evaluate_sceneseg.py logistic_dot_product/

# of IMDB IDs: 8
  precision = tps / tp_fp.sum()
  fscore_dict[imdb_id] = 2 * p * r / (p + r)
Scores: {
    "AP": 0.18439897955928014,
    "mAP": 0.22990494989628843,
    "Miou": 0.03214543822808406,
    "Precision": 0.0,
    "Recall": 0.0,
    "F1": NaN
}


### Random Forest, difference of features

In [20]:
!python3 evaluate_sceneseg.py RF_difference_of_features/

# of IMDB IDs: 8
  precision = tps / tp_fp.sum()
  fscore_dict[imdb_id] = 2 * p * r / (p + r)
Scores: {
    "AP": 0.5492763052324752,
    "mAP": 0.5733759006116512,
    "Miou": 0.03214543822808406,
    "Precision": 0.0,
    "Recall": 0.0,
    "F1": NaN
}
